; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
declare i64 @_Z13get_global_idj(i32) #0
define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX8-LABEL: clmem_read_simplified:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x800
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1000
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1800
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4]
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: s_movk_i32 s0, 0x2000
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x2800
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT: s_movk_i32 s0, 0x3000
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3800, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: clmem_read_simplified:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v18
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s1, 0x2000
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[14:15], v[6:7], off
; GFX9-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: clmem_read_simplified:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off
; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x3000
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off
; GFX10-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx2 v[18:19], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: clmem_read_simplified:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[10:11], v[6:7], off offset:-4096
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048
; GFX11-NEXT: global_load_b64 v[14:15], v[0:1], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 7
%idx.ext11 = and i64 %a0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
%load1 = load i64, ptr addrspace(1) %addr1, align 8
%addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256
%load2 = load i64, ptr addrspace(1) %addr2, align 8
%add.1 = add i64 %load2, %load1
%add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512
%load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
%add.2 = add i64 %load3, %add.1
%add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768
%load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
%add.3 = add i64 %load4, %add.2
%add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024
%load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
%add.4 = add i64 %load5, %add.3
%add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280
%load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
%add.5 = add i64 %load6, %add.4
%add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536
%load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
%add.6 = add i64 %load7, %add.5
%add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792
%load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
%add.7 = add i64 %load8, %add.6
store i64 %add.7, ptr addrspace(1) %add.ptr12, align 8
ret void
}
define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-LABEL: clmem_read:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0
; GFX8-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x5000
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_movk_i32 s0, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v4
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v4
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v4
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v4
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v4
; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12]
; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v4
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v4
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v4
; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20]
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22]
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v4
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24]
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v4
; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26]
; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: s_addk_i32 s1, 0x2000
; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff
; GFX8-NEXT: s_waitcnt vmcnt(10)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(9)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(8)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v16, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v18, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v19, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v20, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v21, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v22, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v23, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v24, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v25, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v26, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v27, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v28, v3, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT: s_add_i32 s1, s0, -1
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cbranch_scc1 .LBB1_5
; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT: s_mov_b32 s0, s1
; GFX8-NEXT: s_branch .LBB1_1
; GFX8-NEXT: .LBB1_5: ; %while.end
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: clmem_read:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX900-NEXT: s_mov_b32 s38, -1
; GFX900-NEXT: s_mov_b32 s39, 0xe00000
; GFX900-NEXT: s_add_u32 s36, s36, s9
; GFX900-NEXT: s_addc_u32 s37, s37, 0
; GFX900-NEXT: s_getpc_b64 s[0:1]
; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX900-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX900-NEXT: v_mov_b32_e32 v31, v0
; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0
; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 3, v6
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: s_movk_i32 s0, 0x5000
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: s_movk_i32 s2, 0x7f
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_movk_i32 s0, 0xd000
; GFX900-NEXT: s_movk_i32 s1, 0xe000
; GFX900-NEXT: s_movk_i32 s3, 0xf000
; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT: ; =>This Loop Header: Depth=1
; GFX900-NEXT: ; Child Loop BB1_2 Depth 2
; GFX900-NEXT: v_mov_b32_e32 v5, v1
; GFX900-NEXT: v_mov_b32_e32 v4, v0
; GFX900-NEXT: s_mov_b32 s4, 0
; GFX900-NEXT: .LBB1_2: ; %for.body
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v4
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[9:10], v[4:5], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[11:12], v[4:5], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v4
; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off
; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v4
; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v4
; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off
; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off
; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v4
; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[29:30], v[4:5], off
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX900-NEXT: s_addk_i32 s4, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(8)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v17, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v18, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v14, v3, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v15, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v23, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v24, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v25, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v26, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v27, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v28, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v19, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v20, v3, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v10, v3, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v29, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v30, v3, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT: s_add_i32 s4, s2, -1
; GFX900-NEXT: s_cmp_eq_u32 s2, 0
; GFX900-NEXT: s_cbranch_scc1 .LBB1_5
; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT: s_mov_b32 s2, s4
; GFX900-NEXT: s_branch .LBB1_1
; GFX900-NEXT: .LBB1_5: ; %while.end
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX900-NEXT: s_endpgm
;
; GFX10-LABEL: clmem_read:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_movk_i32 s1, 0x7f
; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB1_2 Depth 2
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: .LBB1_2: ; %for.body
; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffb800
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, 0xffffc800
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v4, 0xffffd800
; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v4, 0xffffe800
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[11:12], v[7:8], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[15:16], v[9:10], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[19:20], v[13:14], off offset:-2048
; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_dwordx2 v[23:24], v[17:18], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[7:8], v[7:8], off
; GFX10-NEXT: global_load_dwordx2 v[9:10], v[9:10], off
; GFX10-NEXT: global_load_dwordx2 v[13:14], v[13:14], off
; GFX10-NEXT: global_load_dwordx2 v[25:26], v[17:18], off
; GFX10-NEXT: global_load_dwordx2 v[27:28], v[21:22], off
; GFX10-NEXT: global_load_dwordx2 v[29:30], v[4:5], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[31:32], v[4:5], off
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT: s_addk_i32 s2, 0x2000
; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff
; GFX10-NEXT: s_waitcnt vmcnt(10)
; GFX10-NEXT: v_add_co_u32 v2, s0, v11, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_add_co_u32 v2, s0, v7, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v15, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_add_co_u32 v2, s0, v9, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v19, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_add_co_u32 v2, s0, v13, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v23, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_add_co_u32 v2, s0, v25, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v2, s0, v27, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v28, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v2, s0, v29, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v30, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v31, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo
; GFX10-NEXT: s_cbranch_scc0 .LBB1_2
; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT: s_add_i32 s0, s1, -1
; GFX10-NEXT: s_cmp_eq_u32 s1, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB1_5
; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_branch .LBB1_1
; GFX10-NEXT: .LBB1_5: ; %while.end
; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v6
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX90A-LABEL: clmem_read:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX90A-NEXT: s_mov_b32 s38, -1
; GFX90A-NEXT: s_mov_b32 s39, 0xe00000
; GFX90A-NEXT: s_add_u32 s36, s36, s9
; GFX90A-NEXT: s_addc_u32 s37, s37, 0
; GFX90A-NEXT: s_getpc_b64 s[0:1]
; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0
; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0
; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s35
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
; GFX90A-NEXT: s_movk_i32 s0, 0x5000
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: s_movk_i32 s2, 0x7f
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT: s_movk_i32 s0, 0xd000
; GFX90A-NEXT: s_movk_i32 s1, 0xe000
; GFX90A-NEXT: s_movk_i32 s3, 0xf000
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_mov_b32 s4, 0
; GFX90A-NEXT: .LBB1_2: ; %for.body
; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v6
; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[14:15], off
; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096
; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[20:21], off
; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s3, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[22:23], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT: s_addk_i32 s4, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff
; GFX90A-NEXT: s_waitcnt vmcnt(8)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT: s_add_i32 s4, s2, -1
; GFX90A-NEXT: s_cmp_eq_u32 s2, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB1_5
; GFX90A-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX90A-NEXT: s_mov_b32 s2, s4
; GFX90A-NEXT: s_branch .LBB1_1
; GFX90A-NEXT: .LBB1_5: ; %while.end
; GFX90A-NEXT: v_mov_b32_e32 v1, s35
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
; GFX90A-NEXT: s_endpgm
;
; GFX11-LABEL: clmem_read:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0
; GFX11-NEXT: s_movk_i32 s1, 0x7f
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB1_2 Depth 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: .LBB1_2: ; %for.body
; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffc000
; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096
; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048
; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v4, 0xffffe000
; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048
; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096
; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off
; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048
; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off
; GFX11-NEXT: global_load_b64 v[21:22], v[21:22], off offset:-2048
; GFX11-NEXT: global_load_b64 v[23:24], v[4:5], off offset:-4096
; GFX11-NEXT: global_load_b64 v[25:26], v[4:5], off offset:-2048
; GFX11-NEXT: global_load_b64 v[27:28], v[4:5], off
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: s_addk_i32 s2, 0x2000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff
; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v11, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT: s_add_i32 s0, s1, -1
; GFX11-NEXT: s_cmp_eq_u32 s1, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB1_5
; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_branch .LBB1_1
; GFX11-NEXT: .LBB1_5: ; %while.end
; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 17
%idx.ext11 = and i64 %a0, 4261412864
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%add.ptr6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
br label %for.cond.preheader
while.cond.loopexit: ; preds = %for.body
%dec = add nsw i32 %dec31, -1
%tobool = icmp eq i32 %dec31, 0
br i1 %tobool, label %while.end, label %for.cond.preheader
for.cond.preheader: ; preds = %entry, %while.cond.loopexit
%dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
%sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
br label %for.body
for.body: ; preds = %for.body, %for.cond.preheader
%block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
%sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
%conv3 = zext i32 %block.029 to i64
%add.ptr8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3
%load1 = load i64, ptr addrspace(1) %add.ptr8, align 8
%add = add i64 %load1, %sum.128
%add9 = or disjoint i32 %block.029, 256
%conv3.1 = zext i32 %add9 to i64
%add.ptr8.1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.1
%load2 = load i64, ptr addrspace(1) %add.ptr8.1, align 8
%add.1 = add i64 %load2, %add
%add9.1 = or disjoint i32 %block.029, 512
%conv3.2 = zext i32 %add9.1 to i64
%add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.2
%l3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
%add.2 = add i64 %l3, %add.1
%add9.2 = or disjoint i32 %block.029, 768
%conv3.3 = zext i32 %add9.2 to i64
%add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.3
%l4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
%add.3 = add i64 %l4, %add.2
%add9.3 = or disjoint i32 %block.029, 1024
%conv3.4 = zext i32 %add9.3 to i64
%add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.4
%l5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
%add.4 = add i64 %l5, %add.3
%add9.4 = or disjoint i32 %block.029, 1280
%conv3.5 = zext i32 %add9.4 to i64
%add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.5
%l6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
%add.5 = add i64 %l6, %add.4
%add9.5 = or disjoint i32 %block.029, 1536
%conv3.6 = zext i32 %add9.5 to i64
%add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.6
%load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
%add.6 = add i64 %load7, %add.5
%add9.6 = or disjoint i32 %block.029, 1792
%conv3.7 = zext i32 %add9.6 to i64
%add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.7
%load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
%add.7 = add i64 %load8, %add.6
%add9.7 = or disjoint i32 %block.029, 2048
%conv3.8 = zext i32 %add9.7 to i64
%add.ptr8.8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.8
%load9 = load i64, ptr addrspace(1) %add.ptr8.8, align 8
%add.8 = add i64 %load9, %add.7
%add9.8 = or disjoint i32 %block.029, 2304
%conv3.9 = zext i32 %add9.8 to i64
%add.ptr8.9 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.9
%load10 = load i64, ptr addrspace(1) %add.ptr8.9, align 8
%add.9 = add i64 %load10, %add.8
%add9.9 = or disjoint i32 %block.029, 2560
%conv3.10 = zext i32 %add9.9 to i64
%add.ptr8.10 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.10
%load11 = load i64, ptr addrspace(1) %add.ptr8.10, align 8
%add.10 = add i64 %load11, %add.9
%add9.31 = add nuw nsw i32 %block.029, 8192
%cmp.31 = icmp ult i32 %add9.31, 4194304
br i1 %cmp.31, label %for.body, label %while.cond.loopexit
while.end: ; preds = %while.cond.loopexit
store i64 %add.10, ptr addrspace(1) %add.ptr12, align 8
ret void
}
; using 32bit address.
define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX8-LABEL: Address32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 2
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x400
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x800
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xc00
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1000
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1400
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1800
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1c00
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x2000
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: flat_load_dword v19, v[5:6]
; GFX8-NEXT: flat_load_dword v7, v[7:8]
; GFX8-NEXT: flat_load_dword v8, v[9:10]
; GFX8-NEXT: flat_load_dword v9, v[11:12]
; GFX8-NEXT: flat_load_dword v10, v[13:14]
; GFX8-NEXT: flat_load_dword v11, v[15:16]
; GFX8-NEXT: flat_load_dword v12, v[17:18]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x2400, v3
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(8)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v0
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v11, v0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: flat_store_dword v[1:2], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: Address32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v4
; GFX9-NEXT: v_mov_b32_e32 v3, 2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:1024
; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:2048
; GFX9-NEXT: global_load_dword v8, v[0:1], off offset:3072
; GFX9-NEXT: global_load_dword v9, v[2:3], off
; GFX9-NEXT: global_load_dword v10, v[2:3], off offset:1024
; GFX9-NEXT: global_load_dword v11, v[2:3], off offset:2048
; GFX9-NEXT: global_load_dword v12, v[2:3], off offset:3072
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: v_add_u32_e32 v0, v6, v5
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add3_u32 v0, v7, v0, v8
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add3_u32 v0, v11, v0, v12
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add3_u32 v0, v2, v0, v3
; GFX9-NEXT: global_store_dword v4, v0, s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: Address32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1000, v0
; GFX10-NEXT: s_clause 0x4
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: global_load_dword v10, v[0:1], off offset:1024
; GFX10-NEXT: global_load_dword v11, v[2:3], off offset:1024
; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:-2048
; GFX10-NEXT: global_load_dword v13, v[4:5], off
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v14, v[6:7], off offset:1024
; GFX10-NEXT: global_load_dword v15, v[2:3], off offset:1024
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dword v2, v[4:5], off offset:-2048
; GFX10-NEXT: global_load_dword v3, v[4:5], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:1024
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: v_add_nc_u32_e32 v0, v10, v9
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_add3_u32 v0, v12, v0, v11
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add3_u32 v0, v2, v0, v15
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add3_u32 v0, v3, v0, v6
; GFX10-NEXT: global_store_dword v8, v0, s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: Address32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v7, v[0:1], off
; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_load_b32 v9, v[0:1], off offset:2048
; GFX11-NEXT: global_load_b32 v10, v[0:1], off offset:3072
; GFX11-NEXT: global_load_b32 v11, v[4:5], off offset:-4096
; GFX11-NEXT: global_load_b32 v12, v[2:3], off offset:1024
; GFX11-NEXT: global_load_b32 v13, v[2:3], off offset:2048
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:3072
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v3, v[4:5], off
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:1024
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_add_nc_u32_e32 v1, v8, v7
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v9, v1, v10
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add3_u32 v1, v11, v1, v12
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v13, v1, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%id = shl i64 %call, 7
%idx.ext11 = and i64 %id, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%add.ptr6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv
%load1 = load i32, ptr addrspace(1) %add.ptr6, align 4
%add.ptr8.1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 256
%load2 = load i32, ptr addrspace(1) %add.ptr8.1, align 4
%add.1 = add i32 %load2, %load1
%add.ptr8.2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 512
%load3 = load i32, ptr addrspace(1) %add.ptr8.2, align 4
%add.2 = add i32 %load3, %add.1
%add.ptr8.3 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 768
%load4 = load i32, ptr addrspace(1) %add.ptr8.3, align 4
%add.3 = add i32 %load4, %add.2
%add.ptr8.4 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1024
%load5 = load i32, ptr addrspace(1) %add.ptr8.4, align 4
%add.4 = add i32 %load5, %add.3
%add.ptr8.5 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1280
%load6 = load i32, ptr addrspace(1) %add.ptr8.5, align 4
%add.5 = add i32 %load6, %add.4
%add.ptr8.6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1536
%load7 = load i32, ptr addrspace(1) %add.ptr8.6, align 4
%add.6 = add i32 %load7, %add.5
%add.ptr8.7 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1792
%load8 = load i32, ptr addrspace(1) %add.ptr8.7, align 4
%add.7 = add i32 %load8, %add.6
%add.ptr8.8 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2048
%load9 = load i32, ptr addrspace(1) %add.ptr8.8, align 4
%add.8 = add i32 %load9, %add.7
%add.ptr8.9 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2304
%load10 = load i32, ptr addrspace(1) %add.ptr8.9, align 4
%add.9 = add i32 %load10, %add.8
store i32 %add.9, ptr addrspace(1) %add.ptr12, align 4
ret void
}
define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX8-LABEL: Offset64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xf000
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xf800
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4]
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: Offset64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
; GFX9-NEXT: s_movk_i32 s0, 0xf000
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: Offset64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v12
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0xfffff800
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: Offset64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[6:7], v[4:5], off offset:-4096
; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 7
%idx.ext11 = and i64 %a0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
%load1 = load i64, ptr addrspace(1) %addr1, align 8
%addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870400
%load2 = load i64, ptr addrspace(1) %addr2, align 8
%add1 = add i64 %load2, %load1
%addr3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870656
%load3 = load i64, ptr addrspace(1) %addr3, align 8
%add2 = add i64 %load3, %add1
%addr4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870912
%load4 = load i64, ptr addrspace(1) %addr4, align 8
%add4 = add i64 %load4, %add2
store i64 %add4, ptr addrspace(1) %add.ptr12, align 8
ret void
}
; TODO: Support load4 as anchor instruction.
define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX8-LABEL: p32Offset64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 2
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: flat_load_dword v6, v[7:8]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: flat_store_dword v[1:2], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: p32Offset64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v6
; GFX9-NEXT: v_mov_b32_e32 v3, 2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_mov_b32 s0, 0x7ffff000
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v7, v[0:1], off
; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:2048
; GFX9-NEXT: global_load_dword v9, v[2:3], off offset:3072
; GFX9-NEXT: global_load_dword v10, v[4:5], off
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_u32_e32 v0, v8, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10
; GFX9-NEXT: global_store_dword v6, v0, s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: p32Offset64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v4
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x80000000
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:-2048
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: global_load_dword v8, v[0:1], off offset:1024
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_nc_u32_e32 v0, v6, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add3_u32 v0, v8, v0, v7
; GFX10-NEXT: global_store_dword v4, v0, s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: p32Offset64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x80000000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:2048
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:3072
; GFX11-NEXT: global_load_b32 v3, v[4:5], off
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 7
%idx.ext11 = and i64 %a0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%addr1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv
%load1 = load i32, ptr addrspace(1) %addr1, align 8
%addr2 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870400
%load2 = load i32, ptr addrspace(1) %addr2, align 8
%add1 = add i32 %load2, %load1
%addr3 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870656
%load3 = load i32, ptr addrspace(1) %addr3, align 8
%add2 = add i32 %load3, %add1
%addr4 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870912
%load4 = load i32, ptr addrspace(1) %addr4, align 8
%add4 = add i32 %load4, %add2
store i32 %add4, ptr addrspace(1) %add.ptr12, align 8
ret void
}
define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX8-LABEL: DiffBase:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s42, -1
; GFX8-NEXT: s_mov_b32 s43, 0xe80000
; GFX8-NEXT: s_add_u32 s40, s40, s9
; GFX8-NEXT: s_addc_u32 s41, s41, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff8000, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s36, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s39
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x1800, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v8
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: DiffBase:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s42, -1
; GFX9-NEXT: s_mov_b32 s43, 0xe00000
; GFX9-NEXT: s_add_u32 s40, s40, s9
; GFX9-NEXT: s_addc_u32 s41, s41, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff8000, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s37
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v16
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s39
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s38, v16
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v0, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v10
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x3000, v10
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off
; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v10
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v11, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v15, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: DiffBase:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s42, -1
; GFX10-NEXT: s_mov_b32 s43, 0x31c16000
; GFX10-NEXT: s_add_u32 s40, s40, s9
; GFX10-NEXT: s_addc_u32 s41, s41, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41]
; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff8000, v0
; GFX10-NEXT: v_add_co_u32 v8, s0, s36, v16
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s0, s37, 0, s0
; GFX10-NEXT: v_add_co_u32 v12, s0, s38, v16
; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, s39, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, 0x1800
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v12, 0x3000
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v9, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: DiffBase:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[36:39], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v12, 0xffff8000, v0
; GFX11-NEXT: v_add_co_u32 v2, s0, s36, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s37, 0, s0
; GFX11-NEXT: v_add_co_u32 v8, s0, s38, v12
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, s39, 0, s0
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0x2000
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x2000, v8
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x3000, v8
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off offset:2048
; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %buffer2) {
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 7
%idx.ext11 = and i64 %a0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer1, i64 %idx.ext11
%add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %buffer2, i64 %idx.ext11
%addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 512
%load1 = load i64, ptr addrspace(1) %addr1, align 8
%add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 768
%load2 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
%add1 = add i64 %load2, %load1
%add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 1024
%load3 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
%add2 = add i64 %load3, %add1
%add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1280
%load4 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
%add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1536
%load5 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
%add3 = add i64 %load5, %load4
%add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1792
%load6 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
%add4 = add i64 %load6, %add3
%add5 = add i64 %add2, %add4
store i64 %add5, ptr addrspace(1) %add.ptr12, align 8
ret void
}
define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX8-LABEL: ReverseOrder:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x3800
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x3000
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x2800
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4]
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: s_movk_i32 s0, 0x2000
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x1800
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT: s_movk_i32 s0, 0x1000
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x800, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: ReverseOrder:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off
; GFX9-NEXT: s_movk_i32 s0, 0x2000
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off
; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off
; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: ReverseOrder:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x2800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, 0x1800, v0
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[16:17], v[14:15], off
; GFX10-NEXT: global_load_dwordx2 v[18:19], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: ReverseOrder:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x3000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x2000, v0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:2048
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048
; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off
; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
%a0 = shl i64 %call, 7
%idx.ext11 = and i64 %a0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
%load1 = load i64, ptr addrspace(1) %addr1, align 8
%add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792
%load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8
%add7 = add i64 %load8, %load1
%add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536
%load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8
%add6 = add i64 %load7, %add7
%add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280
%load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8
%add5 = add i64 %load6, %add6
%add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024
%load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8
%add4 = add i64 %load5, %add5
%add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768
%load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8
%add3 = add i64 %load4, %add4
%add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512
%load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8
%add2 = add i64 %load3, %add3
%addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256
%load2 = load i64, ptr addrspace(1) %addr2, align 8
%add1 = add i64 %load2, %add2
store i64 %add1, ptr addrspace(1) %add.ptr12, align 8
ret void
}
define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buffer) {
; GFX8-LABEL: negativeoffset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s38, -1
; GFX8-NEXT: s_mov_b32 s39, 0xe80000
; GFX8-NEXT: s_add_u32 s36, s36, s9
; GFX8-NEXT: s_addc_u32 s37, s37, 0
; GFX8-NEXT: s_getpc_b64 s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s35
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1
; GFX8-NEXT: v_mov_b32_e32 v3, 3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x800
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc
; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: negativeoffset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v8
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: negativeoffset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s38, -1
; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
; GFX10-NEXT: s_add_u32 s36, s36, s9
; GFX10-NEXT: s_addc_u32 s37, s37, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: negativeoffset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
%conv = and i64 %call, 255
%0 = shl i64 %call, 7
%idx.ext11 = and i64 %0, 4294934528
%add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11
%buffer_wave = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv
%addr1 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870656
%load1 = load i64, ptr addrspace(1) %addr1, align 8
%addr2 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870912
%load2 = load i64, ptr addrspace(1) %addr2, align 8
%add = add i64 %load2, %load1
store i64 %add, ptr addrspace(1) %add.ptr12, align 8
ret void
}
define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) {
; GFX8-LABEL: negativeoffsetnullptr:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s1, s[2:3], 0xec
; GFX8-NEXT: s_add_u32 s0, 0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_addc_u32 s1, s1, -1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX8-NEXT: .LBB8_1: ; %branch
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_and_b64 s[2:3], exec, vcc
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %end
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: negativeoffsetnullptr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, -1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX9-NEXT: .LBB8_1: ; %branch
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_and_b64 s[2:3], exec, vcc
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: negativeoffsetnullptr:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX10-NEXT: s_add_u32 s0, 0, -1
; GFX10-NEXT: s_addc_u32 s1, s1, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
; GFX10-NEXT: .LBB8_1: ; %branch
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s1, s0
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
; GFX10-NEXT: ; %bb.2: ; %end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: negativeoffsetnullptr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: v_add_co_u32 v0, s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
; GFX11-NEXT: .LBB8_1: ; %branch
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_endpgm
entry:
%null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr)
%gep = getelementptr i8, ptr %null, i64 -1
%ld = load i8, ptr %gep
%cmp = icmp eq i8 %ld, 0
br label %branch
branch:
br i1 %cmp, label %end, label %branch
end:
ret void
}
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }