; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
; Test using the saddr addressing mode of global_* flat atomic instructions.
; --------------------------------------------------------------------------------
; atomicrmw max
; --------------------------------------------------------------------------------
; Returning 32-bit signed max. The address is the inreg %sbase plus the
; zero-extended 32-bit %voffset, and the value previously in memory is
; returned, bitcast to float.
; On every run line the expected output is a compare-and-swap retry loop: an
; initial load in the saddr form (VGPR offset plus the s[2:3] base), then
; v_max_i32 and global_atomic_cmpswap on a 64-bit VGPR address ("off" form),
; repeated until the value read back equals the value that was expected.
define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset), computed as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; seq_cst signed max; %rtn is the value that was in memory before the update.
%rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
; amdgpu_ps returns the result as a float, so reinterpret the bits.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Returning 32-bit signed max with an additional constant byte offset of -128
; applied after the %sbase + zext(%voffset) address computation.
; The expected output folds the constant into the instruction immediates:
; both the initial saddr-form load and the global_atomic_cmpswap in the retry
; loop carry offset:-128, while the VGPR address add itself is unchanged.
define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The constant -128 is kept in its own GEP; the checks expect it as offset:-128.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; seq_cst signed max; %rtn is the value that was in memory before the update.
%rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Non-returning 32-bit signed max: same address computation as the returning
; variant (%sbase + zext(%voffset)), but the atomicrmw result is unused and
; the function returns void.
; The expected output is still a load + v_max_i32 + global_atomic_cmpswap
; retry loop; the loop-carried value is copied back with v_mov_b32 at the
; bottom of the loop and the program ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB2_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset), computed as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; seq_cst signed max whose result is deliberately left unused.
%unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
; Non-returning 32-bit signed max with an additional constant byte offset of
; -128 on top of %sbase + zext(%voffset); the atomicrmw result is unused.
; The expected output carries offset:-128 on both the initial saddr-form load
; and the global_atomic_cmpswap inside the retry loop, and ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB3_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The constant -128 is kept in its own GEP; the checks expect it as offset:-128.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; seq_cst signed max whose result is deliberately left unused.
%unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
; Returning 64-bit signed max at %sbase + zext(%voffset); the previous memory
; value is returned, bitcast to <2 x float>.
; The expected output has no single 64-bit max instruction: the new value is
; selected with v_cmp_gt_i64 plus two v_cndmask_b32, and the retry loop uses
; the 64-bit compare-and-swap (global_atomic_cmpswap_x2 / _b64). After the
; loop the result pair v[3:4] is copied into v0/v1 for the return.
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB4_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset), computed as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; seq_cst signed 64-bit max; %rtn is the value in memory before the update.
%rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
; Return the 64-bit result as two 32-bit float lanes.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Returning 64-bit signed max with an additional constant byte offset of -128
; on top of %sbase + zext(%voffset); the previous memory value is returned as
; <2 x float>.
; The expected output matches the zero-offset 64-bit case (v_cmp_gt_i64 + two
; v_cndmask_b32 + 64-bit compare-and-swap loop) with offset:-128 attached to
; both the initial saddr-form load and the compare-and-swap.
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB5_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The constant -128 is kept in its own GEP; the checks expect it as offset:-128.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; seq_cst signed 64-bit max; %rtn is the value in memory before the update.
%rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Non-returning 64-bit signed max at %sbase + zext(%voffset); the atomicrmw
; result is unused and the function returns void.
; The expected output selects the new value with v_cmp_gt_i64 plus two
; v_cndmask_b32, retries with the 64-bit compare-and-swap, copies the value
; read back into v[5:6] for the next iteration, and ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset), computed as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; seq_cst signed 64-bit max whose result is deliberately left unused.
%unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
ret void
}
; Non-returning 64-bit signed max with an additional constant byte offset of
; -128 on top of %sbase + zext(%voffset); the atomicrmw result is unused.
; The expected output matches the zero-offset non-returning 64-bit case, with
; offset:-128 attached to both the initial saddr-form load and the 64-bit
; compare-and-swap in the retry loop.
define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The constant -128 is kept in its own GEP; the checks expect it as offset:-128.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; seq_cst signed 64-bit max whose result is deliberately left unused.
%unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
ret void
}
; --------------------------------------------------------------------------------
; atomicrmw min
; --------------------------------------------------------------------------------
; Returning 32-bit signed min. The address is the inreg %sbase plus the
; zero-extended 32-bit %voffset, and the value previously in memory is
; returned, bitcast to float.
; The expected output has the same shape as the signed-max case with
; v_min_i32 in place of v_max_i32: an initial saddr-form load, then a
; global_atomic_cmpswap retry loop on a 64-bit VGPR address ("off" form).
define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset), computed as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; seq_cst signed min; %rtn is the value that was in memory before the update.
%rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
; amdgpu_ps returns the result as a float, so reinterpret the bits.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Signed 32-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer; the
; previous memory value is returned, bitcast to float. The address is the inreg
; base %sbase plus the zero-extended %voffset, then a constant -128 byte step.
; The checks expect a retry loop: an initial load, v_min_i32 to form the new
; value, and a global_atomic_cmpswap that is repeated until the returned value
; compares equal to the expected one. The initial load addresses memory as
; VGPR offset + s[2:3]; the cmpswap instead uses a 64-bit VGPR address (built by
; the add / add-with-carry pair) with `off`. Both carry offset:-128.
define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte step; it shows up as offset:-128 on the memory instructions above.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
; Reinterpret the old i32 value as float so it matches the function return type.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Signed 32-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer whose
; result is discarded (the function returns void). The address is the inreg base
; %sbase plus the zero-extended %voffset, with no constant offset.
; The checks expect a load + v_min_i32 + global_atomic_cmpswap retry loop. Since
; nothing is returned, the value produced by the cmpswap (v0) is copied into the
; expected-value register (v5) inside the loop, and the program ends with
; s_endpgm. The initial load addresses memory as VGPR offset + s[2:3]; the
; cmpswap uses a 64-bit VGPR address with `off`.
define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_min_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The old value is intentionally left unused: this is the no-return variant.
%unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
; Signed 32-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer whose
; result is discarded (the function returns void). The address is the inreg base
; %sbase plus the zero-extended %voffset, then a constant -128 byte step.
; The checks expect a load + v_min_i32 + global_atomic_cmpswap retry loop that
; ends in s_endpgm, with offset:-128 present on both the initial load and the
; cmpswap. The load addresses memory as VGPR offset + s[2:3]; the cmpswap uses a
; 64-bit VGPR address with `off`.
define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte step; it shows up as offset:-128 on the memory instructions above.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; The old value is intentionally left unused: this is the no-return variant.
%unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
; Signed 64-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer; the
; previous memory value is returned, bitcast to <2 x float>. The address is the
; inreg base %sbase plus the zero-extended %voffset, with no constant offset.
; The checks expect a retry loop built from a 64-bit load, a v_cmp_le_i64 plus
; two v_cndmask_b32 selects (one per 32-bit half) to form the minimum, and a
; 64-bit global_atomic_cmpswap. After the loop the result pair v[3:4] is copied
; to v0/v1. The initial load addresses memory as VGPR offset + s[2:3]; the
; cmpswap uses a 64-bit VGPR address with `off`.
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB12_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
; Reinterpret the old i64 value as two floats so it matches the return type.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Signed 64-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer; the
; previous memory value is returned, bitcast to <2 x float>. The address is the
; inreg base %sbase plus the zero-extended %voffset, then a constant -128 byte
; step.
; The checks expect a retry loop built from a 64-bit load, a v_cmp_le_i64 plus
; two v_cndmask_b32 selects to form the minimum, and a 64-bit
; global_atomic_cmpswap, with offset:-128 present on both the initial load and
; the cmpswap. After the loop the result pair v[3:4] is copied to v0/v1. The
; load addresses memory as VGPR offset + s[2:3]; the cmpswap uses a 64-bit VGPR
; address with `off`.
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte step; it shows up as offset:-128 on the memory instructions above.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
; Reinterpret the old i64 value as two floats so it matches the return type.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Signed 64-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer whose
; result is discarded (the function returns void). The address is the inreg base
; %sbase plus the zero-extended %voffset, with no constant offset.
; The checks expect a retry loop built from a 64-bit load, a v_cmp_le_i64 plus
; two v_cndmask_b32 selects to form the minimum, and a 64-bit
; global_atomic_cmpswap. The pair returned by the cmpswap (v[3:4]) is copied
; into the expected-value pair (v[5:6]) inside the loop, and the program ends
; with s_endpgm. The load addresses memory as VGPR offset + s[2:3]; the cmpswap
; uses a 64-bit VGPR address with `off`.
define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_min_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; The old value is intentionally left unused: this is the no-return variant.
%unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
ret void
}
; Signed 64-bit `atomicrmw min` (seq_cst) on a global (addrspace(1)) pointer whose
; result is discarded (the function returns void). The address is the inreg base
; %sbase plus the zero-extended %voffset, then a constant -128 byte step.
; The checks expect a retry loop built from a 64-bit load, a v_cmp_le_i64 plus
; two v_cndmask_b32 selects to form the minimum, and a 64-bit
; global_atomic_cmpswap, with offset:-128 present on both the initial load and
; the cmpswap; the program ends with s_endpgm. The load addresses memory as
; VGPR offset + s[2:3]; the cmpswap uses a 64-bit VGPR address with `off`.
define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB15_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte step; it shows up as offset:-128 on the memory instructions above.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; The old value is intentionally left unused: this is the no-return variant.
%unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
ret void
}
; --------------------------------------------------------------------------------
; atomicrmw umax
; --------------------------------------------------------------------------------
; Unsigned 32-bit `atomicrmw umax` (seq_cst) on a global (addrspace(1)) pointer;
; the previous memory value is returned, bitcast to float. The address is the
; inreg base %sbase plus the zero-extended %voffset, with no constant offset.
; The checks expect a retry loop: an initial load, v_max_u32 to form the new
; value, and a global_atomic_cmpswap that is repeated until the returned value
; compares equal to the expected one. The initial load addresses memory as
; VGPR offset + s[2:3]; the cmpswap uses a 64-bit VGPR address (built by the
; add / add-with-carry pair) with `off`.
define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB16_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
; Reinterpret the old i32 value as float so it matches the function return type.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Unsigned 32-bit `atomicrmw umax` (seq_cst) on a global (addrspace(1)) pointer;
; the previous memory value is returned, bitcast to float. The address is the
; inreg base %sbase plus the zero-extended %voffset, then a constant -128 byte
; step.
; The checks expect a load + v_max_u32 + global_atomic_cmpswap retry loop with
; offset:-128 present on both the initial load and the cmpswap. The load
; addresses memory as VGPR offset + s[2:3]; the cmpswap uses a 64-bit VGPR
; address (built by the add / add-with-carry pair) with `off`.
define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB17_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is zero-extended to 64 bits before being added to the base.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte step; it shows up as offset:-128 on the memory instructions above.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
; Reinterpret the old i32 value as float so it matches the function return type.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Unsigned-max atomicrmw on an i32 at %sbase + zext(%voffset); the atomic's
; result is unused and the shader returns void.
; The expected output for gfx900, gfx1010 and gfx1100 is a compare-and-swap
; retry loop: an initial load using the saddr form (v0, s[2:3]), v_max_u32 to
; form the new value, and a global_atomic_cmpswap on a full 64-bit VGPR address
; (saddr "off") that repeats until the returned value equals the expected one.
define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address is the uniform base plus the zero-extended 32-bit per-lane offset.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
; Same as the no-return i32 umax case, but the address carries an extra
; constant byte offset of -128 (%sbase + zext(%voffset) - 128).
; In the expected output the constant shows up as "offset:-128" on both the
; initial load and the global_atomic_cmpswap inside the retry loop; the VGPR
; address add itself still only combines s[2:3] with v0.
define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB19_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB19_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Two chained GEPs: first the zero-extended per-lane offset, then the constant -128.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
; Unsigned-max atomicrmw on an i64 at %sbase + zext(%voffset); the value
; returned by the atomic is bitcast to <2 x float> and returned from the shader.
; The expected output for gfx900, gfx1010 and gfx1100 is a compare-and-swap
; retry loop. The 64-bit max is formed with v_cmp_gt_u64 plus two
; v_cndmask_b32 selects, and the 64-bit cmpswap takes a four-VGPR data operand.
; After the loop exec is restored with s_or_b64 and the result pair is copied
; into v0/v1.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB20_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB20_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Address is the uniform base plus the zero-extended 32-bit per-lane offset.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
; The bitcast only reinterprets the i64 result so it can be returned in VGPRs.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Returning i64 umax case with an extra constant byte offset of -128
; (%sbase + zext(%voffset) - 128); the atomic's result is returned as
; <2 x float>.
; In the expected output the constant appears as "offset:-128" on both the
; initial 64-bit load and the 64-bit cmpswap in the retry loop; the rest of the
; loop (v_cmp_gt_u64 + v_cndmask_b32 pair, exec restore, copy to v0/v1) has the
; same shape as the zero-offset variant.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB21_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Two chained GEPs: first the zero-extended per-lane offset, then the constant -128.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
; The bitcast only reinterprets the i64 result so it can be returned in VGPRs.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; Unsigned-max atomicrmw on an i64 at %sbase + zext(%voffset); the atomic's
; result is unused and the shader returns void.
; The expected output for gfx900, gfx1010 and gfx1100 is a compare-and-swap
; retry loop ending in s_endpgm. The new value is selected with v_cmp_gt_u64
; plus two v_cndmask_b32, and the value returned by the cmpswap is copied back
; into v[5:6] as the expected value for the next iteration.
define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB22_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB22_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address is the uniform base plus the zero-extended 32-bit per-lane offset.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
ret void
}
; No-return i64 umax case with an extra constant byte offset of -128
; (%sbase + zext(%voffset) - 128).
; In the expected output the constant appears as "offset:-128" on both the
; initial 64-bit load and the 64-bit cmpswap in the retry loop; otherwise the
; loop has the same shape as the zero-offset no-return variant and ends in
; s_endpgm.
define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB23_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB23_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Two chained GEPs: first the zero-extended per-lane offset, then the constant -128.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
ret void
}
; --------------------------------------------------------------------------------
; atomicrmw umin
; --------------------------------------------------------------------------------
; Unsigned-min atomicrmw on an i32 at %sbase + zext(%voffset); the value
; returned by the atomic is bitcast to float and returned from the shader.
; The expected output for gfx900, gfx1010 and gfx1100 is a compare-and-swap
; retry loop: an initial load using the saddr form (v0, s[2:3]), v_min_u32 to
; form the new value, and a global_atomic_cmpswap on a full 64-bit VGPR address
; (saddr "off"). The result stays in v0 and exec is restored with s_or_b64
; after the loop.
define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB24_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; Address is the uniform base plus the zero-extended 32-bit per-lane offset.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
; The bitcast only reinterprets the i32 result so it can be returned in a VGPR.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Returning i32 umin case with an extra constant byte offset of -128
; (%sbase + zext(%voffset) - 128); the atomic's result is returned as float.
; In the expected output the constant appears as "offset:-128" on both the
; initial load and the global_atomic_cmpswap in the retry loop; the rest of the
; loop (v_min_u32, compare against the expected value, exec restore) has the
; same shape as the zero-offset variant.
define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB25_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
; Two chained GEPs: first the zero-extended per-lane offset, then the constant -128.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
; The bitcast only reinterprets the i32 result so it can be returned in a VGPR.
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
; Unsigned-min atomicrmw on an i32 at %sbase + zext(%voffset); the atomic's
; result is unused and the shader returns void.
; The expected output for gfx900, gfx1010 and gfx1100 is a compare-and-swap
; retry loop ending in s_endpgm: an initial load using the saddr form
; (v0, s[2:3]), v_min_u32 to form the new value, and a global_atomic_cmpswap on
; a full 64-bit VGPR address (saddr "off") whose returned value is copied back
; into v5 as the expected value for the next iteration.
define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB26_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address is the uniform base plus the zero-extended 32-bit per-lane offset.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
; No-return i32 umin case with an extra constant byte offset of -128
; (%sbase + zext(%voffset) - 128).
; In the expected output the constant appears as "offset:-128" on both the
; initial load and the global_atomic_cmpswap in the retry loop; otherwise the
; loop has the same shape as the zero-offset no-return variant and ends in
; s_endpgm.
define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB27_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Two chained GEPs: first the zero-extended per-lane offset, then the constant -128.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
; atomicrmw umin on i64 whose result is returned, bitcast to <2 x float>.
; The address is %sbase plus the zero-extended 32-bit %voffset, with no
; constant displacement.
; The expected output below picks the value to store with v_cmp_le_u64 and two
; v_cndmask_b32, issues a 64-bit global_atomic_cmpswap inside the
; %atomicrmw.start loop, and copies v[3:4] into v0/v1 after the loop.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB28_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset), as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
; The i64 result is reinterpreted as <2 x float> to match the return type.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; atomicrmw umin on i64 whose result is returned, bitcast to <2 x float>, with
; a constant -128 byte displacement.
; The address is %sbase plus the zero-extended 32-bit %voffset, then -128.
; The expected output below has the same shape as the variant without the
; displacement (v_cmp_le_u64 + v_cndmask_b32 feeding a 64-bit
; global_atomic_cmpswap in the %atomicrmw.start loop), with offset:-128 on
; both the initial load and the cmpswap.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB29_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB29_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB29_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
; The i64 result is reinterpreted as <2 x float> to match the return type.
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
; atomicrmw umin on i64, result unused, with no constant displacement.
; The address is %sbase plus the zero-extended 32-bit %voffset.
; The expected output below selects the value to store with v_cmp_le_u64 and
; two v_cndmask_b32, issues a 64-bit global_atomic_cmpswap inside the
; %atomicrmw.start loop, and ends with s_endpgm.
define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB30_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB30_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset), as a byte-wise GEP.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read; the function returns void.
%unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
ret void
}
; atomicrmw umin on i64, result unused, with a constant -128 byte displacement.
; The address is %sbase plus the zero-extended 32-bit %voffset, then -128.
; The expected output below has the same shape as the variant without the
; displacement (v_cmp_le_u64 + v_cndmask_b32 feeding a 64-bit
; global_atomic_cmpswap in the %atomicrmw.start loop), with offset:-128 on
; both the initial load and the cmpswap.
define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB31_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB31_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
; Address = %sbase + zext(%voffset) - 128, built from two byte-wise GEPs.
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read; the function returns void.
%unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
ret void
}
; Attribute group #0.
; NOTE(review): none of the function definitions visible in this part of the
; file reference #0 - confirm whether this group is still needed.
attributes #0 = { argmemonly nounwind willreturn }