llvm/llvm/test/CodeGen/AMDGPU/v_cndmask.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; Select between a NaN constant and a float loaded from memory, depending on
; whether %c is non-zero, and store the result to %out. The float is loaded
; from %fptr at the workitem id, and the checks show it arriving in a VGPR
; (presumably the "nosgpr" in the name).
; All nan values are converted to 0xffffffff, so every v_cndmask_b32 in the
; checks carries the NaN as the inline constant -1.
define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cnd_nan_nosgpr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dword s8, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_cmp_eq_u32 s8, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cnd_nan_nosgpr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cnd_nan_nosgpr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cnd_nan_nosgpr:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Load the float at index workitem.id.x of %fptr.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
  %f = load float, ptr addrspace(1) %f.gep
  ; The checks test %c == 0 instead and keep the constant as the first
  ; v_cndmask_b32 source, so the loaded value is picked when %c is zero.
  %setcc = icmp ne i32 %c, 0
  ; 0xFFFFFFFFE0000000 is the NaN literal that the checks expect as -1.
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, ptr addrspace(1) %out
  ret void
}

; Same select as v_cnd_nan_nosgpr, except that the float operand %f is a
; kernel argument and therefore starts out in an SGPR.
; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR use is the last operand, and it should
; never be moved.
; However, on GFX10 the constant bus is limited to two scalar operands, not
; one. The checks reflect this: SI and VI first copy %f into a VGPR and use
; the e32 form with vcc, while GFX10 and GFX11 feed both the SGPR holding %f
; and the SGPR-pair condition straight into the e64 form.
; All nan values are converted to 0xffffffff
define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
; SI-LABEL: v_cnd_nan:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cnd_nan:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cnd_nan:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cnd_nan:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_eq_u32 s2, 0
; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Uniform condition on the scalar argument %c.
  %setcc = icmp ne i32 %c, 0
  ; 0xFFFFFFFFE0000000 is the NaN literal that the checks expect as -1.
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, ptr addrspace(1) %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)
;
; In this kernel X and Z are both kernel arguments: X is compared against 0.0
; and the select yields 1.0 or Z, stored to %out at the workitem id. The
; unnamed [8 x i32] argument sits between %out and %x, and the checks load
; %out and the %x/%z pair with separate s_load instructions.
; Expected shape: SI and VI copy Z into a VGPR and use the e32 v_cndmask_b32
; with vcc; GFX10 and GFX11 keep Z in its SGPR and use the e64 form.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x13
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s5
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x4c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s0, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s1, s[2:3]
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Address of this workitem's output element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; The checks expect the inverted compare (v_cmp_nlg_f32) with the constant
  ; 1.0 kept as the first v_cndmask_b32 source.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, constant)), constant, sgprX)
; The kernel argument X is used twice: it is compared against 0.0 and is also
; the non-constant select operand (the other one is 1.0). The result is stored
; to %out at the workitem id.
; Expected shape: SI and VI copy X into a VGPR for the e32 v_cndmask_b32; GFX10
; and GFX11 read the same SGPR in both the compare and the e64 v_cndmask_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Address of this workitem's output element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %x feeds both the compare and the false side of the select.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, constant)), same constant, sgprZ)
; X and Z are kernel arguments. X is compared against 0.0 and the select
; yields 0.0 or Z, so the compare and the select share the constant. The
; result is stored to %out at the workitem id. The unnamed [8 x i32] argument
; sits between %out and %x.
; Expected shape: SI and VI copy Z into a VGPR and use the e32 v_cndmask_b32
; with vcc; GFX10 and GFX11 keep Z in its SGPR and use the e64 form.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x13
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s5
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x4c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s0, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s[2:3]
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s[4:5]
; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Address of this workitem's output element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; The constant 0.0 appears in both the compare and the select.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, constant)), same constant, sgprX)
; The kernel argument X is compared against 0.0, and the select yields 0.0 or
; X itself, so both the constant and the variable are shared between the
; compare and the select. The result is stored to %out at the workitem id.
; Expected shape: SI and VI copy X into a VGPR for the e32 v_cndmask_b32; GFX10
; and GFX11 read the same SGPR in both the compare and the e64 v_cndmask_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s4, s[2:3]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s4, s[2:3]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Address of this workitem's output element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Both 0.0 and %x appear in the compare as well as in the select.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, constant)), same constant, vgprZ)
; X is a kernel argument, while Z is loaded from %z.ptr at the workitem id and
; therefore lives in a VGPR. X is compared against 0.0 and the select yields
; 0.0 or Z, stored to %out at the workitem id.
; Expected shape: because Z is already in a VGPR, every target writes the
; compare result to vcc and uses the e32 v_cndmask_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-workitem input (%z.ptr) and output (%out) element addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Z comes from memory, so it is a per-lane value.
  %z = load float, ptr addrspace(1) %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, constant)), other constant, vgprZ)
; X is a kernel argument, while Z is loaded from %z.ptr at the workitem id and
; therefore lives in a VGPR. X is compared against 0.0 and the select yields
; 1.0 or Z, stored to %out at the workitem id.
; Expected shape: because Z is already in a VGPR, every target writes the
; compare result to vcc and uses the e32 v_cndmask_b32 with 1.0 as the
; constant source.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_load_dword s2, s[2:3], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-workitem input (%z.ptr) and output (%out) element addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Z comes from memory, so it is a per-lane value.
  %z = load float, ptr addrspace(1) %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (vgprX, constant)), other constant, sgprZ)
; X is loaded from %x.ptr at the workitem id (a VGPR value) and Z is a kernel
; argument. The select yields 1.0 when X is ordered-less-than 0.0 and Z
; otherwise, stored to %out at the workitem id.
; Expected shape: the compare shrinks to the e32 form on every target, written
; as the inverted, operand-swapped v_cmp_ngt_f32 with the constant 0 first.
; SI and VI copy Z into a VGPR for the e32 v_cndmask_b32; GFX10 and GFX11 use
; the e64 v_cndmask_b32 with Z still in its SGPR and vcc as the condition.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dword s8, s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v3, s8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dword s0, s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s0, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s0, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-workitem input (%x.ptr) and output (%out) element addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; X comes from memory, so the compare has a per-lane operand.
  %x = load float, ptr addrspace(1) %x.gep
  ; Ordered less-than; the other fcmp kernels above this one use "one".
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (vgprX, constant)), other constant, vgprZ)
; Both X and Z are loaded (volatile) from memory at the workitem id, so both
; are VGPR values. The select yields 1.0 when X is unordered-or-less-than 0.0
; and Z otherwise, stored to %out at the workitem id.
; Expected shape: with no SGPR data operands, every target uses the e32 forms
; of both the compare (inverted and operand-swapped as v_cmp_le_f32 with the
; constant 0 first) and v_cndmask_b32. The volatile loads show up as glc loads
; (glc dlc on GFX10 and GFX11), each followed by its own s_waitcnt vmcnt(0).
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-workitem input (%x.ptr, %z.ptr) and output (%out) element addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; NOTE(review): the loads are volatile, presumably to keep them as two
  ; separate, ordered loads - confirm the intent.
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  ; Unordered-or-less-than, so a NaN in %x selects 1.0.
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; Scalar i32 case: the stored value is the constant 2 when the volatile-loaded
; %x is negative (signed compare against 0), otherwise the volatile-loaded %z.
; For every target the expected output has one v_cmp_lt_i32 with -1 as its
; first operand feeding a single v_cndmask_b32_e32 whose first source is the
; inline constant 2; the constant is never moved into a register first.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v3, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 2, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; All three pointers are indexed by the workitem id, with i32 elements.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
  ; Both inputs are volatile loads; the expected output waits after each one.
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile i32, ptr addrspace(1) %z.gep
  ; The constant is the true operand of the select, the loaded %z the false one.
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, ptr addrspace(1) %out.gep
  ret void
}

; i64 variant of the i32 test: the stored value is the constant 2 when the
; volatile-loaded i64 %x is negative, otherwise the volatile-loaded i64 %z.
; The expected output performs one 64-bit v_cmp_lt_i64 (first operand -1) and
; then two 32-bit v_cndmask_b32_e32 that share vcc: the high half is selected
; against the inline constant 0 and the low half against the inline constant 2.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[2:3]
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v4, vcc
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; All three pointers are indexed by the workitem id, with i64 elements.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  ; Both inputs are volatile loads; the expected output waits after each one.
  %x = load volatile i64, ptr addrspace(1) %x.gep
  %z = load volatile i64, ptr addrspace(1) %z.gep
  ; The constant is the true operand of the select, the loaded %z the false one.
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, ptr addrspace(1) %out.gep
  ret void
}

; A scalar float compare (%x ugt 4.0) drives a <4 x float> select whose true
; operand is the loaded vector %z and whose false operand is the constant
; vector <1.0, 2.0, -0.5, 4.0>.
; The expected output has a single v_cmp_nge_f32 (4.0 as first operand)
; followed by four v_cndmask_b32_e32 that all reuse vcc, each taking one
; element of the constant vector as an inline first source.
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v5, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v7, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; %x is indexed as a float array, while %z and %out are indexed as arrays of
  ; <4 x float>, so the expected output computes two different offsets.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  ; The loaded vector is the true operand, the constant vector the false one.
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}

; Same compare as the vgprZ_k1 variant (%x ugt 4.0), but with the select
; operands swapped: the constant vector <1.0, 2.0, -0.5, 4.0> is the true
; operand and the loaded vector %z is the false operand.
; The expected output keeps the constants as the inline first source of the
; four v_cndmask_b32_e32 and instead changes the compare to v_cmp_ge_f32
; (4.0 as first operand); vcc is shared by all four selects.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v5, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v7, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; %x is indexed as a float array, while %z and %out are indexed as arrays of
  ; <4 x float>, so the expected output computes two different offsets.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  ; The constant vector is the true operand, the loaded vector the false one.
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.
;
; Here the compare operands are also reversed in the IR (4.0 ugt %x, constant
; on the left), with the constant vector as the true select operand. The
; expected output uses one v_cmp_le_f32 (4.0 as first operand) whose vcc result
; is shared by four v_cndmask_b32_e32 that keep the constants inline.
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v5, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v7, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; %x is indexed as a float array, while %z and %out are indexed as arrays of
  ; <4 x float>, so the expected output computes two different offsets.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  ; Note the constant is the left-hand compare operand in this variant.
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}

; i1 variant: selects the constant true when the volatile-loaded i32 %x is
; negative, otherwise the volatile-loaded i1 %z.
; Unlike the wider variants, the expected output has no select of a loaded
; register against a constant. The i1 is loaded as an unsigned byte, masked
; with 1 and compared for equality with 1; that mask is combined with the
; integer compare result by s_or_b64, and a v_cndmask_b32_e64 with operands
; 0 and 1 produces the byte that is stored.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_and_b32_e32 v3, 1, v3
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    flat_load_dword v2, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_ubyte v3, v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT:    v_and_b32_e32 v3, 1, v3
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_ubyte v3, v0, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX10-NEXT:    v_and_b32_e32 v1, 1, v3
; GFX10-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v1
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_u8 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT:    global_store_b8 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; %x is indexed as an i32 array; %z and %out are indexed as i1 arrays, which
  ; the expected output addresses with the unshifted workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile i1, ptr addrspace(1) %z.gep
  ; select(cond, true, %z) is what the expected output implements as an OR.
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, ptr addrspace(1) %out.gep
  ret void
}

; Different types compared vs. selected
;
; The condition is an f32 compare (%x ult 0.0) while the selected values are
; doubles: the constant 1.0 as the true operand and the loaded %z as the false
; operand. The expected output uses one v_cmp_le_f32 and two 32-bit
; v_cndmask_b32_e32 sharing vcc. The low half is selected against the inline
; constant 0 and the high half against 0x3ff00000 (the upper 32 bits of double
; 1.0). The SI and VI checks first move 0x3ff00000 into a VGPR, whereas the
; GFX10 and GFX11 checks use it directly as a literal operand of v_cndmask.
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v4, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; VI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v6
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v2, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 0, v4
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; %x is indexed as a float array, while %z and %out are indexed as double
  ; arrays, so the expected output computes two different offsets.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile double, ptr addrspace(1) %z.gep
  ; f32 condition, f64 select: constant is the true operand, %z the false one.
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, ptr addrspace(1) %out.gep
  ret void
}

; Different types compared vs. selected
; Each work-item indexes %x.ptr (float), %z.ptr (i64) and %out (i64) by its
; workitem id and stores (x one 0.0) ? 3 : z.
; The expected code compares with v_cmp_nlg_f32 (the complement of `one`) and
; emits one v_cndmask_b32_e32 per 32-bit register of the i64 value, with the
; constants 3 and 0 as the first source operand.
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v4, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v6
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v2, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v4
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-work-item element addresses; the same index is used for all three buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  ; Both inputs are read with volatile loads.
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile i64, ptr addrspace(1) %z.gep
  ; The compared type (float) differs from the selected type (i64).
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, ptr addrspace(1) %out.gep
  ret void
}

; Different types compared vs. selected
; Each work-item indexes %x.ptr (i32), %z.ptr (float) and %out (float) by its
; workitem id and stores (x ugt 1) ? 4.0 : z.
; The expected code tests the complementary condition (v_cmp_gt_u32 with 2 as
; the first source, i.e. 2 > x) and uses a single v_cndmask_b32_e32 whose
; first source is the constant 4.0.
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v3, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-work-item element addresses; the same index is used for all three buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Both inputs are read with volatile loads.
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  ; Integer compare driving a float select.
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; FIXME: Should be able to handle multiple uses
; One fcmp result (4.0 ugt x) feeds two selects that share the false operand z
; and differ only in the constant (-1.0 and -2.0); both results are written
; with volatile stores to the same output element.
; The expected code has a single v_cmp_nle_f32 (4.0 as first source) and two
; v_cndmask_b32_e64 with the constant in the second source position, unlike
; the single-use kernels in this file that use the _e32 form with the constant
; first.
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v2
; SI-NEXT:    v_cndmask_b32_e64 v2, v3, -1.0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v3, v3, -2.0, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v5
; VI-NEXT:    v_cndmask_b32_e64 v3, v2, -1.0, vcc
; VI-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; VI-NEXT:    flat_store_dword v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_store_dword v0, v2, s[4:5]
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-work-item element addresses; the same index is used for all three buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  ; The constant is the left-hand operand of the compare here.
  %setcc = fcmp ugt float 4.0, %x
  ; Two users of the same condition.
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  ; Volatile stores keep both selects live even though they target one address.
  store volatile float %select0, ptr addrspace(1) %out.gep
  store volatile float %select1, ptr addrspace(1) %out.gep
  ret void
}

; Source modifiers abs/neg only work for f32
; Loads a half indexed by the workitem id from %fptr and stores
; (c != 0) ? fabs(f) : fneg(f) to %out (not indexed).
; Expected code: the run without -mcpu converts the half to f32 twice, with
; the |v0| and -v0 modifiers on v_cvt_f32_f16_e64, selects, and converts back.
; The tonga, gfx1010 and gfx1100 runs compute abs/neg with v_and_b32 0x7fff
; and v_xor_b32 0x8000 and then use a plain v_cndmask_b32_e32.
define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_cmp_lg_u32 s0, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v0, v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the load is indexed by the workitem id; the store goes to %out itself.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
  %f = load half, ptr addrspace(1) %f.gep
  ; Both select operands are derived from the same loaded value.
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, ptr addrspace(1) %out
  ret void
}

; f32 variant: loads a float indexed by the workitem id from %fptr and stores
; (c != 0) ? fabs(f) : fneg(f) to %out (not indexed).
; On all four run lines the expected code has no separate abs/neg
; instructions: both appear as source modifiers (-v0, |v0|) on a single
; v_cndmask_b32_e64 whose condition is an SGPR pair set by s_cselect_b64.
define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT:    s_load_dword s8, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5]
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the load is indexed by the workitem id; the store goes to %out itself.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
  %f = load float, ptr addrspace(1) %f.gep
  ; Both select operands are derived from the same loaded value.
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, ptr addrspace(1) %out
  ret void
}

; f64 variant: loads a double indexed by the workitem id from %fptr and stores
; (c != 0) ? fabs(f) : fneg(f) to %out (not indexed).
; On all four run lines the expected code applies abs/neg with explicit
; v_and_b32 0x7fffffff / v_xor_b32 0x80000000 on the second register of the
; loaded pair (v1) and then emits two v_cndmask_b32_e32. The select for the
; first register has the same register as both sources (v0, v0).
define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[2:3], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_cmp_lg_u32 s0, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s4, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the load is indexed by the workitem id; the store goes to %out itself.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
  %f = load double, ptr addrspace(1) %f.gep
  ; Both select operands are derived from the same loaded value.
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, ptr addrspace(1) %out
  ret void
}

; Attribute groups referenced above: #0 is attached to the kernel definitions,
; #1 to the @llvm.amdgcn.workitem.id.x declaration and its call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }