; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; Baseline f32 case: passes the float argument straight to
; llvm.amdgcn.rsq.clamp.f32 and returns the result.
; Expected code for each run line, as recorded in the checks below:
;   - SI checks (tahiti): a single v_rsq_clamp_f32_e32.
;   - VI checks (tonga): v_rsq_f32, then v_min_f32 against 0x7f7fffff and
;     v_max_f32 against 0xff7fffff (the bit patterns of plus/minus the
;     largest finite f32).
;   - GFX12 checks (gfx1200): v_rsq_f32, then one v_minmax_num_f32 that takes
;     0x7f7fffff as a literal operand and 0xff7fffff from v1.
define float @v_rsq_clamp_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, v0
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e32 v0, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
}
; f32 case with an llvm.fabs.f32 feeding the rsq.clamp intrinsic.
; The checks expect no separate absolute-value instruction: every run line
; shows the fabs as a |v0| source modifier on the e64 encoding of the rsq
; instruction (v_rsq_clamp_f32_e64 in the SI checks, v_rsq_f32_e64 in the VI
; and GFX12 checks). The clamp sequence after the rsq is unchanged from the
; plain f32 case: min/max against 0x7f7fffff / 0xff7fffff in the VI checks,
; a single v_minmax_num_f32 in the GFX12 checks.
define float @v_rsq_clamp_fabs_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0|
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e64 v0, |v0|
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_fabs_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e64 v0, |v0|
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
ret float %rsq_clamp
}
; Baseline f64 case: passes the double argument straight to
; llvm.amdgcn.rsq.clamp.f64 and returns the result.
; Expected code for each run line, as recorded in the checks below:
;   - SI checks (tahiti): a single v_rsq_clamp_f64_e32.
;   - VI checks (tonga): v_rsq_f64, then each clamp bound is built in v[2:3]
;     with two v_mov_b32 (low dword -1, high dword 0x7fefffff for the upper
;     bound and 0xffefffff for the lower bound, i.e. plus/minus the largest
;     finite f64) and applied with v_min_f64 followed by v_max_f64.
;   - GFX12 checks (gfx1200): the same shape using v_min_num_f64 and
;     v_max_num_f64, with s_delay_alu lines in front of each of them.
define double @v_rsq_clamp_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
}
; f64 case with an llvm.fabs.f64 feeding the rsq.clamp intrinsic.
; As in the f32 fabs test, the checks expect the absolute value to appear
; only as a |v[0:1]| source modifier on the e64 encoding of the rsq
; instruction (v_rsq_clamp_f64_e64 in the SI checks, v_rsq_f64_e64 in the VI
; and GFX12 checks). The min/max clamp sequence that follows in the VI and
; GFX12 checks is the same as in the plain f64 case.
define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_fabs_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call double @llvm.fabs.f64(double %src)
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
ret double %rsq_clamp
}
; f32 case with an undef operand: the function takes no arguments and calls
; llvm.amdgcn.rsq.clamp.f32 on `float undef`.
; The checks show the call is still selected rather than folded away, with
; an otherwise uninitialised SGPR as the source: s4 in the SI and VI checks,
; s0 in the GFX12 checks. The GFX12 checks also differ from the plain f32
; case in using v_s_rsq_f32 with an SGPR destination, whose result (s0) is
; then fed to v_minmax_num_f32.
; NOTE(review): the source register for an undef operand is presumably
; arbitrary, so these checks may churn with unrelated allocator changes.
; NOTE(review): some newer LLVM tests use `poison` in place of `undef`;
; switching would need the checks regenerated -- confirm project policy.
define float @v_rsq_clamp_undef_f32() #0 {
; SI-LABEL: v_rsq_clamp_undef_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, s4
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_undef_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_s_rsq_f32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
ret float %rsq_clamp
}
; f64 case with an undef operand: the function takes no arguments and calls
; llvm.amdgcn.rsq.clamp.f64 on `double undef`.
; The checks show the call is still selected rather than folded away, with
; an otherwise uninitialised SGPR pair as the source: s[4:5] in the SI and
; VI checks, s[0:1] in the GFX12 checks. The destination stays v[0:1] and
; the min/max clamp sequence in the VI and GFX12 checks matches the plain
; f64 case.
; NOTE(review): the source register for an undef operand is presumably
; arbitrary, so these checks may churn with unrelated allocator changes.
; NOTE(review): some newer LLVM tests use `poison` in place of `undef`;
; switching would need the checks regenerated -- confirm project policy.
define double @v_rsq_clamp_undef_f64() #0 {
; SI-LABEL: v_rsq_clamp_undef_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_undef_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
ret double %rsq_clamp
}
; Same IR body as v_rsq_clamp_f32, but the function carries attribute group
; #2 ("amdgpu-ieee"="false", defined at the end of this file) instead of #0.
; The recorded checks contain the same instruction sequences as the
; v_rsq_clamp_f32 checks for all three run lines, so this test pins down
; that the attribute does not change the expected output for this intrinsic.
define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
; SI-LABEL: v_rsq_clamp_f32_non_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32_non_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, v0
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f32_non_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e32 v0, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
}
; Same IR body as v_rsq_clamp_f64, but the function carries attribute group
; #2 ("amdgpu-ieee"="false", defined at the end of this file) instead of #0.
; The recorded checks contain the same instruction sequences as the
; v_rsq_clamp_f64 checks for all three run lines, so this test pins down
; that the attribute does not change the expected output for this intrinsic.
define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; SI-LABEL: v_rsq_clamp_f64_non_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64_non_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, -1
; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
}
; Intrinsics referenced by the tests above: the generic fabs intrinsic and
; the AMDGPU rsq.clamp intrinsic, each in an f32 and an f64 overload. All
; four declarations share attribute group #1.
declare float @llvm.fabs.f32(float) #1
declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
; Attribute groups:
;   #0 - applied to every test function except the two *_non_ieee ones.
;   #1 - applied to the intrinsic declarations above.
;   #2 - applied to the *_non_ieee test functions; adds the string attribute
;        "amdgpu-ieee"="false" on top of nounwind.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-ieee"="false" }