; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s
; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s
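; With IEEE f32 denormal handling (attribute #3) and no !fpmath, fdiv 1.0, %x cannot be
; folded to v_rcp_f32 and expands to the full v_div_scale/v_div_fmas/v_div_fixup sequence.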
define float @v_rcp_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x
ret float %rcp
}
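; With unsafe-fp-math (attribute #4) the reciprocal folds to a single v_rcp_f32.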
define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
; GCN-LABEL: v_rcp_f32_ieee_unsafe:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_unsafe:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x
ret float %rcp
}
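; nofpclass(sub) rules out subnormal inputs, but without !fpmath the full IEEE expansion is still emitted.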
define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x
ret float %rcp
}
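; nnan ninf alone does not permit the v_rcp_f32 fold; the IEEE expansion is kept.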
define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_nnan_ninf:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_nnan_ninf:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_nnan_ninf:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv nnan ninf float 1.0, %x
ret float %rcp
}
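; fdiv -1.0, %x keeps the expansion and folds the -1.0 numerator into the div_scale/div_fixup operands.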
define float @v_neg_rcp_f32_ieee(float %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float -1.0, %x
ret float %rcp
}
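; With preserve-sign (DAZ) f32 denormals (attribute #0), the expansion brackets the FMA core
; with s_setreg writes that switch the f32 denormal mode on and back off.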
define float @v_rcp_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x
ret float %rcp
}
define float @v_neg_rcp_f32_daz(float %x) #0 {
; SI-LABEL: v_neg_rcp_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float -1.0, %x
ret float %rcp
}
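; !fpmath !0 (2.5 ulp) on the IEEE path allows the frexp_mant/rcp/ldexp expansion;
; SI additionally compares |x| against +infinity to pass non-finite inputs through unscaled.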
define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x, !fpmath !0
ret float %rcp
}
define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x, !fpmath !0
ret float %rcp
}
define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float -1.0, %x, !fpmath !0
ret float %rcp
}
define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0
ret float %rcp
}
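; DAZ plus 2.5 ulp folds the reciprocal to a bare v_rcp_f32 on GCN.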
define float @v_rcp_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float 1.0, %x, !fpmath !0
ret float %rcp
}
define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float -1.0, %x, !fpmath !0
ret float %rcp
}
define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_neg_rcp_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, -v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%rcp = fdiv float -1.0, %x, !fpmath !0
ret float %rcp
}
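; Reciprocal of fabs(x): the |x| source modifier is folded into div_fixup here, and into
; v_rcp/frexp in the ulp25 and DAZ variants below.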
define float @v_rcp_fabs_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float 1.0, %fabs.x
ret float %rcp
}
define float @v_rcp_fabs_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_fabs_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float 1.0, %fabs.x
ret float %rcp
}
define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float 1.0, %fabs.x, !fpmath !0
ret float %rcp
}
define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, |v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float 1.0, %fabs.x, !fpmath !0
ret float %rcp
}
define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float -1.0, %fabs.x
ret float %rcp
}
define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_neg_fabs_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float -1.0, %fabs.x
ret float %rcp
}
define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float -1.0, %fabs.x, !fpmath !0
ret float %rcp
}
define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, -|v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
%fabs.x = call float @llvm.fabs.f32(float %x)
%rcp = fdiv float -1.0, %fabs.x, !fpmath !0
ret float %rcp
}
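; Kernel variants with a uniform (SGPR) source and DAZ + 2.5 ulp: a single v_rcp_f32 reads the
; scalar operand directly; R600 selects RECIP_IEEE.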
define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
%rcp = fdiv float 1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
%rcp = fdiv float 1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
%rcp = fdiv fast float 1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
%rcp = fdiv arcp float 1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 {
; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
%rcp = fdiv float 1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fabs_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, |s4|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fabs_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, |s4|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fabs_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, |KC0[2].Z|,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_fabs_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|,
%src.fabs = call float @llvm.fabs.f32(float %src)
%rcp = fdiv float 1.0, %src.fabs, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
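; Negated source: GCN folds the negation as a -s4 source modifier; R600 multiplies the
; RECIP_IEEE result by -1.0.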
define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_neg_rcp_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, -s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_neg_rcp_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, -s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_neg_rcp_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: RECIP_IEEE * T0.X, KC0[2].Z,
; EG-NEXT: MUL_IEEE T0.X, literal.x, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
;
; CM-LABEL: s_neg_rcp_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: RECIP_IEEE T0.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T0.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T0.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T0.W (MASKED), KC0[2].Z,
; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X,
; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%rcp = fdiv float -1.0, %src, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, -|s4|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: RECIP_IEEE * T0.X, |KC0[2].Z|,
; EG-NEXT: MUL_IEEE T0.X, literal.x, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
;
; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: RECIP_IEEE T0.X, |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X,
; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%src.fabs = call float @llvm.fabs.f32(float %src)
%src.fabs.fneg = fneg float %src.fabs
%rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, -|s4|
; SI-NEXT: v_mul_f32_e64 v1, s4, -|s4|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4|
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|,
; EG-NEXT: RECIP_IEEE * T0.Y, |KC0[2].Z|,
; EG-NEXT: MUL_IEEE T1.X, literal.x, PS,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.y,
; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
;
; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T0.X (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T0.Y, |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
; CM-NEXT: MUL_IEEE * T1.X, literal.x, PV.Y,
; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%src.fabs = call float @llvm.fabs.f32(float %src)
%src.fabs.fneg = fneg float %src.fabs
%rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
store volatile float %rcp, ptr addrspace(1) %out, align 4
%other = fmul float %src, %src.fabs.fneg
store volatile float %other, ptr addrspace(1) %out, align 4
ret void
}
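; x / 2.0 with arcp becomes a multiply by the inline constant 0.5.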
define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
; SI-LABEL: s_div_arcp_2_x_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0x0
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, s4, 0.5
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x0
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_div_arcp_2_x_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: TEX 0 @4
; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 4:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MUL_IEEE T0.X, T0.X, 0.5,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_div_arcp_2_x_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: TEX 0 @4
; CM-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 4:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MUL_IEEE * T0.X, T0.X, 0.5,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%x = load float, ptr addrspace(1) undef
%rcp = fdiv arcp float %x, 2.0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
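; x / 10.0 with arcp becomes a multiply by the reciprocal constant 0x3dcccccd (~0.1).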
define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
; SI-LABEL: s_div_arcp_k_x_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0x0
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x0
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v2, s4, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_div_arcp_k_x_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: TEX 0 @4
; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 4:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 1036831949(1.000000e-01), 2(2.802597e-45)
;
; CM-LABEL: s_div_arcp_k_x_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: TEX 0 @4
; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 4:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x,
; CM-NEXT: 1036831949(1.000000e-01), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%x = load float, ptr addrspace(1) undef
%rcp = fdiv arcp float %x, 10.0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0x0
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x0
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v2, s4, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: TEX 0 @4
; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 4:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1110651699(-1.000000e-01), 2(2.802597e-45)
;
; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: TEX 0 @4
; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 4:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x,
; CM-NEXT: -1110651699(-1.000000e-01), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%x = load float, ptr addrspace(1) undef
%rcp = fdiv arcp float %x, -10.0
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
declare float @llvm.fabs.f32(float) #1
declare float @llvm.sqrt.f32(float) #1
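; #0: DAZ (preserve-sign f32 denormals), #2/#4: unsafe-fp-math, #3: IEEE f32 denormals.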
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" }
!0 = !{float 2.500000e+00}