; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; Intrinsics referenced by the tests in this file: the unsigned max/min
; helpers feed the "max/min of powers of two" cases, and the vector ldexp
; is called directly by fmul_pow2_ldexp_4xfloat.
declare i16 @llvm.umax.i16(i16, i16)
declare i64 @llvm.umin.i64(i64, i64)
declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
; Per lane: 9.0 * uitofp(1 << %i) on <4 x float>.
; The expected output on all three targets keeps the literal sequence
; shift -> unsigned-int-to-float convert -> multiply; 0x41100000 is the f32
; bit pattern of 9.0. On gfx1100 the four multiplies are paired into two
; dual-issue multiply instructions.
define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
; VI-LABEL: fmul_pow2_4xfloat:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 1
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 1
; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_cvt_f32_u32_e32 v2, v2
; VI-NEXT: v_cvt_f32_u32_e32 v3, v3
; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT: v_mul_f32_e32 v1, 0x41100000, v1
; VI-NEXT: v_mul_f32_e32 v2, 0x41100000, v2
; VI-NEXT: v_mul_f32_e32 v3, 0x41100000, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_4xfloat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 1
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 1
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x41100000, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x41100000, v2
; GFX10-NEXT: v_mul_f32_e32 v3, 0x41100000, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_4xfloat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 1
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
; GFX11-NEXT: v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
%p2_f = uitofp <4 x i32> %p2 to <4 x float>
%r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
ret <4 x float> %r
}
; Calls the vector ldexp intrinsic directly with a splat of 9.0 and the
; exponent vector %i. The expected output is one v_ldexp_f32 per lane.
; On fiji the 9.0 literal (0x41100000) is first materialized in s4; on
; gfx1010 and gfx1100 it is used as an inline literal operand.
define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; VI-LABEL: fmul_pow2_ldexp_4xfloat:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x41100000
; VI-NEXT: v_ldexp_f32 v0, s4, v0
; VI-NEXT: v_ldexp_f32 v1, s4, v1
; VI-NEXT: v_ldexp_f32 v2, s4, v2
; VI-NEXT: v_ldexp_f32 v3, s4, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ldexp_f32 v0, 0x41100000, v0
; GFX10-NEXT: v_ldexp_f32 v1, 0x41100000, v1
; GFX10-NEXT: v_ldexp_f32 v2, 0x41100000, v2
; GFX10-NEXT: v_ldexp_f32 v3, 0x41100000, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ldexp_f32 v0, 0x41100000, v0
; GFX11-NEXT: v_ldexp_f32 v1, 0x41100000, v1
; GFX11-NEXT: v_ldexp_f32 v2, 0x41100000, v2
; GFX11-NEXT: v_ldexp_f32 v3, 0x41100000, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
; Per lane: 9.0 / uitofp(1 << %i) on <4 x float>.
; The expected output contains no floating-point instruction at all: the
; shift count is moved up to bit 23 (the low end of the f32 exponent field)
; and subtracted, as an integer, from 0x41100000 (the bit pattern of 9.0).
define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
; VI-LABEL: fdiv_pow2_4xfloat:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v1
; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2
; VI-NEXT: v_lshlrev_b32_e32 v3, 23, v3
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41100000, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 0x41100000, v1
; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41100000, v2
; VI-NEXT: v_sub_u32_e32 v3, vcc, 0x41100000, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow2_4xfloat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 23, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 23, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow2_4xfloat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 23, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 23, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
%p2_f = uitofp <4 x i32> %p2 to <4 x float>
%r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
ret <4 x float> %r
}
; Half-precision vector ldexp, called by fmul_pow2_ldexp_8xhalf below.
declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
; Per lane: 0xH7000 * uitofp(1 << %i) on <8 x half> (0xH7000 is 8192.0 in
; IEEE half precision).
; The expected output keeps the shift / convert / multiply sequence. fiji
; handles the high 16-bit halves with SDWA forms and recombines with v_or;
; gfx1010 and gfx1100 use packed 16-bit shifts and packed multiplies, with
; the conversions done per half and repacked via v_pack_b32_f16.
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; VI-LABEL: fmul_pow2_8xhalf:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 1
; VI-NEXT: v_lshlrev_b16_e64 v4, v3, 1
; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v6, v2, 1
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v7, v1, 1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, v0, 1
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f16_u16_e32 v5, v8
; VI-NEXT: v_cvt_f16_u16_e32 v1, v1
; VI-NEXT: v_cvt_f16_u16_e32 v7, v7
; VI-NEXT: v_cvt_f16_u16_e32 v2, v2
; VI-NEXT: v_cvt_f16_u16_e32 v6, v6
; VI-NEXT: v_cvt_f16_u16_e32 v3, v3
; VI-NEXT: v_cvt_f16_u16_e32 v4, v4
; VI-NEXT: v_mov_b32_e32 v8, 0x7000
; VI-NEXT: v_mul_f16_e32 v4, 0x7000, v4
; VI-NEXT: v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v6, 0x7000, v6
; VI-NEXT: v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v7, 0x7000, v7
; VI-NEXT: v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v5, 0x7000, v5
; VI-NEXT: v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: v_or_b32_e32 v1, v7, v1
; VI-NEXT: v_or_b32_e32 v2, v6, v2
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3
; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2
; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1
; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0
; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0
; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2
; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3
; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_8xhalf:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f16_u16_e32 v4, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_cvt_f16_u16_e32 v5, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v6, v6
; GFX11-NEXT: v_cvt_f16_u16_e32 v7, v7
; GFX11-NEXT: v_cvt_f16_u16_e32 v2, v2
; GFX11-NEXT: v_cvt_f16_u16_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2
; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
ret <8 x half> %r
}
; Calls the half-precision vector ldexp intrinsic directly with a splat of
; 0xH7000 and the exponent vector %i. The expected output is eight
; v_ldexp_f16 operations (one per lane) followed by recombination of the
; 16-bit halves: v_or on fiji, v_pack_b32_f16 on gfx1010 and gfx1100.
define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; VI-LABEL: fmul_pow2_ldexp_8xhalf:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0x7000
; VI-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3
; VI-NEXT: v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
; VI-NEXT: v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
; VI-NEXT: v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
; VI-NEXT: v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v8, v0
; VI-NEXT: v_or_b32_e32 v1, v7, v1
; VI-NEXT: v_or_b32_e32 v2, v6, v2
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000
; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3
; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0
; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1
; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2
; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_ldexp_f16_e32 v5, 0x7000, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_ldexp_f16_e32 v1, 0x7000, v1
; GFX11-NEXT: v_ldexp_f16_e32 v0, 0x7000, v0
; GFX11-NEXT: v_ldexp_f16_e32 v6, 0x7000, v6
; GFX11-NEXT: v_ldexp_f16_e32 v7, 0x7000, v7
; GFX11-NEXT: v_ldexp_f16_e32 v2, 0x7000, v2
; GFX11-NEXT: v_ldexp_f16_e32 v3, 0x7000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2
; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
}
; Per lane: 0xH7000 / uitofp(1 << %i) on <8 x half>.
; As in the f32 division test, the expected output is integer-only: the
; shift count is moved up by 10 bits (the low end of the f16 exponent
; field) and subtracted from the 0x7000 bit pattern. gfx1010 and gfx1100
; do this with packed 16-bit shift/sub; fiji uses scalar-per-half ops with
; SDWA for the high halves and v_or to recombine.
define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; VI-LABEL: fdiv_pow2_8xhalf:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 10
; VI-NEXT: v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v6, 0x7000
; VI-NEXT: v_lshlrev_b16_e32 v3, 10, v3
; VI-NEXT: v_lshlrev_b16_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v2, 10, v2
; VI-NEXT: v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v1, 10, v1
; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT: v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0
; VI-NEXT: v_sub_u16_e32 v1, 0x7000, v1
; VI-NEXT: v_sub_u16_e32 v2, 0x7000, v2
; VI-NEXT: v_sub_u16_e32 v3, 0x7000, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v8
; VI-NEXT: v_or_b32_e32 v2, v2, v7
; VI-NEXT: v_or_b32_e32 v3, v3, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow2_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow2_8xhalf:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
ret <8 x half> %r
}
; Scalar f64 case: 9.0 * uitofp(1 << %cnt) with an i64 shift (nuw).
; The expected output keeps the full u64 -> f64 conversion (convert each
; 32-bit half, scale the high half by 2^32 with v_ldexp_f64, add) followed
; by a v_mul_f64 by 9.0. 0x40220000 is the high dword of the f64 encoding
; of 9.0; fiji materializes the constant in s[4:5].
define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT: s_mov_b32 s5, 0x40220000
; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i64 1, %cnt
%conv = uitofp i64 %shl to double
%mul = fmul double 9.000000e+00, %conv
ret double %mul
}
; Variant of fmul_pow_shl_cnt with a shift base of 2 instead of 1 and a
; negative multiplier: -9.0 * uitofp(2 << %cnt).
; The expected sequence is the same as for fmul_pow_shl_cnt; only the shift
; base (2) and the constant's high dword (0xc0220000, i.e. 9.0 with the sign
; bit set) differ.
define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT: s_mov_b32 s5, 0xc0220000
; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i64 2, %cnt
%conv = uitofp i64 %shl to double
%mul = fmul double -9.000000e+00, %conv
ret double %mul
}
; The power of two comes from a select between (1 << %cnt) and (2 << %cnt),
; controlled by %c; the result is converted to float and multiplied by 9.0.
; In the expected output the select has been moved onto the shift base: a
; single v_cndmask picks 1 (when %c is true) or 2, and only one shift is
; emitted, followed by the convert and the multiply.
define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; VI-LABEL: fmul_pow_select:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 1, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_select:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_select:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl2 = shl nuw i32 2, %cnt
%shl1 = shl nuw i32 1, %cnt
%shl = select i1 %c, i32 %shl1, i32 %shl2
%conv = uitofp i32 %shl to float
%mul = fmul float 9.000000e+00, %conv
ret float %mul
}
; The power of two is umin(8 << %cnt, 8192) computed in i64, converted to
; float and multiplied by 9.0.
; The expected output keeps everything explicit: a 64-bit shift, a 64-bit
; compare against 0x2000 with two v_cndmask selects implementing the umin,
; the u64 -> f32 conversion sequence (leading-zero count, normalize, sticky
; bit via v_min_u32 1, convert, v_ldexp_f32), and finally the multiply.
; gfx1100 spells the leading-zero count v_clz_i32_u32 where the other two
; targets print v_ffbh_u32.
define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
; VI-LABEL: fmul_fly_pow_mul_min_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; VI-NEXT: s_mov_b64 s[4:5], 0x2000
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 0x2000
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_ffbh_u32_e32 v2, v1
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl8 = shl nuw i64 8, %cnt
%shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
%conv = uitofp i64 %shl to float
%mul = fmul float 9.000000e+00, %conv
ret float %mul
}
; The power of two is umax(1 << %cnt, 2 << %cnt) computed in i16, converted
; to double and multiplied by 3.0 (high dword 0x40080000).
; In the expected output the umax has disappeared: only a single 16-bit
; shift of the constant 2 remains, followed by the convert and the f64
; multiply. gfx1010 and gfx1100 additionally mask the shift result to 16
; bits before converting; the fiji output has no such mask.
define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
; VI-LABEL: fmul_pow_mul_max_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s5, 0x40080000
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_mul_max_pow2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_mul_max_pow2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl2 = shl nuw i16 2, %cnt
%shl1 = shl nuw i16 1, %cnt
%shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
%conv = uitofp i16 %shl to double
%mul = fmul double 3.000000e+00, %conv
ret double %mul
}
; Negative test (per the "fail_maybe_non_pow2" name): the shifted value %v
; is an arbitrary argument rather than a constant, so (%v << %cnt) is not
; known to be a power of two.
; The expected output is the generic sequence: 64-bit shift of v[0:1] by
; v2, full u64 -> f64 conversion, then v_mul_f64 by 9.0.
define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT: s_mov_b32 s5, 0x40220000
; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i64 %v, %cnt
%conv = uitofp i64 %shl to double
%mul = fmul double 9.000000e+00, %conv
ret double %mul
}
; Negative test (per the "fail_expensive_cast" name): per lane
; 15.0 * uitofp(2 << %cnt), where the shift is done in <2 x i64> but the
; result is <2 x float>, so the integer and floating-point element widths
; differ.
; The expected output keeps, for each lane, the 64-bit shift, the u64 -> f32
; conversion sequence (leading-zero count, normalize, sticky bit, convert,
; v_ldexp_f32) and the multiply by 0x41700000 (15.0 as f32). gfx1100 pairs
; the two final multiplies into one dual-issue instruction.
define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2
; VI-NEXT: v_ffbh_u32_e32 v3, v2
; VI-NEXT: v_min_u32_e32 v5, 32, v3
; VI-NEXT: v_lshlrev_b64 v[1:2], v5, v[1:2]
; VI-NEXT: v_lshlrev_b64 v[3:4], v0, 2
; VI-NEXT: v_min_u32_e32 v0, 1, v1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_cvt_f32_u32_e32 v2, v0
; VI-NEXT: v_ffbh_u32_e32 v0, v4
; VI-NEXT: v_min_u32_e32 v6, 32, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], v6, v[3:4]
; VI-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_ldexp_f32 v1, v2, v3
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
; VI-NEXT: v_ldexp_f32 v0, v0, v2
; VI-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
; VI-NEXT: v_mul_f32_e32 v1, 0x41700000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v5
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v4
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
; GFX10-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x41700000, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1
; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v4, 32, v4
; GFX11-NEXT: v_min_u32_e32 v5, 32, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v5
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x float>
%mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
ret <2 x float> %mul
}
; Per lane: 15.0 * uitofp(2 << %cnt), <2 x i64> -> <2 x double>, with a splat
; shift base and a splat multiplier.
; The expected output for all three targets keeps the 64-bit shift, converts
; each 32-bit half to double (the high half scaled by ldexp ..., 32), adds the
; halves, and then multiplies by 15.0 (high dword 0x402e0000, low dword 0).
define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2
; VI-NEXT: s_mov_b32 s5, 0x402e0000
; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The shift carries both nsw and nuw, so an overflowing shift is poison.
%shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x double>
%mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
ret <2 x double> %mul
}
; Per lane: 5.0 * uitofp(2 << %cnt) + %add, <4 x i32> -> <4 x float>.
; The multiply feeds an fadd, which is what the "preserve_fma" name refers to
; (presumably the multiply is meant to stay available for fusing with the add;
; confirm against the combine being tested).
; In the expected output the shift, the u32 -> f32 conversion, the multiply by
; 5.0 (0x40a00000) and the add all remain as separate instructions; gfx1100
; pairs the multiplies and the adds into v_dual_* instructions.
define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 2
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 2
; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 2
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2
; VI-NEXT: v_cvt_f32_u32_e32 v3, v3
; VI-NEXT: v_cvt_f32_u32_e32 v2, v2
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3
; VI-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2
; VI-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1
; VI-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_add_f32_e32 v1, v1, v5
; VI-NEXT: v_add_f32_e32 v2, v2, v6
; VI-NEXT: v_add_f32_e32 v3, v3, v7
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 2
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 2
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 2
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2
; GFX10-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 2
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 2
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 2
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1
; GFX11-NEXT: v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
%conv = uitofp <4 x i32> %shl to <4 x float>
%mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
; The only user of the product is this add.
%res = fadd <4 x float> %mul, %add
ret <4 x float> %res
}
; Same shape as the splat <2 x i64> -> <2 x double> multiply, but the fmul
; constant is non-splat: lane 0 is 15.0 and lane 1 is 14.0. The shift base is
; still the splat value 2.
; The expected output keeps shift + conversion + multiply; the two lanes use
; different multiplier high dwords (0x402e0000 for 15.0, 0x402c0000 for 14.0).
; The "_todo" suffix presumably marks a fold that is not implemented yet for
; non-splat constants; confirm against the combine.
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1
; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2
; VI-NEXT: s_mov_b32 s5, 0x402e0000
; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2
; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32
; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1
; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1]
; VI-NEXT: v_mul_f64 v[0:1], v[2:3], s[4:5]
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s5, 0x402c0000
; VI-NEXT: v_mul_f64 v[2:3], v[4:5], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX10-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX11-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x double>
; Non-splat multiplier: the lanes differ (15.0 vs 14.0).
%mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv
ret <2 x double> %mul
}
; Counterpart of the non-splat multiplier case: here the multiplier is the
; splat 15.0 and the shift base is non-splat (lane 0 shifts 2, lane 1 shifts 1).
; The expected output keeps shift + conversion + multiply, with the per-lane
; shift constants 2 and 1 visible in the two 64-bit shifts.
; The "_todo" suffix presumably marks a fold that is not implemented yet for
; non-splat shift bases; confirm against the combine.
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 1
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2
; VI-NEXT: s_mov_b32 s5, 0x402e0000
; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Non-splat shift base: the lanes differ (2 vs 1).
%shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x double>
%mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
ret <2 x double> %mul
}
; Per lane: 15.0 * uitofp(2 << %cnt), <2 x i16> -> <2 x half>.
; The expected output keeps the 16-bit shift, the u16 -> f16 conversion and the
; multiply by half 15.0 (0x4b80); gfx1010/gfx1100 use packed shift and packed
; multiply instructions.
; The "fail_to_large" name presumably means the fold is expected to be rejected
; because the shifted value can be too large for the half format; confirm
; against the combine.
define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 2
; VI-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f16_u16_e32 v1, v1
; VI-NEXT: v_mov_b32_e32 v2, 0x4b80
; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 0x4b80, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0
; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
%conv = uitofp <2 x i16> %shl to <2 x half>
%mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
ret <2 x half> %mul
}
; Scalar: 9.745314e+288 * uitofp(1 << %cnt), i64 -> double, shift flagged nuw
; only.
; The expected output keeps the 64-bit shift, the two-half u64 -> f64
; conversion (cvt, ldexp by 32, add) and a v_mul_f64 by the constant
; materialised as the dword pair 0xff5f3992 (low) / 0x7befffff (high).
; The "fail_maybe_bad_exp" name presumably means the fold is expected to be
; rejected because adding the i64 shift count to the multiplier's exponent
; could leave the valid exponent range; confirm against the combine.
define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; VI-NEXT: s_mov_b32 s4, 0xff5f3992
; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT: s_mov_b32 s5, 0x7befffff
; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992
; GFX10-NEXT: s_mov_b32 s5, 0x7befffff
; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1
; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992
; GFX11-NEXT: s_mov_b32 s1, 0x7befffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i64 1, %cnt
%conv = uitofp i64 %shl to double
%mul = fmul double 9.745314e+288, %conv
ret double %mul
}
; Scalar: 9.745314e+288 * uitofp(1 << %cnt) with an i16 shift and a double
; result; same multiplier as the i64 "maybe_bad_exp" variant.
; NOTE(review): despite the "_safe" suffix, the expected output for all three
; targets still contains the 16-bit shift, a v_cvt_f64_u32 and a v_mul_f64, so
; no exponent-arithmetic fold is visible here. Presumably the name refers to
; the narrow i16 type bounding the power of two; confirm whether the fold is
; intentionally not applied on this target.
; gfx1010/gfx1100 additionally mask the shifted value with 0xffff before the
; conversion.
define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_safe:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_mov_b32 s4, 0xff5f3992
; VI-NEXT: s_mov_b32 s5, 0x7befffff
; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_safe:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992
; GFX10-NEXT: s_mov_b32 s5, 0x7befffff
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_safe:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992
; GFX11-NEXT: s_mov_b32 s1, 0x7befffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i16 1, %cnt
%conv = uitofp i16 %shl to double
%mul = fmul double 9.745314e+288, %conv
ret double %mul
}
; Per lane: 1.0 / uitofp(1 << %cnt), <2 x i64> -> <2 x double>.
; The expected output contains no conversion and no division. For each lane the
; count register (v0 / v2) is shifted left by 20 and subtracted, through a
; 64-bit sub/sub-with-borrow pair, from the high dword 0x3ff00000 of 1.0; the
; low dword is computed as 0 - 0. A shift of 20 in the high dword corresponds
; to bit 52 of the double, i.e. the exponent field.
define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_vec:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3ff00000
; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v4, 20, v2
; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 20, v2
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 20, v2
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x double>
; The quotient is named %mul even though the operation is an fdiv.
%mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
ret <2 x double> %mul
}
; Per lane: 1.0 / uitofp(1 << %cnt), <2 x i64> -> <2 x float>. The "expensive
; cast" in the name is the i64 -> float conversion.
; The expected output contains neither that conversion nor a division: each
; lane's count register (v0 / v2) is shifted left by 23 and subtracted as a
; 32-bit integer from the bit pattern of 1.0 (printed as the inline constant
; 1.0). A shift of 23 reaches the float exponent field.
define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; VI-NEXT: v_sub_u32_e32 v0, vcc, 1.0, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 1.0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x float>
; The quotient is named %mul even though the operation is an fdiv.
%mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
ret <2 x float> %mul
}
; Scalar: -9.0 / uitofp(8 << %cnt), i64 -> float. The shl has no nuw/nsw
; flags, so the set bit may be shifted out and the divisor may be zero
; (presumably what "maybe_z" stands for).
; The expected output keeps everything: the 64-bit shift, the u64 -> f32
; conversion sequence (leading-zero count, normalising shift, sticky-bit or,
; cvt, ldexp) and the full f32 division expansion (div_scale, rcp, fma
; refinement, div_fmas, div_fixup) with numerator -9.0 (0xc1100000).
define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; VI-NEXT: s_mov_b32 s6, 0xc1100000
; VI-NEXT: v_ffbh_u32_e32 v2, v1
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
; GFX11-NEXT: s_setpc_b64 s[30:31]
; No wrap flags on this shift, unlike the nuw variants of this pattern.
%shl = shl i64 8, %cnt
%conv = uitofp i64 %shl to float
%mul = fdiv float -9.000000e+00, %conv
ret float %mul
}
; Scalar: -9.0 / sitofp(8 << %cnt), i64 -> float. The shl has no wrap flags
; and the conversion is signed, so the shifted value may be interpreted as a
; negative integer (presumably what "neg_int" stands for).
; The expected output keeps the 64-bit shift, a signed i64 -> f32 conversion
; sequence (xor/ashr of the halves, signed leading-bit count, normalising
; shift, sticky-bit or, v_cvt_f32_i32, ldexp) and the full f32 division
; expansion with numerator -9.0 (0xc1100000).
define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; VI-NEXT: s_mov_b32 s6, 0xc1100000
; VI-NEXT: v_xor_b32_e32 v2, v0, v1
; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; VI-NEXT: v_ffbh_i32_e32 v3, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
; VI-NEXT: v_min_u32_e32 v2, v3, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX11-NEXT: v_cls_i32_e32 v3, v1
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 8, %cnt
; Signed conversion, unlike the uitofp used by the "maybe_z" variant.
%conv = sitofp i64 %shl to float
%mul = fdiv float -9.000000e+00, %conv
ret float %mul
}
; Scalar: -0.5 / sitofp(8 << (%cnt_in & 31)), i64 -> float. Masking the count
; with 31 bounds the shift, so 8 << cnt is at most 2^34 and always a positive
; power of two.
; The expected output has no conversion and no division: the masked count is
; shifted left by 23 (into the float exponent field) and subtracted as an
; integer from 0xbd800000, the bit pattern of -0.0625 (= -0.5 / 8).
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 31, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Bound the shift count to [0, 31] before shifting.
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
%conv = sitofp i64 %shl to float
%mul = fdiv float -0.500000e+00, %conv
ret float %mul
}
; Scalar: 0xH7000 / uitofp(1 << %cnt) with an i32 shift and a half result.
; 0xH7000 is half 8192.0 (the f32 form 0x46000000 appears in the expected
; output).
; The expected output keeps the 32-bit shift, the conversion (u32 -> f32 ->
; f16) and an f16 division expanded through f32 arithmetic (rcp, mad/fma
; refinement, mask with 0xff800000, add, cvt back, v_div_fixup_f16).
; The "fail_out_of_bounds" name presumably means the fold is expected to be
; rejected because 1 << cnt for an i32 count can exceed what the half exponent
; can represent; confirm against the combine.
define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x46000000
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v2, v1
; VI-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
; VI-NEXT: v_mad_f32 v4, -v1, v3, s4
; VI-NEXT: v_mac_f32_e32 v3, v4, v2
; VI-NEXT: v_mad_f32 v1, -v1, v3, s4
; VI-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: s_movk_i32 s4, 0x7000
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 0x46000000
; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 0x46000000
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT: s_mov_b32 s0, 0x46000000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i32 1, %cnt
%conv = uitofp i32 %shl to half
%mul = fdiv half 0xH7000, %conv
ret half %mul
}
; Divides the half constant 0xH7000 (8192.0) by uitofp(1 << %cnt), %cnt being
; an i16. The expected code on every run line contains no floating-point
; instruction: %cnt is shifted left by 10 (the width of the half mantissa
; field) and subtracted from the 0x7000 bit pattern with a 16-bit integer
; subtract.
define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX10-NEXT: v_sub_nc_u16 v0, 0x7000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u16 v0, 0x7000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i16 1, %cnt
  %divisor = uitofp i16 %pow2 to half
  %quot = fdiv half 0xH7000, %divisor
  ret half %quot
}
; Same shape as the 0xH7000 case but with the smaller dividend 0xH4800 (8.0),
; again divided by uitofp(1 << %cnt) for an i16 %cnt. The expected code is
; still the integer form: shift %cnt left by 10, then a 16-bit subtract from
; the 0x4800 bit pattern.
define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT: v_sub_u16_e32 v0, 0x4800, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX10-NEXT: v_sub_nc_u16 v0, 0x4800, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u16 v0, 0x4800, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i16 1, %cnt
  %divisor = uitofp i16 %pow2 to half
  %quot = fdiv half 0xH4800, %divisor
  ret half %quot
}
; Negative counterpart of the in-bounds half tests: the dividend is 0xH4000
; (2.0) and the divisor is uitofp(1 << %cnt) for an i16 %cnt. The expected
; code keeps a real division: the shifted value is converted with
; v_cvt_f16_u16, refined through an f32 reciprocal sequence and finished with
; v_div_fixup_f16.
; NOTE(review): presumably the dividend's exponent is too small for the
; shift-and-subtract rewrite to be accepted for every possible shift amount;
; confirm against the DAG combine that performs the fold.
define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v2, v1
; VI-NEXT: v_add_f32_e32 v3, v2, v2
; VI-NEXT: v_mad_f32 v4, -v1, v3, 2.0
; VI-NEXT: v_mac_f32_e32 v3, v4, v2
; VI-NEXT: v_mad_f32 v1, -v1, v3, 2.0
; VI-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_add_f32_e32 v3, v2, v2
; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 2.0
; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 2.0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX11-NEXT: s_mov_b32 s0, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_add_f32_e32 v2, v1, v1
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i16 1, %cnt
  %divisor = uitofp i16 %pow2 to half
  %quot = fdiv half 0xH4000, %divisor
  ret half %quot
}
; Divides the double constant 0x36A0000000000000 (2^-149) by
; uitofp(1 << %cnt) where %cnt is an i32. The expected code performs the
; division on the 64-bit bit pattern: %cnt is shifted left by 20 so that it
; lines up with the exponent field inside the high dword, a carry-chained
; subtract takes it away from 0x36a00000 (the low half of that subtract is
; 0 - 0), and the low result dword is returned as 0.
define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000
; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i32 1, %cnt
  %divisor = uitofp i32 %pow2 to double
  %quot = fdiv double 0x36A0000000000000, %divisor
  ret double %quot
}
; Negative float case: the dividend 0x3a1fffff00000000 (float bit pattern
; 0x10fffff8 in the expected code, a value just below 2^-93) is divided by
; uitofp(1 << %cnt) for an i32 %cnt. The expected code keeps the full f32
; division expansion (v_div_scale_f32, v_rcp_f32, the fma refinement steps,
; v_div_fmas_f32 and v_div_fixup_f32) and contains no integer subtract.
; NOTE(review): presumably the dividend sits just under the smallest exponent
; for which the integer rewrite is accepted, since the neighbouring "okay"
; test uses the next power of two up and is folded; confirm against the DAG
; combine.
define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_mov_b32 s6, 0x10fffff8
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x10fffff8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i32 1, %cnt
  %divisor = uitofp i32 %pow2 to float
  %quot = fdiv float 0x3a1fffff00000000, %divisor
  ret float %quot
}
; Positive float case: the dividend 0x3a20000000000000 (float bit pattern
; 0x11000000 in the expected code, i.e. 2^-93) is divided by
; uitofp(1 << %cnt) for an i32 %cnt. The expected code is purely integer:
; %cnt is shifted left by 23 (the width of the float mantissa field) and
; subtracted from 0x11000000.
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x11000000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %pow2 = shl nuw i32 1, %cnt
  %divisor = uitofp i32 %pow2 to float
  %quot = fdiv float 0x3a20000000000000, %divisor
  ret float %quot
}