; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
; External callees used by the outgoing-call tests below. Each half-based type
; has a "_user" function that consumes the value as an argument and a "_result"
; function that produces it as a return value. All carry attribute group #0.
; NOTE(review): @v4f16_user and @v8f16_user are not called anywhere in the
; visible part of this file - confirm whether outgoing-argument tests for the
; wider vectors are intended.
declare void @f16_user(half) #0
declare half @f16_result() #0
declare void @v2f16_user(<2 x half>) #0
declare <2 x half> @v2f16_result() #0
declare void @v4f16_user(<4 x half>) #0
declare <4 x half> @v4f16_result() #0
declare void @v8f16_user(<8 x half>) #0
declare <8 x half> @v8f16_result() #0
; Incoming scalar half argument in a strictfp function. The IR widens %arg to
; float with a constrained fpext (fpexcept.strict) and stores it through %ptr.
; The expected code treats the incoming v0 as an f32: it narrows it with
; v_cvt_f16_f32, masks the low 16 bits, then widens again with v_cvt_f32_f16
; before the dword store to v[1:2].
define void @f16_arg(half %arg, ptr %ptr) #0 {
; GFX7-LABEL: f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: flat_store_dword v[1:2], v0
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
store float %fpext, ptr %ptr
ret void
}
; Incoming <2 x half> argument in a strictfp function, widened to <2 x float>
; with a constrained fpext and stored through %ptr.
; The expected code handles each element separately: v0 and v1 are narrowed to
; f16, masked to 16 bits and widened back to f32. The two results are written
; with separate dword stores at %ptr + 4 and %ptr (pointer in v[2:3]).
define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v2f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v5
; GFX7-NEXT: flat_store_dword v[2:3], v4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
store <2 x float> %fpext, ptr %ptr
ret void
}
; Incoming <3 x half> argument (odd element count) in a strictfp function,
; widened to <3 x float> with a constrained fpext and stored through %ptr.
; The expected code narrows v0..v2 to f16, masks each to 16 bits, widens each
; back to f32, and writes three dword stores at %ptr + 8, %ptr + 4 and %ptr
; (pointer in v[3:4]).
define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v3f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v6
; GFX7-NEXT: flat_store_dword v[3:4], v5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
store <3 x float> %fpext, ptr %ptr
ret void
}
; Incoming <4 x half> argument in a strictfp function, widened to <4 x float>
; with a constrained fpext and stored through %ptr.
; The expected code narrows v0..v3 to f16, masks each to 16 bits, widens each
; back to f32, and writes four dword stores at %ptr + 12, + 8, + 4 and + 0
; (pointer in v[4:5]).
define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v4f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v7
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v3
; GFX7-NEXT: flat_store_dword v[4:5], v6
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
store <4 x float> %fpext, ptr %ptr
ret void
}
; Scalar half return value in a strictfp function. The IR narrows the float
; argument with a constrained fptrunc (round.tonearest, fpexcept.strict) and
; returns the half.
; The expected code narrows v0 with v_cvt_f16_f32, masks the low 16 bits, and
; widens it again with v_cvt_f32_f16, so v0 holds an f32 at the return.
define half @f16_return(float %arg) #0 {
; GFX7-LABEL: f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret half %fptrunc
}
; <2 x half> return value in a strictfp function, produced by a constrained
; fptrunc of a <2 x float> argument.
; The expected code narrows v0 and v1 to f16, masks each to 16 bits, and widens
; each back to f32 in place, so the result is returned as two f32 registers.
define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
; GFX7-LABEL: v2f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x half> %fptrunc
}
; <3 x half> return value (odd element count) in a strictfp function, produced
; by a constrained fptrunc of a <3 x float> argument.
; The expected code narrows v0..v2 to f16, masks each to 16 bits, and widens
; each back to f32 in place before returning.
define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
; GFX7-LABEL: v3f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <3 x half> %fptrunc
}
; <4 x half> return value in a strictfp function, produced by a constrained
; fptrunc of a <4 x float> argument.
; The expected code narrows v0..v3 to f16, masks each to 16 bits, and widens
; each back to f32 in place before returning.
define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
; GFX7-LABEL: v4f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <4 x half> %fptrunc
}
; Outgoing scalar half argument: loads a half from %ptr and passes it to the
; external @f16_user from a strictfp caller.
; The expected code loads 16 bits with flat_load_ushort and widens the value to
; f32 with v_cvt_f32_f16 in v0 before the s_swappc_b64 call. Lanes 0..2 of v40
; hold s30, s31 and the previous s33 across the call; v40 itself is spilled to
; and reloaded from the stack slot at s33.
define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: flat_load_ushort v0, v[0:1]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = load half, ptr %ptr
call void @f16_user(half %val)
ret void
}
; Outgoing <2 x half> argument: loads the vector from %ptr and passes it to the
; external @v2f16_user from a strictfp caller.
; The expected code loads both halves with one flat_load_dword. The low half is
; widened directly into v0; the high half is extracted with a 16-bit right
; shift and widened into v1, so the callee receives two f32 registers.
define void @outgoing_v2f16_arg(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v2f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = load <2 x half>, ptr %ptr
call void @v2f16_user(<2 x half> %val)
ret void
}
; Half return value of a call: calls the external @f16_result from a strictfp
; caller and stores the returned half through %ptr.
; The expected code copies the pointer into v[40:41] before the call (v40 and
; v41 are spilled and reloaded around it). After the call, v0 is narrowed to
; f16, masked to 16 bits, widened to f32, then narrowed to f16 once more before
; the flat_store_short. Lanes 0..2 of v42 hold s30, s31 and the previous s33.
define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: flat_store_short v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = call half @f16_result()
store half %val, ptr %ptr
ret void
}
; <2 x half> return value of a call: calls the external @v2f16_result from a
; strictfp caller and stores the returned vector through %ptr.
; After the call, the expected code runs v0 and v1 through the same
; f16-narrow / mask / f32-widen / f16-narrow sequence. It then packs the two
; halves into one dword (v1 shifted left by 16, OR-ed with v0) and writes it
; with a single flat_store_dword to the pointer kept in v[40:41].
define void @outgoing_v2f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v2f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: flat_store_dword v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x half> @v2f16_result()
store <2 x half> %val, ptr %ptr
ret void
}
; <4 x half> return value of a call: calls the external @v4f16_result from a
; strictfp caller and stores the returned vector through %ptr.
; After the call, the expected code runs v0..v3 through the f16-narrow / mask /
; f32-widen / f16-narrow sequence. It packs the pairs (v0,v1) and (v2,v3) into
; two dwords and stores them at %ptr and %ptr + 4, using the pointer kept in
; v[40:41].
define void @outgoing_v4f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v4f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_or_b32_e32 v4, v0, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: flat_store_dword v[40:41], v4
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = call <4 x half> @v4f16_result()
store <4 x half> %val, ptr %ptr
ret void
}
; <8 x half> return value of a call: calls the external @v8f16_result from a
; strictfp caller and stores the returned vector through %ptr.
; After the call, the expected code runs all eight returned registers (v0..v7)
; through the f16-narrow / mask / f32-widen / f16-narrow sequence. It packs
; adjacent pairs into four dwords and stores them at %ptr + 12, + 8, + 4 and
; + 0, using the pointer kept in v[40:41].
define void @outgoing_v8f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v8f16_return:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_or_b32_e32 v8, v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: v_or_b32_e32 v3, v4, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v40
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v40
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v5
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: flat_store_dword v[40:41], v8
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%val = call <8 x half> @v8f16_result()
store <8 x half> %val, ptr %ptr
ret void
}
; Call result consumed in a different basic block: @v8f16_result is called in
; %bb0, and element 0 of the returned <8 x half> is extracted and returned from
; %bb1.
; The expected code is a single machine block (the branch to %bb1 does not
; appear). All eight returned registers v0..v7 are narrowed to f16, masked and
; widened back to f32, even though only element 0 is returned; v0 goes through
; that sequence twice.
; NOTE(review): the conversions on v1..v7 look redundant for the returned
; value - confirm whether this is the intended expectation or a known
; codegen inefficiency being pinned.
define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-LABEL: call_split_type_used_outside_block_v8f16:
; GFX7: ; %bb.0: ; %bb0
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s16, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
bb0:
%split.ret.type = call <8 x half> @v8f16_result()
br label %bb1
bb1:
%extract = extractelement <8 x half> %split.ret.type, i32 0
ret half %extract
}
; Constrained floating-point intrinsics used by the tests above.
; The fpext variants take the source value plus an exception-behavior metadata
; operand; the fptrunc variants additionally take a rounding-mode metadata
; operand (the tests pass "round.tonearest" and "fpexcept.strict").
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
; Attribute group shared by every function and declaration in this file.
attributes #0 = { strictfp }