llvm/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s

; External callees for the outgoing_* and call_split_* tests. Each *_user
; takes a single half-typed (scalar or vector) argument and returns void; each
; *_result takes no arguments and returns a half-typed value. All of them carry
; attribute group #0.
; Note: v4f16_user and v8f16_user are declared but not called by any function
; visible in this file.
declare void @f16_user(half) #0
declare half @f16_result() #0

declare void @v2f16_user(<2 x half>) #0
declare <2 x half> @v2f16_result() #0

declare void @v4f16_user(<4 x half>) #0
declare <4 x half> @v4f16_result() #0

declare void @v8f16_user(<8 x half>) #0
declare <8 x half> @v8f16_result() #0

; Incoming scalar half argument: widen %arg to float with the constrained
; fpext intrinsic (fpexcept.strict) and store the float through %ptr.
; The GFX7 checks expect v0 to be rounded with v_cvt_f16_f32, masked to its low
; 16 bits, and only then widened with v_cvt_f32_f16 before the single
; flat_store_dword.
; NOTE(review): the file name suggests half arguments are passed promoted to
; f32 on this target, which would explain the leading v_cvt_f16_f32 - confirm.
define void @f16_arg(half %arg, ptr %ptr) #0 {
; GFX7-LABEL: f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    flat_store_dword v[1:2], v0
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
  store float %fpext, ptr %ptr
  ret void
}

; Incoming <2 x half> argument: widen both elements to float with the
; constrained vector fpext intrinsic (fpexcept.strict) and store the
; <2 x float> through %ptr.
; The GFX7 checks expect each of v0 and v1 to go through v_cvt_f16_f32, a
; 0xffff mask and v_cvt_f32_f16, followed by two dword stores: one at %ptr + 4
; and one at %ptr.
define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v2f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v5
; GFX7-NEXT:    flat_store_dword v[2:3], v4
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
  store <2 x float> %fpext, ptr %ptr
  ret void
}

; Incoming <3 x half> argument (odd element count): widen all three elements
; to float with the constrained vector fpext intrinsic (fpexcept.strict) and
; store the <3 x float> through %ptr.
; The GFX7 checks expect v0, v1 and v2 each to go through v_cvt_f16_f32, a
; 0xffff mask and v_cvt_f32_f16, followed by three dword stores at %ptr + 8,
; %ptr + 4 and %ptr.
define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v3f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v3
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v3
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v6
; GFX7-NEXT:    flat_store_dword v[3:4], v5
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
  store <3 x float> %fpext, ptr %ptr
  ret void
}

; Incoming <4 x half> argument: widen all four elements to float with the
; constrained vector fpext intrinsic (fpexcept.strict) and store the
; <4 x float> through %ptr.
; The GFX7 checks expect the values arriving in v0..v3 each to go through
; v_cvt_f16_f32, a 0xffff mask and v_cvt_f32_f16, followed by four dword stores
; at %ptr + 12, %ptr + 8, %ptr + 4 and %ptr.
define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
; GFX7-LABEL: v4f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v3
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v4
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v4
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v7
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v3
; GFX7-NEXT:    flat_store_dword v[4:5], v6
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
  store <4 x float> %fpext, ptr %ptr
  ret void
}

; Scalar half return value: narrow float %arg to half with the constrained
; fptrunc intrinsic (round.tonearest, fpexcept.strict) and return it.
; The GFX7 checks expect v_cvt_f16_f32 on v0, a 0xffff mask, and then
; v_cvt_f32_f16 back into v0 before returning.
; NOTE(review): the trailing widen presumably reflects the half return value
; being handed back promoted to f32, as the file name suggests - confirm.
define half @f16_return(float %arg) #0 {
; GFX7-LABEL: f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %fptrunc
}

; <2 x half> return value: narrow <2 x float> %arg with the constrained vector
; fptrunc intrinsic (round.tonearest, fpexcept.strict) and return it.
; The GFX7 checks expect v0 and v1 each to be narrowed with v_cvt_f16_f32,
; masked with 0xffff, and widened again with v_cvt_f32_f16 in place.
define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
; GFX7-LABEL: v2f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %fptrunc
}

; <3 x half> return value (odd element count): narrow <3 x float> %arg with the
; constrained vector fptrunc intrinsic (round.tonearest, fpexcept.strict) and
; return it.
; The GFX7 checks expect v0, v1 and v2 each to be narrowed with v_cvt_f16_f32,
; masked with 0xffff, and widened again with v_cvt_f32_f16 in place.
define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
; GFX7-LABEL: v3f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %fptrunc
}

; <4 x half> return value: narrow <4 x float> %arg with the constrained vector
; fptrunc intrinsic (round.tonearest, fpexcept.strict) and return it.
; The GFX7 checks expect v0..v3 each to be narrowed with v_cvt_f16_f32, masked
; with 0xffff, and widened again with v_cvt_f32_f16 in place.
define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
; GFX7-LABEL: v4f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %fptrunc
}

; Outgoing scalar half argument: load a half from %ptr and pass it to
; @f16_user.
; The GFX7 checks expect the value to be loaded with flat_load_ushort and
; widened with v_cvt_f32_f16 into v0 immediately before the s_swappc_b64 call.
; The surrounding instructions are call-frame bookkeeping: the old s33 and the
; return address s[30:31] are kept in lanes of v40 (itself spilled/reloaded),
; and s32 is advanced by 0x400 for the call and moved back afterwards.
define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    flat_load_ushort v0, v[0:1]
; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, f16_user@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, f16_user@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = load half, ptr %ptr
  call void @f16_user(half %val)
  ret void
}

; Outgoing <2 x half> argument: load a <2 x half> from %ptr and pass it to
; @v2f16_user.
; The GFX7 checks expect one flat_load_dword of the packed pair; the low half
; is widened with v_cvt_f32_f16 into v0, and the high half (shifted right by
; 16) is widened into v1, before the s_swappc_b64 call. The remaining
; instructions are the same call-frame bookkeeping through v40 lanes, s33 and
; s32 as in the scalar case.
define void @outgoing_v2f16_arg(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v2f16_arg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    flat_load_dword v1, v[0:1]
; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, v2f16_user@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, v2f16_user@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x half>, ptr %ptr
  call void @v2f16_user(<2 x half> %val)
  ret void
}

; Scalar half result of a call: call @f16_result and store the returned half
; through %ptr.
; The GFX7 checks expect %ptr (v0/v1 on entry) to be copied into v40/v41 so it
; is still available after the call; v40/v41 are spilled and reloaded, and v42
; holds the saved s33 and return address in its lanes.
; After the call, v0 is narrowed with v_cvt_f16_f32, masked with 0xffff,
; widened with v_cvt_f32_f16, and narrowed once more before flat_store_short.
define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, f16_result@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, f16_result@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
; GFX7-NEXT:    v_mov_b32_e32 v41, v1
; GFX7-NEXT:    v_mov_b32_e32 v40, v0
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    flat_store_short v[40:41], v0
; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @f16_result()
  store half %val, ptr %ptr
  ret void
}

; <2 x half> result of a call: call @v2f16_result and store the returned
; vector through %ptr.
; The GFX7 checks expect %ptr to be kept in v40/v41 across the call. After the
; call, v0 and v1 each go through v_cvt_f16_f32, a 0xffff mask, v_cvt_f32_f16,
; and a second v_cvt_f16_f32; the two halves are then packed into one dword
; (v1 shifted left by 16, OR-ed with v0) and written with one flat_store_dword.
define void @outgoing_v2f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v2f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, v2f16_result@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, v2f16_result@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
; GFX7-NEXT:    v_mov_b32_e32 v41, v1
; GFX7-NEXT:    v_mov_b32_e32 v40, v0
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-NEXT:    flat_store_dword v[40:41], v0
; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @v2f16_result()
  store <2 x half> %val, ptr %ptr
  ret void
}

; <4 x half> result of a call: call @v4f16_result and store the returned
; vector through %ptr.
; The GFX7 checks expect %ptr to be kept in v40/v41 across the call. After the
; call, v0..v3 each go through v_cvt_f16_f32, a 0xffff mask, v_cvt_f32_f16, and
; a second v_cvt_f16_f32. The halves are packed pairwise (v1:v0 and v3:v2)
; into two dwords, stored at %ptr + 4 and %ptr.
define void @outgoing_v4f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v4f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, v4f16_result@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, v4f16_result@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
; GFX7-NEXT:    v_mov_b32_e32 v41, v1
; GFX7-NEXT:    v_mov_b32_e32 v40, v0
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v4, v0, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v40
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    flat_store_dword v[40:41], v4
; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @v4f16_result()
  store <4 x half> %val, ptr %ptr
  ret void
}

; <8 x half> result of a call: call @v8f16_result and store the returned
; vector through %ptr.
; The GFX7 checks expect %ptr to be kept in v40/v41 across the call. After the
; call, the eight returned values in v0..v7 each go through v_cvt_f16_f32, a
; 0xffff mask, v_cvt_f32_f16, and a second v_cvt_f16_f32. The halves are packed
; pairwise into four dwords, stored at %ptr + 12, %ptr + 8, %ptr + 4 and %ptr.
define void @outgoing_v8f16_return(ptr %ptr) #0 {
; GFX7-LABEL: outgoing_v8f16_return:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, v8f16_result@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
; GFX7-NEXT:    v_mov_b32_e32 v41, v1
; GFX7-NEXT:    v_mov_b32_e32 v40, v0
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v8, v0, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v4
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v7
; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v6
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v40
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v40
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v5
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v40
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    flat_store_dword v[40:41], v8
; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %val = call <8 x half> @v8f16_result()
  store <8 x half> %val, ptr %ptr
  ret void
}

; Call result used in a different basic block from the call: bb0 calls
; @v8f16_result, bb1 extracts element 0 of the returned <8 x half> and returns
; it as a half.
; The GFX7 checks expect a single machine block (%bb.0, annotated with the IR
; name bb0). Although only element 0 is returned, all eight returned registers
; v0..v7 are expected to go through v_cvt_f16_f32, a 0xffff mask and
; v_cvt_f32_f16; v0 goes through that narrow/mask/widen sequence twice.
define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-LABEL: call_split_type_used_outside_block_v8f16:
; GFX7:       ; %bb.0: ; %bb0
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s16, s33
; GFX7-NEXT:    s_mov_b32 s33, s32
; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
; GFX7-NEXT:    s_mov_b32 s17, v8f16_result@abs32@hi
; GFX7-NEXT:    s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT:    s_addk_i32 s32, 0x400
; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
; GFX7-NEXT:    s_mov_b32 s33, s4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
bb0:
  %split.ret.type = call <8 x half> @v8f16_result()
  br label %bb1

bb1:
  %extract = extractelement <8 x half> %split.ret.type, i32 0
  ret half %extract
}

; Constrained half -> float extension intrinsics used by the *_arg tests. The
; single metadata operand is the exception-behavior string (the tests pass
; "fpexcept.strict").
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0

; Constrained float -> half truncation intrinsics used by the *_return tests.
; The two metadata operands are the rounding-mode and exception-behavior
; strings (the tests pass "round.tonearest" and "fpexcept.strict").
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0

; Attribute group applied to every function and declaration in this file.
attributes #0 = { strictfp }