llvm/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s

declare hidden void @external_void_func_i8_inreg(i8 inreg) #0
declare hidden void @external_void_func_i16_inreg(i32 inreg) #0
declare hidden void @external_void_func_i32_inreg(i32 inreg) #0
declare hidden void @external_void_func_i64_inreg(i64 inreg) #0
declare hidden void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0
declare hidden void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0
declare hidden void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0
declare hidden void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0
declare hidden void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0
declare hidden void @external_void_func_f16_inreg(half inreg) #0
declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0
declare hidden void @external_void_func_f32_inreg(float inreg) #0
declare hidden void @external_void_func_f64_inreg(double inreg) #0
declare hidden void @external_void_func_v2f16_inreg(<2 x half> inreg) #0
declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
declare hidden void @external_void_func_v3f16_inreg(<3 x half> inreg) #0
declare hidden void @external_void_func_v4f16_inreg(<4 x half> inreg) #0

declare hidden void @external_void_func_p0_inreg(ptr inreg) #0
declare hidden void @external_void_func_p1_inreg(ptr addrspace(1) inreg) #0
declare hidden void @external_void_func_p3_inreg(ptr addrspace(3) inreg) #0
declare hidden void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg) #0
declare hidden void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg) #0

declare hidden void @external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg, i32 inreg, i64 inreg) #0

declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
declare hidden void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([15 x i32] inreg, i32 inreg) #1

define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_i8_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i8_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i8_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_i8_inreg(i8 inreg %arg)
  ret void
}

define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_i16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_i16_inreg(i16 inreg %arg)
  ret void
}

define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_i32_inreg(i32 inreg %arg)
  ret void
}

define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_i64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_i64_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_i64_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_i64_inreg(i64 inreg %arg)
  ret void
}

define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s17, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s17, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[18:19]
; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s16, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s16
; GFX11-NEXT:    v_writelane_b32 v40, s3, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[16:17]
; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v3i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v3i32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v3i32_inreg(<3 x i32> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s18, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s17
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[18:19]
; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s16, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s17, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s17
; GFX11-NEXT:    v_writelane_b32 v40, s16, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[16:17]
; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v4i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v4i32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v4i32_inreg(<4 x i32> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s22, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[24:25], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[24:25]
; GFX9-NEXT:    v_writelane_b32 v40, s22, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s17
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    s_mov_b32 s16, s18
; GFX9-NEXT:    s_mov_b32 s17, s19
; GFX9-NEXT:    s_mov_b32 s18, s20
; GFX9-NEXT:    s_mov_b32 s19, s21
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[22:23]
; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_v8i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_v8i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s18, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s19, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s19
; GFX11-NEXT:    v_writelane_b32 v40, s18, 2
; GFX11-NEXT:    s_mov_b32 s19, s17
; GFX11-NEXT:    s_mov_b32 s18, s16
; GFX11-NEXT:    s_mov_b32 s17, s7
; GFX11-NEXT:    s_mov_b32 s16, s6
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[20:21]
; GFX11-NEXT:    s_add_u32 s20, s20, external_void_func_v8i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s21, s21, external_void_func_v8i32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v8i32_inreg(<8 x i32> inreg %arg)
  ret void
}

; FIXME:
; define void @test_call_external_void_func_v16i32_inreg(<16 x i32> inreg %arg) #0 {
;   call void @external_void_func_v16i32_inreg(<16 x i32> inreg %arg)
;   ret void
; }

define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_f16_inreg(half inreg %arg)
  ret void
}

define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_bf16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_bf16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_bf16_inreg(bfloat inreg %arg)
  ret void
}

define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_f32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_f32_inreg(float inreg %arg)
  ret void
}

define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_f64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_f64_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_f64_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_f64_inreg(double inreg %arg)
  ret void
}

define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2f16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2f16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
  ret void
}


define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2bf16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2bf16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v3f16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v3f16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v4f16_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v4f16_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4f16_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4f16_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
  ret void
}

define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_p0_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p0_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p0_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_p0_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p0_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p0_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_p0_inreg(ptr inreg %arg)
  ret void
}

define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_p1_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p1_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p1_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_p1_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p1_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p1_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
  ret void
}

define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_p3_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_p3_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_p3_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_p3_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s1, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s2
; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_p3_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_p3_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
  ret void
}

define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s18, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s17
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[18:19]
; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p1_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p1_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s16, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s17, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s17
; GFX11-NEXT:    v_writelane_b32 v40, s16, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[16:17]
; GFX11-NEXT:    s_add_u32 s16, s16, external_void_func_v2p1_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s17, s17, external_void_func_v2p1_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
  ret void
}

define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s16, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
; GFX9-NEXT:    v_writelane_b32 v40, s16, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[16:17]
; GFX9-NEXT:    s_add_u32 s16, s16, external_void_func_v2p5_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s17, s17, external_void_func_v2p5_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s2, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s3
; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[2:3]
; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2p5_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2p5_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
  ret void
}

define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s19, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s17
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    s_mov_b32 s16, s18
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[20:21]
; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s16, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s17, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s17
; GFX11-NEXT:    v_writelane_b32 v40, s16, 2
; GFX11-NEXT:    ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
; GFX11-NEXT:    s_mov_b32 s16, s6
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[18:19]
; GFX11-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2)
  ret void
}

define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s29, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 vcc, -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, vcc
; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s17
; GFX9-NEXT:    s_mov_b32 s2, s16
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    s_mov_b32 s16, s18
; GFX9-NEXT:    s_mov_b32 s17, s19
; GFX9-NEXT:    s_mov_b32 s18, s20
; GFX9-NEXT:    s_mov_b32 s19, s21
; GFX9-NEXT:    s_mov_b32 s20, s22
; GFX9-NEXT:    s_mov_b32 s21, s23
; GFX9-NEXT:    s_mov_b32 s22, s24
; GFX9-NEXT:    s_mov_b32 s23, s25
; GFX9-NEXT:    s_mov_b32 s24, s26
; GFX9-NEXT:    s_mov_b32 s25, s27
; GFX9-NEXT:    s_mov_b32 s26, s28
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 vcc
; GFX9-NEXT:    s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], vcc
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s25, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s26, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s26
; GFX11-NEXT:    v_writelane_b32 v40, s25, 2
; GFX11-NEXT:    s_mov_b32 s26, s24
; GFX11-NEXT:    s_mov_b32 s25, s23
; GFX11-NEXT:    s_mov_b32 s24, s22
; GFX11-NEXT:    s_mov_b32 s23, s21
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s22, s20
; GFX11-NEXT:    s_mov_b32 s21, s19
; GFX11-NEXT:    s_mov_b32 s20, s18
; GFX11-NEXT:    s_mov_b32 s19, s17
; GFX11-NEXT:    s_mov_b32 s18, s16
; GFX11-NEXT:    s_mov_b32 s17, s7
; GFX11-NEXT:    s_mov_b32 s16, s6
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_getpc_b64 s[28:29]
; GFX11-NEXT:    s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[28:29]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
  ret void
}

; FIXME:
; define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
;   call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
;   ret void
; }

; FIXME:
; define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
;   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
;   ret void
; }


; FIXME: This should also fail
define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #1 {
; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s23, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[24:25], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[24:25]
; GFX9-NEXT:    v_writelane_b32 v40, s23, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s3, s7
; GFX9-NEXT:    s_mov_b32 s2, s6
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, s8
; GFX9-NEXT:    s_mov_b32 s5, s9
; GFX9-NEXT:    s_mov_b32 s6, s10
; GFX9-NEXT:    s_mov_b32 s7, s11
; GFX9-NEXT:    s_mov_b32 s8, s15
; GFX9-NEXT:    s_mov_b32 s9, s16
; GFX9-NEXT:    s_mov_b32 s10, s17
; GFX9-NEXT:    s_mov_b32 s11, s18
; GFX9-NEXT:    s_mov_b32 s15, s19
; GFX9-NEXT:    s_mov_b32 s16, s20
; GFX9-NEXT:    s_mov_b32 s17, s21
; GFX9-NEXT:    s_mov_b32 s18, s22
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_getpc_b64 s[24:25]
; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s19, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s20, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s20
; GFX11-NEXT:    v_writelane_b32 v40, s19, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_getpc_b64 s[20:21]
; GFX11-NEXT:    s_add_u32 s20, s20, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
; GFX11-NEXT:    s_addc_u32 s21, s21, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_a15i32_inreg_i32_inreg__noimplicit([15 x i32] inreg %arg0, i32 inreg %arg1)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}