; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
; half8: copies eight consecutive half values (16 bytes) from %0 to %1 using
; eight scalar loads followed by eight scalar stores, each marked align 2.
; The assertions below expect the backend to merge them into one
; global_load_dwordx4 and one global_store_dwordx4 on all three tested
; targets, with s[0:1] as the load base and s[2:3] as the store base.
; The leading s_load_dwordx4 presumably fetches the two pointer arguments.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half8:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX908-NEXT: v_mov_b32_e32 v4, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: half8:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half8:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v4, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 plus 0..7 half elements (2 bytes each).
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
%gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
%gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
%gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
%gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
%gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
%gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
%gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
; All loads are issued before any store; each is only 2-byte aligned.
%l0 = load half, ptr addrspace(1) %gep0, align 2
%l1 = load half, ptr addrspace(1) %gep1, align 2
%l2 = load half, ptr addrspace(1) %gep2, align 2
%l3 = load half, ptr addrspace(1) %gep3, align 2
%l4 = load half, ptr addrspace(1) %gep4, align 2
%l5 = load half, ptr addrspace(1) %gep5, align 2
%l6 = load half, ptr addrspace(1) %gep6, align 2
%l7 = load half, ptr addrspace(1) %gep7, align 2
; Destination element addresses: %1 plus 0..7 half elements.
%sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
%sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
%sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
%sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
%sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
%sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
%sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
%sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
; Element i of the source is written to element i of the destination.
store half %l0, ptr addrspace(1) %sgep0, align 2
store half %l1, ptr addrspace(1) %sgep1, align 2
store half %l2, ptr addrspace(1) %sgep2, align 2
store half %l3, ptr addrspace(1) %sgep3, align 2
store half %l4, ptr addrspace(1) %sgep4, align 2
store half %l5, ptr addrspace(1) %sgep5, align 2
store half %l6, ptr addrspace(1) %sgep6, align 2
store half %l7, ptr addrspace(1) %sgep7, align 2
ret void
}
; half6: copies six consecutive half values (12 bytes) from %0 to %1 using
; six scalar loads followed by six scalar stores, each marked align 1
; (i.e. below the 2-byte size of a half).
; The assertions below expect the accesses to be merged into one
; global_load_dwordx3 and one global_store_dwordx3 on all three tested
; targets despite the byte alignment.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half6:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: half6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half6:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 plus 0..5 half elements (2 bytes each).
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
%gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
%gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
%gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
%gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
%gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
; All loads are issued before any store; each promises only byte alignment.
%l0 = load half, ptr addrspace(1) %gep0, align 1
%l1 = load half, ptr addrspace(1) %gep1, align 1
%l2 = load half, ptr addrspace(1) %gep2, align 1
%l3 = load half, ptr addrspace(1) %gep3, align 1
%l4 = load half, ptr addrspace(1) %gep4, align 1
%l5 = load half, ptr addrspace(1) %gep5, align 1
; Destination element addresses: %1 plus 0..5 half elements.
%sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
%sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
%sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
%sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
%sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
%sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
; Element i of the source is written to element i of the destination.
store half %l0, ptr addrspace(1) %sgep0, align 1
store half %l1, ptr addrspace(1) %sgep1, align 1
store half %l2, ptr addrspace(1) %sgep2, align 1
store half %l3, ptr addrspace(1) %sgep3, align 1
store half %l4, ptr addrspace(1) %sgep4, align 1
store half %l5, ptr addrspace(1) %sgep5, align 1
ret void
}
; half4: copies four consecutive half values (8 bytes) from %0 to %1 using
; four scalar loads followed by four scalar stores, each marked align 4.
; Unlike the sibling kernels in this file, the assertions below expect the
; merged load to be a scalar s_load_dwordx2 rather than a global_load; the
; loaded SGPR pair is then copied into v[0:1] and written with a single
; global_store_dwordx2. The copy is two v_mov_b32 on GFX908 and GFX1030 and
; one v_pk_mov_b32 on GFX90A; GFX1030 also reuses s[0:1] as the load result.
; NOTE(review): every access carries align 4, including the elements at byte
; offsets 2 and 6, which cannot be 4-byte aligned at the same time as the
; base pointer. Presumably this is deliberate, to give the merged access a
; 4-byte alignment -- confirm before changing the alignments.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half4:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: half4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half4:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 plus 0..3 half elements (2 bytes each).
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
%gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
%gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
%gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
; All loads are issued before any store; each claims 4-byte alignment.
%l0 = load half, ptr addrspace(1) %gep0, align 4
%l1 = load half, ptr addrspace(1) %gep1, align 4
%l2 = load half, ptr addrspace(1) %gep2, align 4
%l3 = load half, ptr addrspace(1) %gep3, align 4
; Destination element addresses: %1 plus 0..3 half elements.
%sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
%sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
%sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
%sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
; Element i of the source is written to element i of the destination.
store half %l0, ptr addrspace(1) %sgep0, align 4
store half %l1, ptr addrspace(1) %sgep1, align 4
store half %l2, ptr addrspace(1) %sgep2, align 4
store half %l3, ptr addrspace(1) %sgep3, align 4
ret void
}
; half2: copies two consecutive half values (4 bytes) from %0 to %1 using
; two scalar loads followed by two scalar stores. No access has an explicit
; align, so each uses the ABI alignment of half for the module's data layout
; (presumably 2 bytes here -- the data layout is not visible in this file).
; The assertions below expect one global_load_dword and one
; global_store_dword on all three tested targets.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half2:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dword v1, v0, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: half2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half2:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[0:1]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 plus 0..1 half elements (2 bytes each).
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
%gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
; Both loads are issued before any store; alignment is left implicit.
%l0 = load half, ptr addrspace(1) %gep0
%l1 = load half, ptr addrspace(1) %gep1
; Destination element addresses: %1 plus 0..1 half elements.
%sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
%sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
; Element i of the source is written to element i of the destination.
store half %l0, ptr addrspace(1) %sgep0
store half %l1, ptr addrspace(1) %sgep1
ret void
}