; llvm/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s

; Aggregate wrapping an array of 32 x i32 (32 * 4 = 128 bytes).
%struct.S = type { [32 x i32] }

; undef-initialized instance of %struct.S placed in addrspace(3), 4-byte aligned.
; NOTE(review): not referenced by any function visible in this part of the
; file; presumably used by tests further down - confirm before changing it.
@shared = addrspace(3) global %struct.S undef, align 4

; Copies a constant 47 bytes from %src to %dest, both addrspace(0) pointers,
; through llvm.memcpy.p0.p0.i64 (last argument i1 false = not volatile).
; No alignment is specified on either pointer.
;
; The autogenerated assertions below expect the copy to be expanded inline
; with no call emitted: 47 flat_load_ubyte / flat_store_byte pairs, issued in
; three batches covering offsets 0-14, 15-30 and 31-46, followed by s_endpgm.
;
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; going by the function name it presumably carries minsize - confirm.
define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
  ret void
}

; Copies a constant 47 bytes from %src to %dest, both addrspace(1) pointers,
; through llvm.memcpy.p1.p1.i64 (last argument i1 false = not volatile).
;
; The autogenerated assertions below expect an inline expansion using wide
; accesses: two global_load/store_dwordx4 at offsets 0 and 16, plus two
; global_load/store_dwordx2 at offsets 32 and 39. The last two ranges overlap
; by one byte (32-39 and 39-46) so that exactly 47 bytes are covered.
;
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; going by the function name it presumably carries minsize - confirm.
define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
; CHECK-LABEL: memcpy_p1_p1_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v12, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
  ret void
}

; Copies a constant 128 bytes from the addrspace(4) pointer %0 to the
; addrspace(1) pointer %global through llvm.memcpy.p1.p4.i64 (last argument
; i1 false = not volatile).
;
; The autogenerated assertions below expect an inline expansion into eight
; global_load_dwordx4 followed by eight global_store_dwordx4, at 16-byte
; strides from offset 0 through offset 112 (8 * 16 = 128 bytes).
;
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; going by the function name it presumably carries minsize - confirm.
define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p1_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v32, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Copies a constant 128 bytes from the addrspace(4) pointer %0 to the
; addrspace(5) pointer %local through llvm.memcpy.p5.p4.i64 (last argument
; i1 false = not volatile).
;
; Unlike the 128-byte addrspace(1) destination case, the autogenerated
; assertions below expect a byte-wise inline expansion: one global_load_ubyte
; and one buffer_store_byte (offen, through s[16:19]) per byte for offsets
; 0-127, with loads and stores interleaved by the scheduler and separated by
; s_waitcnt vmcnt(N) / s_nop 0 as needed. No call is emitted.
;
; NOTE(review): s[16:19] is copied from s[0:3] and then offset by s13 before
; being used as the buffer resource for every store; presumably this is the
; scratch descriptor setup - confirm against the AMDGPU backend docs.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file; going by the function name it presumably carries minsize - confirm.
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_add_u32 s16, s16, s13
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
; CHECK-NEXT:    s_waitcnt vmcnt(33)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
; CHECK-NEXT:    s_waitcnt vmcnt(31)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
; CHECK-NEXT:    s_waitcnt vmcnt(34)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Copies a constant 128 bytes from an addrspace(5) pointer (%src) to a generic
; addrspace(0) pointer (%generic) through a non-volatile llvm.memcpy (the
; trailing i1 false is the isvolatile flag).
; The autogenerated assertions below expect the copy to be expanded inline as
; straight-line code: one buffer_load_ubyte and one flat_store_byte for every
; byte offset 0-127, ending in s_endpgm with no call and no branch.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
; NOTE(review): attribute group #0 is defined outside this chunk; the function
; name suggests it carries minsize -- confirm against the attribute list.
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
; CHECK-NEXT:    s_add_u32 s16, s16, s13
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
  ret void
}

; Copies a constant 128 bytes from an addrspace(4) pointer (%0) into the
; addrspace(3) global @shared through a non-volatile llvm.memcpy (the trailing
; i1 false is the isvolatile flag).
; The autogenerated assertions below expect wide accesses rather than per-byte
; ones: eight global_load_dwordx4 at byte offsets 0-112 and eight
; ds_write2_b64 whose offset0/offset1 pairs run from 0 to 15, each store
; guarded by an s_waitcnt vmcnt on the load that feeds it.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
; NOTE(review): attribute group #0 is defined outside this chunk; the function
; name suggests it carries minsize -- confirm against the attribute list.
define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p3_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Copies a constant 128 bytes from the addrspace(3) global @shared to a generic
; addrspace(0) pointer (%generic) through a non-volatile llvm.memcpy (the
; trailing i1 false is the isvolatile flag).
; The autogenerated assertions below expect the copy to be expanded inline as
; straight-line code: one ds_read_u8 and one flat_store_byte for every byte
; offset 0-127, issued in groups separated by s_waitcnt lgkmcnt(0), ending in
; s_endpgm with no call and no branch.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
; NOTE(review): attribute group #0 is defined outside this chunk; the function
; name suggests it carries minsize -- confirm against the attribute list.
define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-LABEL: memcpy_p0_p3_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
; CHECK-NEXT:    ds_read_u8 v3, v2
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
; CHECK-NEXT:    flat_store_byte v[0:1], v3
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
  ret void
}

; Test: 47-byte non-volatile memcpy between two generic (addrspace 0, "flat")
; pointers, in a kernel carrying attribute group #1.
; NOTE(review): attribute group #1 is defined outside this excerpt; going by
; the function name it is presumably optsize -- confirm against the attribute
; list at the end of the file.
;
; Neither pointer argument carries an alignment attribute, and the expected
; gfx908 output below copies one byte at a time with flat_load_ubyte /
; flat_store_byte. The copy is expanded inline (no call is expected) in three
; load-then-store batches:
;   * offsets 0..14   (15 bytes)
;   * offsets 15..30  (16 bytes)
;   * offsets 31..46  (16 bytes)
; Each batch of loads is followed by a full "s_waitcnt vmcnt(0) lgkmcnt(0)"
; before its stores are issued. The very last load reuses v0, the low half of
; the source address pair v[0:1].
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT:    s_endpgm
entry:
  ; Length is the constant 47; the trailing "i1 false" marks the copy as
  ; non-volatile.
  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
  ret void
}

; Test: 47-byte non-volatile memcpy from one global (addrspace 1) pointer to
; another, in a kernel carrying attribute group #1.
; NOTE(review): attribute group #1 is defined outside this excerpt; going by
; the function name it is presumably optsize -- confirm against the attribute
; list at the end of the file.
;
; The expected gfx908 output expands the copy inline with wide global
; accesses rather than per-byte ones:
;   * dwordx4 at offset 0   (bytes 0..15)
;   * dwordx4 at offset 16  (bytes 16..31)
;   * dwordx2 at offset 32  (bytes 32..39)
;   * dwordx2 at offset 39  (bytes 39..46)
; The last access deliberately overlaps the previous one by one byte so that
; the odd 47-byte length is covered (39 + 8 = 47) without narrower accesses.
; All four loads are issued first; every store is preceded by
; "s_waitcnt vmcnt(3)".
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
; CHECK-LABEL: memcpy_p1_p1_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v12, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_endpgm
entry:
  ; Length is the constant 47; the trailing "i1 false" marks the copy as
  ; non-volatile.
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
  ret void
}

; Test: 128-byte non-volatile memcpy from a constant (addrspace 4) pointer to
; a global (addrspace 1) pointer, in a kernel carrying attribute group #1.
; NOTE(review): attribute group #1 is defined outside this excerpt; going by
; the function name it is presumably optsize -- confirm against the attribute
; list at the end of the file.
;
; The expected gfx908 output expands the copy inline as eight
; global_load_dwordx4 / global_store_dwordx4 pairs at offsets 0, 16, ..., 112
; (8 x 16 = 128 bytes). The source is read with vector global loads into
; v[0:31]; all eight loads are issued before the first store, and every store
; is preceded by "s_waitcnt vmcnt(7)".
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p1_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v32, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  ; Length is the constant 128; the trailing "i1 false" marks the copy as
  ; non-volatile.
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Test: 128-byte non-volatile memcpy from a constant (addrspace 4) pointer to
; a private (addrspace 5) pointer, in a kernel carrying attribute group #1.
; NOTE(review): attribute group #1 is defined outside this excerpt; going by
; the function name it is presumably optsize -- confirm against the attribute
; list at the end of the file.
;
; The expected gfx908 output expands the copy inline and fully unrolled, one
; byte at a time: a global_load_ubyte for every source offset 0..127 and a
; "buffer_store_byte ... offen" for every destination offset 0..127 (each
; offset is stored exactly once). The stores address memory through the
; s[16:19] descriptor, which the prologue copies from s[0:3] and then adjusts
; by adding s13 to its low 64 bits.
; NOTE(review): s[16:19] is presumably the scratch buffer resource with the
; per-wave scratch offset folded in -- confirm against the AMDGPU ABI docs.
;
; Loads and stores are heavily interleaved by the scheduler, so the exact
; instruction order, the register reuse, the s_nop padding and the individual
; "s_waitcnt vmcnt(N)" values are all part of what is being asserted.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_add_u32 s16, s16, s13
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
; CHECK-NEXT:    s_waitcnt vmcnt(33)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
; CHECK-NEXT:    s_waitcnt vmcnt(31)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
; CHECK-NEXT:    s_waitcnt vmcnt(34)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
; CHECK-NEXT:    s_endpgm
entry:
  ; Length is the constant 128; the trailing "i1 false" marks the copy as
  ; non-volatile.
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Kernel that copies 128 bytes from an addrspace(5) pointer (%src) to a generic
; pointer (%generic) through the llvm.memcpy.p0.p5.i64 intrinsic. The function
; uses attribute group #1 (defined outside this section; presumably optsize,
; going by the function name -- confirm against the attribute list).
; The expected output below contains no call: the copy is expanded inline into
; byte-sized buffer loads from %src and byte-sized flat stores to %generic,
; followed by s_endpgm. Neither pointer argument carries an alignment attribute.
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
; CHECK-NEXT:    s_add_u32 s16, s16, s13
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT:    s_endpgm
entry:
  ; Constant length of 128 bytes; the final i1 false marks the copy as non-volatile.
  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
  ret void
}

; Kernel that copies 128 bytes from an addrspace(4) pointer (%0) into the
; addrspace(3) global @shared through the llvm.memcpy.p3.p4.i64 intrinsic.
; @shared has type %struct.S ({ [32 x i32] }, i.e. 128 bytes, align 4), so the
; copy covers the whole object. The function uses attribute group #1 (defined
; outside this section; presumably optsize, going by the function name --
; confirm against the attribute list).
; The expected output below contains no call: the copy is expanded inline into
; eight 16-byte global_load_dwordx4 loads and eight ds_write2_b64 stores, with
; s_waitcnt vmcnt(N) before each store, followed by s_endpgm.
define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p3_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT:    s_endpgm
entry:
  ; Constant length of 128 bytes; the final i1 false marks the copy as non-volatile.
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-LABEL: memcpy_p0_p3_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
; CHECK-NEXT:    ds_read_u8 v3, v2
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
; CHECK-NEXT:    flat_store_byte v[0:1], v3
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
  ret void
}

; Declarations of the llvm.memcpy intrinsic overloads, one per
; (destination, source) address-space pair. In the mangled suffix, pN denotes
; a pointer in address space N (destination first, then source) and i64 is the
; type of the length operand. All overloads share attribute group #2.
; The trailing `i1 immarg` operand is the intrinsic's volatile flag (see the
; LLVM LangRef entry for llvm.memcpy); it must be an immediate constant.

; addrspace(0) destination, addrspace(0) source.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(0) destination, addrspace(5) source.
declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(1) destination, addrspace(1) source.
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(1) destination, addrspace(4) source.
declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(5) destination, addrspace(4) source.
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(3) destination, addrspace(4) source.
declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

; addrspace(0) destination, addrspace(3) source; addrspace(3) is the address
; space in which the @shared global of this file is defined.
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2

; Attribute groups referenced by number (#N) from definitions and declarations
; in this file.

; #0: optimize for minimum code size; attached to @memcpy_p0_p0_minsize
; (and presumably the other *_minsize kernels -- verify against their
; definitions).
attributes #0 = { minsize }
; #1: optimize for size; presumably attached to the *_optsize kernels --
; verify against their definitions.
attributes #1 = { optsize }
; #2: attribute set carried by every llvm.memcpy declaration in this file;
; memory(argmem: readwrite) restricts memory effects to what is reachable
; through the pointer arguments.
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }