llvm/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s

; Testing codegen for memcpy with vector operands for all combinations of the following parameters:
;     destination address space: 0, 1, 3, 5
;     source address space: 0, 1, 3, 4, 5
;     alignment: 1, 2, 8, 16
;     sizes: 16, 31, 32


; Copies 16 bytes between two addrspace(0) pointers that are only 1-byte
; aligned. The generated assertions below expect sixteen flat single-byte
; loads from %src (address in v[2:3]) followed by sixteen flat single-byte
; stores to %dst (address in v[0:1]), covering offsets 0-15.
define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes between two addrspace(0) pointers that are only 1-byte
; aligned. The generated assertions below expect two load/store batches of
; flat single-byte accesses: fifteen bytes at offsets 0-14 first, then
; sixteen bytes at offsets 15-30.
define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xe
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes between two addrspace(0) pointers that are only 1-byte
; aligned. The generated assertions below expect two load/store batches of
; sixteen flat single-byte accesses each: offsets 0-15 first, then
; offsets 16-31.
define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v19
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes between two addrspace(0) pointers that are 2-byte aligned.
; The generated assertions below expect eight flat 16-bit loads from %src
; (address in v[2:3]) followed by eight flat 16-bit stores to %dst (address
; in v[0:1]), at even offsets 0-14.
define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes between two addrspace(0) pointers that are 2-byte aligned.
; The generated assertions below expect a single batch of sixteen flat loads
; and sixteen flat stores: one single-byte access for the odd tail byte at
; offset 30 plus fifteen 16-bit accesses at even offsets 0-28.
define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes between two addrspace(0) pointers that are 2-byte aligned.
; The generated assertions below expect sixteen flat 16-bit loads from %src
; (address in v[2:3]) followed by sixteen flat 16-bit stores to %dst
; (address in v[0:1]), at even offsets 0-30.
define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes between two addrspace(0) pointers that are 8-byte aligned.
; The generated assertions below expect the whole copy to be done with one
; flat dwordx4 load from %src (address in v[2:3]) and one flat dwordx4 store
; to %dst (address in v[0:1]).
define void @memcpy_p0_p0_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes between two addrspace(0) pointers that are 8-byte aligned.
; The generated assertions below expect one flat dwordx4 load/store at
; offset 0 plus sixteen flat single-byte loads/stores covering offsets 15-30.
; Note that offset 15 is touched by both the wide access and a byte access.
; NOTE(review): presumably the 15-byte tail is lowered as an overlapping
; 16-byte chunk that is then split into bytes because it is unaligned --
; confirm against the memcpy lowering code.
define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes between two addrspace(0) pointers that are 8-byte aligned.
; The generated assertions below expect two flat dwordx4 loads from %src
; (offsets 16 and 0) followed by two flat dwordx4 stores to %dst at the same
; offsets.
define void @memcpy_p0_p0_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes between two addrspace(0) pointers that are 16-byte aligned.
; The generated assertions below expect the whole copy to be done with one
; flat dwordx4 load from %src (address in v[2:3]) and one flat dwordx4 store
; to %dst (address in v[0:1]) -- the same sequence as the 8-byte-aligned
; variant of this test.
define void @memcpy_p0_p0_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes between two addrspace(0) pointers that are 16-byte aligned.
; The generated assertions below expect one flat dwordx4 load/store at
; offset 0 plus sixteen flat single-byte loads/stores covering offsets 15-30.
; Note that offset 15 is touched by both the wide access and a byte access.
; NOTE(review): presumably the 15-byte tail is lowered as an overlapping
; 16-byte chunk that is then split into bytes because it is unaligned --
; confirm against the memcpy lowering code.
define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes between two addrspace(0) pointers that are 16-byte aligned.
; The generated assertions below expect two flat dwordx4 loads from %src
; (offsets 16 and 0) followed by two flat dwordx4 stores to %dst at the same
; offsets -- the same sequence as the 8-byte-aligned variant of this test.
define void @memcpy_p0_p0_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p0_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; memcpy of 16 bytes from addrspace(1) to addrspace(0); both pointers are only
; 1-byte aligned. The expected gfx1030 output copies byte by byte: sixteen
; global_load_ubyte in a single clause (offsets 15 down to 0), then sixteen
; flat_store_byte, each gated by an s_waitcnt vmcnt(N) counting down from 15
; to 0 so that a store only issues after its own load has returned.
define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; memcpy of 31 bytes from addrspace(1) to addrspace(0); both pointers are only
; 1-byte aligned. The expected gfx1030 output copies byte by byte in two
; load/store batches that reuse registers v4..v18:
;   1. fifteen global_load_ubyte ("s_clause 0xe", offsets 14..0) and their
;      flat_store_byte counterparts;
;   2. sixteen global_load_ubyte ("s_clause 0xf", offsets 30..15) and their
;      flat_store_byte counterparts.
; Within each batch every store is gated by an s_waitcnt vmcnt(N) counting
; down to 0.
define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xe
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:13
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:11
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:9
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:7
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:5
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:3
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:1
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; memcpy of 32 bytes from addrspace(1) to addrspace(0); both pointers are only
; 1-byte aligned. The expected gfx1030 output copies byte by byte in two
; batches of sixteen: global_load_ubyte for offsets 15..0 followed by their
; flat_store_byte counterparts, then the same pattern for offsets 31..16.
; Each batch is issued under "s_clause 0xf" and every store is gated by an
; s_waitcnt vmcnt(N) counting down from 15 to 0.
define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; memcpy of 16 bytes from addrspace(1) to addrspace(0); both pointers are
; 2-byte aligned. The expected gfx1030 output copies in 16-bit pieces: eight
; global_load_ushort under "s_clause 0x7" (offsets 14 down to 0 in steps of 2)
; followed by eight flat_store_short, each gated by an s_waitcnt vmcnt(N)
; counting down from 7 to 0.
define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; memcpy of 31 bytes from addrspace(1) to addrspace(0); both pointers are
; 2-byte aligned. The expected gfx1030 output issues sixteen loads under
; "s_clause 0xf": one global_load_ubyte for the odd trailing byte at offset 30
; and fifteen global_load_ushort for offsets 28 down to 0. The stores mirror
; the loads (one flat_store_byte, fifteen flat_store_short), each gated by an
; s_waitcnt vmcnt(N) counting down from 15 to 0.
define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; memcpy of 32 bytes from addrspace(1) to addrspace(0); both pointers are
; 2-byte aligned. The expected gfx1030 output copies in 16-bit pieces: sixteen
; global_load_ushort under "s_clause 0xf" (offsets 30 down to 0 in steps of 2)
; followed by sixteen flat_store_short, each gated by an s_waitcnt vmcnt(N)
; counting down from 15 to 0.
define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; memcpy of 16 bytes from addrspace(1) to addrspace(0); both pointers are
; 8-byte aligned. The expected gfx1030 output is a single global_load_dwordx4
; followed by a single flat_store_dwordx4.
define void @memcpy_p0_p1_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; memcpy of 31 bytes from addrspace(1) to addrspace(0); both pointers are
; 8-byte aligned. The expected gfx1030 output loads two overlapping 16-byte
; chunks with global_load_dwordx4 (offsets 0 and 15). The first chunk is
; written back with one flat_store_dwordx4. The second chunk is written back
; as sixteen single-byte stores covering offsets 15..30: bytes 0 and 2 of each
; dword via flat_store_byte / flat_store_byte_d16_hi, and bytes 1 and 3 via
; v_lshrrev_b32 (by 8 / 24) followed by flat_store_byte. Byte 15 is therefore
; written by both chunks.
; NOTE(review): the tail is presumably stored bytewise because offset 15 is
; misaligned for a wide flat store - confirm against the lowering code.
define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; memcpy of 32 bytes from addrspace(1) to addrspace(0); both pointers are
; 8-byte aligned. The expected gfx1030 output is two global_load_dwordx4 under
; "s_clause 0x1" (offsets 16 and 0), each followed by the matching
; flat_store_dwordx4 once its load has completed.
define void @memcpy_p0_p1_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; memcpy of 16 bytes from addrspace(1) to addrspace(0); both pointers are
; 16-byte aligned. The expected gfx1030 output is a single global_load_dwordx4
; followed by a single flat_store_dwordx4.
define void @memcpy_p0_p1_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; memcpy of 31 bytes from addrspace(1) to addrspace(0); both pointers are
; 16-byte aligned. The expected gfx1030 output loads two overlapping 16-byte
; chunks with global_load_dwordx4 (offsets 0 and 15). The first chunk is
; written back with one flat_store_dwordx4. The second chunk is written back
; as sixteen single-byte stores covering offsets 15..30: bytes 0 and 2 of each
; dword via flat_store_byte / flat_store_byte_d16_hi, and bytes 1 and 3 via
; v_lshrrev_b32 (by 8 / 24) followed by flat_store_byte. Byte 15 is therefore
; written by both chunks.
; NOTE(review): the tail is presumably stored bytewise because offset 15 is
; misaligned for a wide flat store - confirm against the lowering code.
define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; memcpy of 32 bytes from addrspace(1) to addrspace(0); both pointers are
; 16-byte aligned. The expected gfx1030 output is two global_load_dwordx4
; under "s_clause 0x1" (offsets 16 and 0), each followed by the matching
; flat_store_dwordx4 once its load has completed.
define void @memcpy_p0_p1_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p1_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; memcpy of 16 bytes from addrspace(3) to addrspace(0); both pointers are only
; 1-byte aligned. The expected gfx1030 output copies byte by byte: sixteen
; ds_read_u8 (offsets 15 down to 0, 32-bit source address in v2), then sixteen
; flat_store_byte. Every store is preceded by "s_waitcnt lgkmcnt(15)".
; NOTE(review): the wait value presumably stays at 15 because each flat store
; issued in between also increments lgkmcnt - confirm against the waitcnt
; insertion pass.
define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
; CHECK-NEXT:    ds_read_u8 v2, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(0), both operands align 1.
; The expected output copies one byte at a time: 31 ds_read_u8 loads, each
; matched by a flat_store_byte to the same offset. The assertions below are
; autogenerated (see the NOTE at the top of the file); regenerate, do not hand-edit.
define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:14
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:13
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:12
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:11
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:10
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:9
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:8
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:6
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:5
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:4
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:3
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:2
; CHECK-NEXT:    ds_read_u8 v16, v2 offset:1
; CHECK-NEXT:    ds_read_u8 v17, v2
; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v17
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:29
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:28
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:26
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:25
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:24
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:23
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:22
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:21
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:20
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:19
; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(0), both operands align 1.
; The expected output copies one byte at a time: 32 ds_read_u8 loads, each
; matched by a flat_store_byte to the same offset.
define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
; CHECK-NEXT:    ds_read_u8 v18, v2
; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:31
; CHECK-NEXT:    ds_read_u8 v4, v2 offset:30
; CHECK-NEXT:    ds_read_u8 v5, v2 offset:29
; CHECK-NEXT:    ds_read_u8 v6, v2 offset:28
; CHECK-NEXT:    ds_read_u8 v7, v2 offset:27
; CHECK-NEXT:    ds_read_u8 v8, v2 offset:26
; CHECK-NEXT:    ds_read_u8 v9, v2 offset:25
; CHECK-NEXT:    ds_read_u8 v10, v2 offset:24
; CHECK-NEXT:    ds_read_u8 v11, v2 offset:23
; CHECK-NEXT:    ds_read_u8 v12, v2 offset:22
; CHECK-NEXT:    ds_read_u8 v13, v2 offset:21
; CHECK-NEXT:    ds_read_u8 v14, v2 offset:20
; CHECK-NEXT:    ds_read_u8 v15, v2 offset:19
; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(0), both operands align 2.
; The expected output copies in 2-byte units: eight ds_read_u16 loads paired
; with eight flat_store_short stores.
define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u16 v3, v2 offset:14
; CHECK-NEXT:    ds_read_u16 v4, v2 offset:12
; CHECK-NEXT:    ds_read_u16 v5, v2 offset:10
; CHECK-NEXT:    ds_read_u16 v6, v2 offset:8
; CHECK-NEXT:    ds_read_u16 v7, v2 offset:6
; CHECK-NEXT:    ds_read_u16 v8, v2 offset:4
; CHECK-NEXT:    ds_read_u16 v9, v2 offset:2
; CHECK-NEXT:    ds_read_u16 v2, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(0), both operands align 2.
; The expected output copies offsets 0..29 in 2-byte units (fifteen
; ds_read_u16 / flat_store_short pairs) and the odd trailing byte at offset 30
; with a single ds_read_u8 / flat_store_byte pair.
define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
; CHECK-NEXT:    ds_read_u16 v2, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(0), both operands align 2.
; The expected output copies in 2-byte units: sixteen ds_read_u16 loads paired
; with sixteen flat_store_short stores.
define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u16 v3, v2 offset:30
; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
; CHECK-NEXT:    ds_read_u16 v2, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(0), both operands align 8.
; The expected output is a single ds_read2_b64 load feeding a single
; flat_store_dwordx4 store.
define void @memcpy_p0_p3_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(0), both operands align 8.
; In the expected output the first 16 bytes use ds_read2_b64 plus one
; flat_store_dwordx4. The tail is a 16-byte ds_read_b128 at offset 15 (so it
; overlaps byte 15 of the first chunk) that is written back as sixteen
; single-byte stores to offsets 15..30, using flat_store_byte,
; flat_store_byte_d16_hi, and v_lshrrev_b32 to extract the individual bytes.
define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(0), both operands align 8.
; The expected output is two ds_read2_b64 loads (upper 16 bytes first), each
; feeding one flat_store_dwordx4 store.
define void @memcpy_p0_p3_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(0), both operands align 16.
; The expected output is a single ds_read_b128 load feeding a single
; flat_store_dwordx4 store.
define void @memcpy_p0_p3_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(0), both operands align 16.
; In the expected output the first 16 bytes use one ds_read_b128 plus one
; flat_store_dwordx4. The tail is a second ds_read_b128 at offset 15 (so it
; overlaps byte 15 of the first chunk) that is written back as sixteen
; single-byte stores to offsets 15..30, using flat_store_byte,
; flat_store_byte_d16_hi, and v_lshrrev_b32 to extract the individual bytes.
define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(0), both operands align 16.
; The expected output is two ds_read_b128 loads (upper 16 bytes first), each
; feeding one flat_store_dwordx4 store.
define void @memcpy_p0_p3_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p3_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:16
; CHECK-NEXT:    ds_read_b128 v[7:10], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(0), both operands align 1.
; The expected output issues sixteen global_load_ubyte loads under a single
; s_clause, then sixteen flat_store_byte stores, each preceded by an
; s_waitcnt whose vmcnt counts down from 15 to 0.
define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(0), both operands align 1.
; The expected output has two phases:
;   - offsets 0..14: fifteen serialized global_load_ubyte / s_waitcnt vmcnt(0) /
;     flat_store_byte triples that all reuse v4;
;   - offsets 15..30: sixteen global_load_ubyte loads under one s_clause,
;     followed by sixteen flat_store_byte stores with vmcnt counting down 15..0.
define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 1.
; The expected lowering is byte-wise and split into two batches: sixteen
; global_load_ubyte (offsets 15..0) followed by their flat_store_byte, then
; sixteen more loads (offsets 31..16) followed by their stores. Each store is
; preceded by an s_waitcnt vmcnt(N) that counts down as the loads return.
define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v19
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
; The expected lowering uses eight 16-bit copies at offsets 0..14. Here each
; global_load_ushort is waited on (vmcnt(0)) and stored with flat_store_short
; before the next load is issued; there is no s_clause batching of the loads
; in this function's expected output.
define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:2
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:4
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:6
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:8
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:10
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
; The expected lowering issues one batch of sixteen loads: a single
; global_load_ubyte for the odd trailing byte at offset 30, plus fifteen
; global_load_ushort covering offsets 28..0. The matching flat_store_byte /
; flat_store_short follow, each gated by a decreasing s_waitcnt vmcnt(N).
define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
; The expected lowering issues one batch of sixteen global_load_ushort
; (offsets 30..0), then the matching flat_store_short instructions, each
; gated by a decreasing s_waitcnt vmcnt(N).
define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
; The expected lowering is a single 128-bit copy: one global_load_dwordx4
; followed by one flat_store_dwordx4.
define void @memcpy_p0_p4_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
; Expected lowering: bytes 0..15 are copied with one global_load_dwordx4 /
; flat_store_dwordx4 pair. The remaining bytes are fetched with a second
; global_load_dwordx4 at offset 15 (so it overlaps byte 15 of the first chunk)
; and written back one byte at a time to offsets 15..30, using
; flat_store_byte, flat_store_byte_d16_hi and 8/24-bit right shifts to
; extract each byte from the loaded dwords.
; NOTE(review): the byte-wise store is presumably due to the tail store
; starting at the odd offset 15 - confirm against the memcpy lowering code.
define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
; The expected lowering is two 128-bit copies (offsets 0 and 16), each a
; global_load_dwordx4 that is waited on and then written with
; flat_store_dwordx4.
define void @memcpy_p0_p4_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
; The expected lowering is a single 128-bit copy: one global_load_dwordx4
; followed by one flat_store_dwordx4.
define void @memcpy_p0_p4_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
; Expected lowering: bytes 0..15 are copied with one global_load_dwordx4 /
; flat_store_dwordx4 pair. The remaining bytes are fetched with a second
; global_load_dwordx4 at offset 15 (so it overlaps byte 15 of the first chunk)
; and written back one byte at a time to offsets 15..30, using
; flat_store_byte, flat_store_byte_d16_hi and 8/24-bit right shifts to
; extract each byte from the loaded dwords.
; NOTE(review): the byte-wise store is presumably due to the tail store
; starting at the odd offset 15 - confirm against the memcpy lowering code.
define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
; The expected lowering is two 128-bit copies (offsets 0 and 16), each a
; global_load_dwordx4 that is waited on and then written with
; flat_store_dwordx4.
define void @memcpy_p0_p4_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p4_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
; The source is read with buffer_load_ubyte ... offen through s[0:3]; the
; destination is written with flat_store_byte. The expected lowering issues
; one batch of sixteen byte loads (offsets 15..0), then the sixteen stores,
; each gated by a decreasing s_waitcnt vmcnt(N).
define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
; The source is read with buffer_load_ubyte ... offen through s[0:3]; the
; destination is written with flat_store_byte. Expected shape:
;   1. a first batch of eighteen byte loads (offsets 14..0, then 15, 16, 17);
;   2. stores for offsets 14..0;
;   3. a second batch of thirteen byte loads (offsets 30..18);
;   4. stores for offsets 17 and 16, then 30..18, and finally offset 15.
; Together the stores cover every byte offset 0..30 exactly once.
define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x11
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v17
; CHECK-NEXT:    s_clause 0xc
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
; The source is read with buffer_load_ubyte ... offen through s[0:3]; the
; destination is written with flat_store_byte. Expected shape:
;   1. a first batch of eighteen byte loads (offsets 15..0, then 16, 17);
;   2. stores for offsets 15..0;
;   3. a second batch of fourteen byte loads (offsets 31..18);
;   4. stores for offset 17, then 31..18, and finally offset 16.
; Together the stores cover every byte offset 0..31 exactly once.
define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x11
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v18
; CHECK-NEXT:    s_clause 0xd
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 2-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a single clause of eight buffer_load_ushort instructions
; (offsets 14 down to 0), followed by eight flat_store_short instructions, each
; preceded by an s_waitcnt whose vmcnt counts down from 7 to 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 2-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a single clause of sixteen loads - one buffer_load_ubyte for
; the odd trailing byte at offset 30 plus fifteen buffer_load_ushort for offsets
; 28 down to 0 - followed by the matching flat_store_byte / flat_store_short
; instructions, each preceded by an s_waitcnt whose vmcnt counts down from 15
; to 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 2-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a single clause of sixteen buffer_load_ushort instructions
; (offsets 30 down to 0), followed by sixteen flat_store_short instructions,
; each preceded by an s_waitcnt whose vmcnt counts down from 15 to 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_short v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 8-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of four buffer_load_dword instructions (offsets 0,
; 4, 8, 12) filling v3..v6, one s_waitcnt vmcnt(0), and a single
; flat_store_dwordx4 of v[3:6].
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 8-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of twenty loads - sixteen buffer_load_ubyte for
; offsets 15..30 and four buffer_load_dword for offsets 0, 4, 8, 12. The bytes
; at offsets 15..30 are written back with individual flat_store_byte
; instructions, and the four dwords are written last with one
; flat_store_dwordx4 after s_waitcnt vmcnt(0).
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 8-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of eight buffer_load_dword instructions (offsets 0
; through 28) filling v3..v10, then two flat_store_dwordx4 instructions: v[3:6]
; at offset 0 once vmcnt reaches 4, and v[7:10] at offset 16 once it reaches 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 16-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of four buffer_load_dword instructions (offsets 0,
; 4, 8, 12) filling v3..v6, one s_waitcnt vmcnt(0), and a single
; flat_store_dwordx4 of v[3:6].
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 16-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of twenty loads - sixteen buffer_load_ubyte for
; offsets 15..30 and four buffer_load_dword for offsets 0, 4, 8, 12. The bytes
; at offsets 15..30 are written back with individual flat_store_byte
; instructions, and the four dwords are written last with one
; flat_store_dwordx4 after s_waitcnt vmcnt(0).
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(5) pointer to an addrspace(0) pointer, both
; 16-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of eight buffer_load_dword instructions (offsets 0
; through 28) filling v3..v10, then two flat_store_dwordx4 instructions: v[3:6]
; at offset 0 once vmcnt reaches 4, and v[7:10] at offset 16 once it reaches 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p0_p5_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p0_p5_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(0) pointer to an addrspace(1) pointer, both
; 1-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of sixteen flat_load_ubyte instructions. The bytes
; are merged pairwise with v_lshl_or_b32 (shift by 8), those halves are merged
; again (shift by 16) into the four dwords v2..v5, and the result is written
; with a single global_store_dwordx4. Each s_waitcnt between the merges names
; both vmcnt and lgkmcnt.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3]
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 8, v13
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v10, v18, 8, v2
; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(0) pointer to an addrspace(1) pointer, both
; 1-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of thirty-one flat_load_ubyte instructions whose
; results are merged with v_lshl_or_b32 (shift by 8, then by 16) into the
; dwords v2..v9. Three stores follow: global_store_dwordx2 v[6:7] at offset 23,
; global_store_dwordx2 v[8:9] at offset 16, and global_store_dwordx4 v[2:5] at
; offset 0. The byte loaded from offset 23 (v15) feeds both dwordx2 values.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v32, v[2:3]
; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v9
; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
; CHECK-NEXT:    v_lshl_or_b32 v5, v7, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v5
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v22
; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v14, v23, 8, v24
; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v17, v27, 8, v28
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v30
; CHECK-NEXT:    v_lshl_or_b32 v4, v14, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v32
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v2, v33, 8, v2
; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v16
; CHECK-NEXT:    v_lshl_or_b32 v2, v2, 16, v18
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(0) pointer to an addrspace(1) pointer, both
; 1-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of thirty-two flat_load_ubyte instructions whose
; results are merged with v_lshl_or_b32 (shift by 8, then by 16) into the
; dwords v2..v9, followed by two global_store_dwordx4 instructions: v[2:5] at
; offset 16 and v[6:9] at offset 0.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:31
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:17
; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:1
; CHECK-NEXT:    flat_load_ubyte v33, v[2:3]
; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:3
; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v10, v22, 8, v23
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v15, v24, 8, v25
; CHECK-NEXT:    v_lshl_or_b32 v9, v12, 16, v14
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v27
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v17, v28, 8, v29
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v33
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v19, v34, 8, v2
; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(0) pointer to an addrspace(1) pointer, both
; 2-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of eight flat_load_ushort instructions, four
; v_lshl_or_b32 merges (shift by 16) that pair the halfwords into the dwords
; v2..v5, and a single global_store_dwordx4. Each s_waitcnt between the merges
; names both vmcnt and lgkmcnt.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v5, v[2:3]
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v5
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v7
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(0) pointer to an addrspace(1) pointer, both
; 2-byte aligned, through a non-volatile (i1 false) llvm.memcpy.
; Expected output: a clause of twenty loads - twelve flat_load_ushort for the
; even offsets 0..22 and eight flat_load_ubyte for offsets 23..30. The values
; are merged with v_lshl_or_b32 into the dwords v2..v9 and written with three
; stores: global_store_dwordx2 v[6:7] at offset 16, global_store_dwordx4
; v[2:5] at offset 0, and global_store_dwordx2 v[8:9] at offset 23.
; The assertion lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ushort v19, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ushort v20, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v21, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v22, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v23, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v5
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v13, v15, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v4, v19, 16, v20
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v3, v21, 16, v22
; CHECK-NEXT:    v_lshl_or_b32 v9, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v2, v23, 16, v2
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(0) source to an addrspace(1) destination, both 2-byte aligned.
; The assertions expect sixteen flat_load_ushort reads whose halves are packed pairwise with
; v_lshl_or_b32 and then written with two global_store_dwordx4 (offsets 16 and 0).
define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:14
; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:12
; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:10
; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:8
; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 16, v9
; CHECK-NEXT:    v_lshl_or_b32 v9, v7, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v3, v12, 16, v13
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v11
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v2, v14, 16, v15
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v7, v16, 16, v17
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v6, v18, 16, v19
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(0) source to an addrspace(1) destination, both 8-byte aligned.
; The assertions expect a single flat_load_dwordx4 followed by a single global_store_dwordx4.
define void @memcpy_p1_p0_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(0) source to an addrspace(1) destination, both 8-byte aligned.
; The assertions expect bytes 0-15 to be read with one flat_load_dwordx4, while bytes 15-30 are
; read one at a time with flat_load_ubyte and packed with v_lshl_or_b32. The result is written
; with two global_store_dwordx4, the second at offset 15 overlapping the first by one byte.
define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(0) source to an addrspace(1) destination, both 8-byte aligned.
; The assertions expect two flat_load_dwordx4 reads (offsets 16 and 0) issued in one clause,
; each followed by the matching global_store_dwordx4.
define void @memcpy_p1_p0_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(0) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect a single flat_load_dwordx4 followed by a single global_store_dwordx4.
define void @memcpy_p1_p0_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(0) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect bytes 0-15 to be read with one flat_load_dwordx4, while bytes 15-30 are
; read one at a time with flat_load_ubyte and packed with v_lshl_or_b32. The result is written
; with two global_store_dwordx4, the second at offset 15 overlapping the first by one byte.
define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(0) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect two flat_load_dwordx4 reads (offsets 16 and 0) issued in one clause,
; each followed by the matching global_store_dwordx4.
define void @memcpy_p1_p0_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p0_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes between two addrspace(1) pointers that are only 1-byte aligned.
; The assertions expect a single global_load_dwordx4 / global_store_dwordx4 pair despite the
; low alignment.
define void @memcpy_p1_p1_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes between two addrspace(1) pointers that are only 1-byte aligned.
; The assertions expect three global load/store pairs: dwordx4 at offset 0, dwordx2 at offset 16,
; and a dwordx2 at offset 23 that overlaps the previous piece to cover the odd-sized tail.
define void @memcpy_p1_p1_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes between two addrspace(1) pointers that are only 1-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs (offsets 16 and 0).
define void @memcpy_p1_p1_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes between two addrspace(1) pointers, both 2-byte aligned.
; The assertions expect a single global_load_dwordx4 / global_store_dwordx4 pair.
define void @memcpy_p1_p1_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes between two addrspace(1) pointers, both 2-byte aligned.
; The assertions expect three global load/store pairs: dwordx4 at offset 0, dwordx2 at offset 16,
; and a dwordx2 at offset 23 that overlaps the previous piece to cover the odd-sized tail.
define void @memcpy_p1_p1_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes between two addrspace(1) pointers, both 2-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs (offsets 16 and 0).
define void @memcpy_p1_p1_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes between two addrspace(1) pointers, both 8-byte aligned.
; The assertions expect a single global_load_dwordx4 / global_store_dwordx4 pair.
define void @memcpy_p1_p1_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes between two addrspace(1) pointers, both 8-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs, at offsets 0 and 15;
; the two 16-byte pieces overlap by one byte to cover the odd size.
define void @memcpy_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes between two addrspace(1) pointers, both 8-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs (offsets 16 and 0).
define void @memcpy_p1_p1_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes between two addrspace(1) pointers, both 16-byte aligned.
; The assertions expect a single global_load_dwordx4 / global_store_dwordx4 pair.
define void @memcpy_p1_p1_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes between two addrspace(1) pointers, both 16-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs, at offsets 0 and 15;
; the two 16-byte pieces overlap by one byte to cover the odd size.
define void @memcpy_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes between two addrspace(1) pointers, both 16-byte aligned.
; The assertions expect two global_load_dwordx4 / global_store_dwordx4 pairs (offsets 16 and 0).
define void @memcpy_p1_p1_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p1_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(3) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect one ds_read_b128 followed by one global_store_dwordx4; only lgkmcnt is
; waited on before the store.
define void @memcpy_p1_p3_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(3) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect three pieces: ds_read_b64 at offset 0, ds_read_b128 at offset 8, and a
; ds_read_b64 at offset 23 that overlaps the previous piece; each is written back with a global
; store of matching width and offset.
define void @memcpy_p1_p3_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b64 v[7:8], v2
; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:8
; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(3) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect two ds_read_b128 reads (offsets 0 and 16), each followed by the matching
; global_store_dwordx4.
define void @memcpy_p1_p3_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(3) source to an addrspace(1) destination, both 2-byte aligned.
; The assertions expect one ds_read_b128 followed by one global_store_dwordx4.
define void @memcpy_p1_p3_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(3) source to an addrspace(1) destination, both 2-byte aligned.
; The assertions expect three pieces: ds_read_b64 at offset 0, ds_read_b128 at offset 8, and a
; ds_read_b64 at offset 23 that overlaps the previous piece; each is written back with a global
; store of matching width and offset.
define void @memcpy_p1_p3_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b64 v[7:8], v2
; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:8
; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(3) source to an addrspace(1) destination, both 2-byte aligned.
; The assertions expect two ds_read_b128 reads (offsets 0 and 16), each followed by the matching
; global_store_dwordx4.
define void @memcpy_p1_p3_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(3) source to an addrspace(1) destination, both 8-byte aligned.
; Unlike the other alignments of this size, the assertions here expect the source to be read
; with ds_read2_b64 (two 64-bit elements) rather than ds_read_b128, then stored with one
; global_store_dwordx4.
define void @memcpy_p1_p3_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(3) source to an addrspace(1) destination, both 8-byte aligned.
; The assertions expect the first 16 bytes to be read with ds_read2_b64 and the remainder with
; a ds_read_b128 at offset 15 (overlapping by one byte); both pieces are written with
; global_store_dwordx4 at offsets 0 and 15.
define void @memcpy_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(3) source to an addrspace(1) destination, both 8-byte aligned.
; The assertions expect two ds_read2_b64 reads (element offsets 0/1 and 2/3), each followed by
; a global_store_dwordx4 at byte offsets 0 and 16.
define void @memcpy_p1_p3_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(3) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect one ds_read_b128 followed by one global_store_dwordx4.
define void @memcpy_p1_p3_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(3) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect two ds_read_b128 reads at offsets 0 and 15 (overlapping by one byte),
; each followed by a global_store_dwordx4 at the same offset.
define void @memcpy_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(3) source to an addrspace(1) destination, both 16-byte aligned.
; The assertions expect two ds_read_b128 reads (offsets 0 and 16), each followed by the matching
; global_store_dwordx4.
define void @memcpy_p1_p3_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p3_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[3:6], v2
; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(4) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect the addrspace(4) source to be read with a single global_load_dwordx4 and
; written with a single global_store_dwordx4.
define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copy 31 bytes from an addrspace(4) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect three pieces: dwordx2 at offset 0, dwordx4 at offset 8, and an
; overlapping dwordx2 at offset 23. Only the first two loads share a clause; the load for the
; offset-23 tail is expected after the first two stores have been issued.
define void @memcpy_p1_p4_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:8
; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copy 32 bytes from an addrspace(4) source to an addrspace(1) destination, both 1-byte aligned.
; The assertions expect two global_load_dwordx4 reads (offsets 0 and 16) in one clause, each
; followed by the matching global_store_dwordx4.
define void @memcpy_p1_p4_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copy 16 bytes from an addrspace(4) source to an addrspace(1) destination, both 2-byte aligned.
; The assertions expect a single global_load_dwordx4 / global_store_dwordx4 pair.
define void @memcpy_p1_p4_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(1), both pointers align 2.
; Expected gfx1030 code: a two-load clause (dwordx2 at offset 0, dwordx4 at
; offset 8) with the matching stores, then a dwordx2 load/store pair at
; offset 23 that overlaps the previous chunk by one byte.
define void @memcpy_p1_p4_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:8
; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(1), both pointers align 2.
; Expected gfx1030 code: two dwordx4 loads (offsets 0 and 16) issued in one
; clause, each followed by its dwordx4 store once the load has completed.
define void @memcpy_p1_p4_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: a single dwordx4 load followed by a single dwordx4
; store.
define void @memcpy_p1_p4_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: a dwordx4 load/store pair at offset 0, then a second
; dwordx4 load/store pair at offset 15. Bytes 0..15 and 15..30 overlap by one
; byte to cover the odd size. The loads are not clustered: each is waited on
; and stored before the next load is issued.
define void @memcpy_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: two sequential dwordx4 load/store pairs at offsets 0
; and 16. The loads are not clustered: each is waited on and stored before the
; next load is issued.
define void @memcpy_p1_p4_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(1), both pointers align 16.
; Expected gfx1030 code: a single dwordx4 load followed by a single dwordx4
; store.
define void @memcpy_p1_p4_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(1), both pointers align 16.
; Expected gfx1030 code: a dwordx4 load/store pair at offset 0, then a second
; dwordx4 load/store pair at offset 15. Bytes 0..15 and 15..30 overlap by one
; byte to cover the odd size.
define void @memcpy_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(1), both pointers align 16.
; Expected gfx1030 code: two sequential dwordx4 load/store pairs at offsets 0
; and 16, each load waited on and stored before the next load is issued.
define void @memcpy_p1_p4_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p4_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(1), both pointers align 1.
; Expected gfx1030 code: the source is read with 16 buffer_load_ubyte
; instructions in one clause (offsets 0..15). The bytes are packed pairwise
; into halfwords and then into four dwords with v_lshl_or_b32, and the result
; is written with a single global_store_dwordx4.
define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v11, v2, 8, v17
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(1), both pointers align 1.
; Expected gfx1030 code: 31 buffer_load_ubyte instructions in one clause
; (offsets 0..30), packed with v_lshl_or_b32 and written as a dwordx4 at
; offset 0, a dwordx2 at offset 16 and a dwordx2 at offset 23. The two dwordx2
; stores overlap at byte 23, so the byte loaded into v21 feeds both packed
; values.
define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v21
; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v32
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(5) to addrspace(1), both pointers align 1.
; Expected gfx1030 code: 32 buffer_load_ubyte instructions in one clause
; (offsets 0..31), packed with v_lshl_or_b32 into eight dwords and written
; with two global_store_dwordx4 instructions at offsets 0 and 16.
define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v31
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v33
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(1), both pointers align 2.
; Expected gfx1030 code: eight buffer_load_ushort instructions in one clause
; (even offsets 0..14), packed pairwise into four dwords with v_lshl_or_b32
; and written with a single global_store_dwordx4.
define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(1), both pointers align 2.
; Expected gfx1030 code: one clause of 20 loads -- twelve buffer_load_ushort
; (even offsets 0..22) plus eight buffer_load_ubyte (offsets 23..30) for the
; tail chunk. Results are packed with v_lshl_or_b32 and written as a dwordx4
; at offset 0, a dwordx2 at offset 16 and a dwordx2 at offset 23. Byte 23 is
; read twice: by the ushort load at offset 22 and by a ubyte load.
define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v12, v18, 8, v17
; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v7, v20, 16, v19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v6, v22, 16, v21
; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(5) to addrspace(1), both pointers align 2.
; Expected gfx1030 code: sixteen buffer_load_ushort instructions in one clause
; (even offsets 0..30), packed pairwise into eight dwords with v_lshl_or_b32
; and written with two global_store_dwordx4 instructions at offsets 0 and 16.
define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 16, v15
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 16, v17
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: four buffer_load_dword instructions in one clause
; (offsets 0, 4, 8, 12) landing in consecutive registers, then a single
; global_store_dwordx4 with no repacking.
define void @memcpy_p1_p5_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: one clause of 20 loads -- sixteen buffer_load_ubyte
; (offsets 15..30) for the overlapping tail chunk and four buffer_load_dword
; (offsets 0..12) for the first chunk. The bytes are packed into four dwords
; with v_lshl_or_b32, and the data is written as a dwordx4 at offset 0 and a
; dwordx4 at offset 15, which overlap at byte 15.
define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(5) to addrspace(1), both pointers align 8.
; Expected gfx1030 code: eight buffer_load_dword instructions in one clause
; (offsets 0..28). The first global_store_dwordx4 is issued once the first four
; loads have completed (vmcnt(4)), the second after all loads (vmcnt(0)).
define void @memcpy_p1_p5_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(1), both pointers align 16.
; Expected gfx1030 code: four buffer_load_dword instructions in one clause
; (offsets 0, 4, 8, 12) landing in consecutive registers, then a single
; global_store_dwordx4 with no repacking.
define void @memcpy_p1_p5_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(1), both pointers align 16.
; Expected gfx1030 code: one clause of 20 loads -- sixteen buffer_load_ubyte
; (offsets 15..30) for the overlapping tail chunk and four buffer_load_dword
; (offsets 0..12) for the first chunk. The bytes are packed into four dwords
; with v_lshl_or_b32, and the data is written as a dwordx4 at offset 0 and a
; dwordx4 at offset 15, which overlap at byte 15.
define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(5) to addrspace(1); both pointers are 16-byte aligned.
; Expected lowering: eight buffer_load_dword reads (offsets 0 through 28) followed by two
; global_store_dwordx4 instructions at offsets 0 and 16.
define void @memcpy_p1_p5_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p1_p5_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: sixteen flat_load_ubyte reads, packed into four dwords with
; v_lshl_or_b32, then written with a single ds_write2_b64.
define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2]
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 8, v14
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v1
; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: thirty-one flat_load_ubyte reads packed into dwords with
; v_lshl_or_b32. The byte at offset 23 is loaded once (v14) and feeds two packed values,
; because the tail is written as two overlapping 8-byte chunks: ds_write2_b64 covers
; bytes 0-15, and two ds_write_b64 instructions write at offsets 16 and 23.
define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v31, v[1:2]
; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v8
; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v17
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v8, v20, 8, v21
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v13, v22, 8, v23
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v11, v24, 8, v25
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v27
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v29
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v17, v32, 8, v1
; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 16, v12
; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v11
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT:    ds_write_b64 v0, v[3:4] offset:16
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: thirty-two flat_load_ubyte reads, packed into eight dwords with
; v_lshl_or_b32, then written with two ds_write2_b64 instructions (bytes 16-31 first,
; then bytes 0-15).
define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:31
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v32, v[1:2]
; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 8, v18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v9, v19, 8, v20
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v8, v21, 8, v22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v13, v23, 8, v24
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v14, v29, 8, v30
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v16, v31, 8, v32
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v17, v33, 8, v1
; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 2-byte aligned.
; Expected lowering: eight flat_load_ushort reads, combined pairwise into four dwords
; with v_lshl_or_b32 (shift by 16), then written with a single ds_write2_b64.
define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v4, v[1:2]
; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v1, v8, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v5
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v6
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 2-byte aligned.
; Expected lowering: twelve flat_load_ushort reads cover bytes 0-23, and eight
; flat_load_ubyte reads cover the overlapping tail chunk at bytes 23-30 (which starts at
; an odd offset). Writes are ds_write2_b64 for bytes 0-15 plus two ds_write_b64
; instructions at offsets 16 and 23.
define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ushort v18, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v19, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v20, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v21, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v22, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v1, v10, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
; CHECK-NEXT:    v_lshl_or_b32 v11, v5, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v18
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v20
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v22
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 2-byte aligned.
; Expected lowering: sixteen flat_load_ushort reads, combined pairwise into eight dwords
; with v_lshl_or_b32 (shift by 16), then written with two ds_write2_b64 instructions.
define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v18, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v4
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v3, v13, 16, v14
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v18
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 8-byte aligned.
; Expected lowering: one flat_load_dwordx4 followed by one ds_write2_b64 (two 8-byte
; halves rather than a single 16-byte write).
define void @memcpy_p3_p0_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 8-byte aligned.
; Expected lowering: bytes 0-15 come from one flat_load_dwordx4, while the overlapping
; chunk at bytes 15-30 is read with sixteen flat_load_ubyte instructions and packed with
; v_lshl_or_b32. Writes are ds_write2_b64 for bytes 0-15 and ds_write_b128 at offset 15.
define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 8-byte aligned.
; Expected lowering: two flat_load_dwordx4 reads (offsets 16 and 0), each followed by a
; ds_write2_b64 once its data has arrived.
define void @memcpy_p3_p0_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 16-byte aligned.
; Expected lowering: one flat_load_dwordx4 followed by one ds_write_b128.
define void @memcpy_p3_p0_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 16-byte aligned.
; Expected lowering: bytes 0-15 come from one flat_load_dwordx4, while the overlapping
; chunk at bytes 15-30 is read with sixteen flat_load_ubyte instructions and packed with
; v_lshl_or_b32. Writes are two ds_write_b128 instructions at offsets 0 and 15.
define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(0) to addrspace(3); both pointers are 16-byte aligned.
; Expected lowering: two flat_load_dwordx4 reads (offsets 16 and 0), each followed by a
; ds_write_b128 to the matching offset once its data has arrived.
define void @memcpy_p3_p0_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p0_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[3:6] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(1) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: despite the 1-byte alignment, a single global_load_dwordx4 followed
; by one ds_write2_b64 (no byte-wise splitting).
define void @memcpy_p3_p1_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(1) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: one global_load_dwordx4 for bytes 0-15 plus two overlapping
; global_load_dwordx2 reads at offsets 16 and 23. Writes mirror the loads: ds_write2_b64
; for bytes 0-15 and two ds_write_b64 instructions at offsets 16 and 23.
define void @memcpy_p3_p1_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte llvm.memcpy from addrspace(1) to addrspace(3); both pointers are only 1-byte aligned.
; Expected lowering: two global_load_dwordx4 reads (offsets 0 and 16), each followed by a
; ds_write2_b64 once its data has arrived.
define void @memcpy_p3_p1_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte llvm.memcpy from addrspace(1) to addrspace(3); both pointers are 2-byte aligned.
; Expected lowering: a single global_load_dwordx4 followed by one ds_write2_b64, the
; same sequence expected for the align-1 variant of this copy.
define void @memcpy_p3_p1_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte llvm.memcpy from addrspace(1) to addrspace(3); both pointers are 2-byte aligned.
; Expected lowering: one global_load_dwordx4 for bytes 0-15 plus two overlapping
; global_load_dwordx2 reads at offsets 16 and 23. Writes mirror the loads: ds_write2_b64
; for bytes 0-15 and two ds_write_b64 instructions at offsets 16 and 23.
define void @memcpy_p3_p1_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(1) to addrspace(3), both pointers align 2.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write2_b64 as soon as its load has completed.
define void @memcpy_p3_p1_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(1) to addrspace(3), both pointers align 8.
; The assertions expect one global_load_dwordx4 stored by a single ds_write2_b64.
define void @memcpy_p3_p1_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(1) to addrspace(3), both pointers align 8.
; The assertions expect two 16-byte loads at offsets 0 and 15 (overlapping at
; byte 15), stored by a ds_write2_b64 and a ds_write_b128 at offset 15.
define void @memcpy_p3_p1_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(1) to addrspace(3), both pointers align 8.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write2_b64.
define void @memcpy_p3_p1_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(1) to addrspace(3), both pointers align 16.
; With 16-byte alignment the assertions expect the store to be a single
; ds_write_b128 rather than the ds_write2_b64 seen at lower alignments.
define void @memcpy_p3_p1_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(1) to addrspace(3), both pointers align 16.
; The assertions expect two 16-byte loads at offsets 0 and 15 (overlapping at
; byte 15), each stored by a ds_write_b128 at the same offset.
define void @memcpy_p3_p1_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(1) to addrspace(3), both pointers align 16.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write_b128 at the same offset.
define void @memcpy_p3_p1_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p1_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(3), both pointers align 1.
; The assertions expect one ds_read2_b64 followed by one ds_write2_b64.
define void @memcpy_p3_p3_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(3), both pointers align 1.
; The assertions expect three reads (8 bytes at offset 23, 8 bytes at offset 16,
; then a ds_read2_b64 at the base) written back in the same order; the two
; 8-byte accesses overlap at byte 23.
define void @memcpy_p3_p3_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:23
; CHECK-NEXT:    ds_read_b64 v[7:8], v1 offset:16
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(3), both pointers align 1.
; The assertions expect two ds_read2_b64 (the offset0:2 offset1:3 pair first,
; then the base pair), written back by two ds_write2_b64 in the same order.
define void @memcpy_p3_p3_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(3), both pointers align 2.
; The assertions expect one ds_read2_b64 followed by one ds_write2_b64.
define void @memcpy_p3_p3_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(3), both pointers align 2.
; The assertions expect three reads (8 bytes at offset 23, 8 bytes at offset 16,
; then a ds_read2_b64 at the base) written back in the same order; the two
; 8-byte accesses overlap at byte 23.
define void @memcpy_p3_p3_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:23
; CHECK-NEXT:    ds_read_b64 v[7:8], v1 offset:16
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(3), both pointers align 2.
; The assertions expect two ds_read2_b64 (the offset0:2 offset1:3 pair first,
; then the base pair), written back by two ds_write2_b64 in the same order.
define void @memcpy_p3_p3_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(3), both pointers align 8.
; The assertions expect one ds_read2_b64 followed by one ds_write2_b64.
define void @memcpy_p3_p3_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(3), both pointers align 8.
; The assertions expect a ds_read2_b64 at the base plus a ds_read_b128 at
; offset 15 (overlapping at byte 15), written back by a ds_write2_b64 and a
; ds_write_b128 at offset 15.
define void @memcpy_p3_p3_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(3), both pointers align 8.
; The assertions expect two ds_read2_b64 (the offset0:2 offset1:3 pair first,
; then the base pair), written back by two ds_write2_b64 in the same order.
define void @memcpy_p3_p3_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(3) to addrspace(3), both pointers align 16.
; With 16-byte alignment the assertions expect a single ds_read_b128 and a
; single ds_write_b128 instead of the 2 x b64 forms seen at lower alignments.
define void @memcpy_p3_p3_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[1:4], v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(3) to addrspace(3), both pointers align 16.
; The assertions expect two ds_read_b128 (offset 15 first, then the base),
; overlapping at byte 15, written back by two ds_write_b128 in the same order.
define void @memcpy_p3_p3_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v1 offset:15
; CHECK-NEXT:    ds_read_b128 v[6:9], v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[6:9]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(3) to addrspace(3), both pointers align 16.
; The assertions expect two ds_read_b128 (offset 16 first, then the base),
; written back by two ds_write_b128 in the same order.
define void @memcpy_p3_p3_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p3_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v1 offset:16
; CHECK-NEXT:    ds_read_b128 v[6:9], v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[6:9]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(3), both pointers align 1.
; The assertions expect one global_load_dwordx4 stored by a single ds_write2_b64.
define void @memcpy_p3_p4_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(3), both pointers align 1.
; The assertions expect three loads (16 bytes at offset 0, 8 bytes at offset 16,
; 8 bytes at offset 23) and matching stores; the last two accesses overlap at
; byte 23.
define void @memcpy_p3_p4_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(3), both pointers align 1.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write2_b64.
define void @memcpy_p3_p4_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(3), both pointers align 2.
; The assertions expect one global_load_dwordx4 stored by a single ds_write2_b64.
define void @memcpy_p3_p4_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(3), both pointers align 2.
; The assertions expect three loads (16 bytes at offset 0, 8 bytes at offset 16,
; 8 bytes at offset 23) and matching stores; the last two accesses overlap at
; byte 23.
define void @memcpy_p3_p4_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x2
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(3), both pointers align 2.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write2_b64.
define void @memcpy_p3_p4_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(3), both pointers align 8.
; The assertions expect one global_load_dwordx4 stored by a single ds_write2_b64.
define void @memcpy_p3_p4_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(3), both pointers align 8.
; The assertions expect two 16-byte loads at offsets 0 and 15 (overlapping at
; byte 15), stored by a ds_write2_b64 and a ds_write_b128 at offset 15.
define void @memcpy_p3_p4_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(3), both pointers align 8.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write2_b64.
define void @memcpy_p3_p4_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(3), both pointers align 16.
; With 16-byte alignment the assertions expect the store to be a single
; ds_write_b128 rather than the ds_write2_b64 seen at lower alignments.
define void @memcpy_p3_p4_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(3), both pointers align 16.
; The assertions expect two 16-byte loads at offsets 0 and 15 (overlapping at
; byte 15), each stored by a ds_write_b128 at the same offset.
define void @memcpy_p3_p4_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(3), both pointers align 16.
; The assertions expect two global_load_dwordx4 (offsets 0 and 16), each stored
; by a ds_write_b128 at the same offset.
define void @memcpy_p3_p4_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p4_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(5) to addrspace(3), both pointers align 1.
; Unlike the addrspace(1)/(3)/(4) sources, the assertions here expect the
; source to be read one byte at a time: 16 buffer_load_ubyte (offsets 0..15),
; combined pairwise with v_lshl_or_b32 (shift 8, then shift 16) into four
; dwords in v[1:4], which are then stored by a single ds_write2_b64.
define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v10, v1, 8, v16
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(5) to addrspace(3), both pointers align 1.
; The assertions expect 31 buffer_load_ubyte (offsets 0..30) whose results are
; combined with v_lshl_or_b32 into eight dwords in v[1:8]. These are stored as
; a ds_write2_b64 at the base plus two ds_write_b64 at offsets 16 and 23. The
; byte loaded from offset 23 (v20) feeds both the offset-16 and the offset-23
; store values, because those two 8-byte stores overlap at byte 23.
define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v31
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers only 1-byte aligned.
; Expected gfx1030 output: 32 buffer_load_ubyte ops (offsets 0..31) issued
; under one s_clause, byte pairs merged into halfwords and then dwords with
; v_lshl_or_b32, and the 32 bytes written with two ds_write2_b64 stores.
; The s_waitcnt vmcnt(N) values step down as each pair of loaded bytes is
; consumed, so the packing overlaps with the still-outstanding loads.
define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v32
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 2-byte aligned.
; Expected gfx1030 output: eight buffer_load_ushort ops (offsets 0..14), each
; pair of halfwords merged into a dword with v_lshl_or_b32, and a single
; ds_write2_b64 that stores all 16 bytes.
define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 2-byte aligned.
; Expected gfx1030 output: bytes 0..23 are fetched with twelve
; buffer_load_ushort ops, and the odd-sized tail (bytes 23..30) with eight
; buffer_load_ubyte ops. The data is packed with v_lshl_or_b32 and stored as
; ds_write2_b64 (bytes 0..15), ds_write_b64 at offset 16, and ds_write_b64 at
; offset 23. The last store overlaps the previous one at byte 23, which lets
; the 7-byte tail be written with one 8-byte store.
define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v10, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v11, v17, 8, v16
; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v20
; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 2-byte aligned.
; Expected gfx1030 output: sixteen buffer_load_ushort ops (offsets 0..30),
; halfword pairs merged into eight dwords with v_lshl_or_b32, and two
; ds_write2_b64 stores covering bytes 0..15 and 16..31.
define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 8-byte aligned.
; Expected gfx1030 output: four buffer_load_dword ops (offsets 0, 4, 8, 12)
; feeding one ds_write2_b64; no byte or halfword repacking is needed at this
; alignment.
define void @memcpy_p3_p5_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 8-byte aligned.
; Expected gfx1030 output: bytes 0..15 are fetched with four
; buffer_load_dword ops and stored with ds_write2_b64. The remaining data is
; fetched as sixteen buffer_load_ubyte ops covering bytes 15..30, packed with
; v_lshl_or_b32 into v[1:4], and stored with one ds_write_b128 at offset 15.
; That second store overlaps the first at byte 15, so the 15-byte tail is
; written with a single 16-byte store.
define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v4
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
; CHECK-NEXT:    v_lshl_or_b32 v4, v1, 16, v2
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v3, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v16
; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v3
; CHECK-NEXT:    v_lshl_or_b32 v1, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset1:1
; CHECK-NEXT:    ds_write_b128 v0, v[1:4] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 8-byte aligned.
; Expected gfx1030 output: eight buffer_load_dword ops (offsets 0..28) and two
; ds_write2_b64 stores. The first store is issued after s_waitcnt vmcnt(4),
; i.e. once the first four dwords have arrived, and the second after
; vmcnt(0).
define void @memcpy_p3_p5_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 16-byte aligned.
; Expected gfx1030 output: four buffer_load_dword ops (offsets 0, 4, 8, 12)
; feeding a single ds_write_b128. The 8-byte-aligned variant of this test
; expects ds_write2_b64 instead.
define void @memcpy_p3_p5_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 16-byte aligned.
; Expected gfx1030 output: bytes 0..15 are fetched with four
; buffer_load_dword ops and stored with ds_write_b128. Bytes 15..30 are
; fetched with sixteen buffer_load_ubyte ops, packed with v_lshl_or_b32 into
; v[6:9], and stored with a second ds_write_b128 at offset 15. The two stores
; overlap at byte 15, so the 15-byte tail is written with one 16-byte store.
define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v8
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
; CHECK-NEXT:    v_lshl_or_b32 v9, v1, 16, v6
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v7
; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(5) source to an addrspace(3) destination,
; both pointers 16-byte aligned.
; Expected gfx1030 output: eight buffer_load_dword ops (offsets 0..28) and two
; ds_write_b128 stores at offsets 0 and 16. The first store is issued after
; s_waitcnt vmcnt(4), once the first four dwords have arrived, and the second
; after vmcnt(0).
define void @memcpy_p3_p5_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p3_p5_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers only 1-byte aligned.
; Expected gfx1030 output: sixteen flat_load_ubyte ops (offsets 15 down to 0)
; issued under one s_clause, then sixteen buffer_store_byte ops in the same
; order. Each store is preceded by an s_waitcnt whose vmcnt and lgkmcnt both
; step down by one, so every byte is stored as soon as its load completes.
; The final load reuses v1 (the low half of the source address pair v[1:2])
; as its destination, since no further loads need the address.
define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers only 1-byte aligned.
; Expected gfx1030 output: the copy is done byte by byte in two batches.
;   Batch 1: fifteen flat_load_ubyte ops for bytes 14 down to 0, each then
;            written with buffer_store_byte behind a stepping s_waitcnt.
;   Batch 2: sixteen flat_load_ubyte ops for bytes 30 down to 15, stored the
;            same way.
; In batch 1 the last load targets v17 rather than v1, which keeps the source
; address pair v[1:2] intact for batch 2; only the very last load of batch 2
; overwrites v1.
define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xe
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 1 byte.
; Expected gfx1030 code copies byte by byte in two batches of 16: sixteen
; flat_load_ubyte for offsets 15..0 followed by the matching
; buffer_store_byte instructions, then the same pattern for offsets 31..16.
; The source address pair v[1:2] is kept live across the first batch (the
; offset-0 byte goes to v18) and is only overwritten by the final load of the
; second batch.
define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:31
; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 2 bytes.
; Expected gfx1030 code copies in 2-byte units: eight flat_load_ushort
; (offsets 14..0) in a single clause, each followed later by a matching
; buffer_store_short once its load has completed.
define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 2 bytes.
; Expected gfx1030 code handles the odd size with one flat_load_ubyte for the
; trailing byte at offset 30 plus fifteen flat_load_ushort for offsets 28..0,
; all in a single clause; each value is then written back with a store of the
; same width (one buffer_store_byte, fifteen buffer_store_short).
define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 2 bytes.
; Expected gfx1030 code copies in 2-byte units: sixteen flat_load_ushort
; (offsets 30..0) in a single clause, followed by sixteen buffer_store_short,
; each preceded by a wait for its corresponding load.
define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 8 bytes.
; Expected gfx1030 code reads the whole block with a single
; flat_load_dwordx4 and writes it back as four buffer_store_dword
; instructions (offsets 12, 8, 4, 0).
define void @memcpy_p5_p0_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 8 bytes.
; Expected gfx1030 code splits the copy into two overlapping 16-byte pieces:
;   - bytes 0..15 via one flat_load_dwordx4 / four buffer_store_dword, and
;   - bytes 15..30 via sixteen flat_load_ubyte / buffer_store_byte pairs.
; Offset 15 is therefore covered by both pieces. All seventeen loads are
; issued in one clause before any store.
define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 8 bytes.
; Expected gfx1030 code issues two flat_load_dwordx4 (offsets 16 and 0) in
; one clause, then writes each 16-byte half back as four buffer_store_dword
; instructions, waiting separately for each load before its stores.
define void @memcpy_p5_p0_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 16 bytes.
; Expected gfx1030 code reads the whole block with a single
; flat_load_dwordx4 and writes it back as four buffer_store_dword
; instructions (offsets 12, 8, 4, 0).
define void @memcpy_p5_p0_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 16 bytes.
; Expected gfx1030 code splits the copy into two overlapping 16-byte pieces:
;   - bytes 0..15 via one flat_load_dwordx4 / four buffer_store_dword, and
;   - bytes 15..30 via sixteen flat_load_ubyte / buffer_store_byte pairs.
; Offset 15 is therefore covered by both pieces. All seventeen loads are
; issued in one clause before any store.
define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x10
; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from an addrspace(0) source to an addrspace(5) destination,
; both pointers aligned to 16 bytes.
; Expected gfx1030 code issues two flat_load_dwordx4 (offsets 16 and 0) in
; one clause, then writes each 16-byte half back as four buffer_store_dword
; instructions, waiting separately for each load before its stores.
define void @memcpy_p5_p0_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p0_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from an addrspace(1) source to an addrspace(5) destination,
; both pointers aligned to 1 byte.
; Expected gfx1030 code copies byte by byte: sixteen global_load_ubyte
; (offsets 15..0) in a single clause, then sixteen buffer_store_byte. Unlike
; the addrspace(0)-source variants, the waits between stores only mention
; vmcnt (no lgkmcnt).
define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from an addrspace(1) source to an addrspace(5) destination,
; both pointers aligned to 1 byte.
; Expected gfx1030 code copies byte by byte with all 31 global_load_ubyte
; issued in one clause before any store. The load order is offsets 0..14
; ascending, then 30..16 descending, and finally offset 15 (which reuses v1
; and so ends the lifetime of the source address pair v[1:2]). The 31
; buffer_store_byte instructions follow in the same order, each gated by a
; vmcnt wait for its load.
define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 1. The expected gfx1030 output is fully bytewise: a single
; s_clause of 32 global_load_ubyte, followed by 32 buffer_store_byte, each
; store preceded by an s_waitcnt whose vmcnt counts down from 31 to 0.
define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(31)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 2. The expected output works in 2-byte units: one s_clause
; of eight global_load_ushort (offsets 0..14), then eight buffer_store_short,
; each preceded by an s_waitcnt whose vmcnt counts down from 7 to 0.
define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 2. The odd size is handled with one trailing byte: the
; expected output loads a single global_load_ubyte at offset 30 plus fifteen
; global_load_ushort (offsets 28 down to 0) in one s_clause, then stores them
; with one buffer_store_byte and fifteen buffer_store_short, each preceded by
; an s_waitcnt whose vmcnt counts down from 15 to 0.
define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 2. The expected output works in 2-byte units: one s_clause
; of sixteen global_load_ushort (offsets 30 down to 0), then sixteen
; buffer_store_short, each preceded by an s_waitcnt whose vmcnt counts down
; from 15 to 0.
define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 8. The expected output reads all 16 bytes with a single
; global_load_dwordx4 and, after one s_waitcnt vmcnt(0), writes them back as
; four buffer_store_dword at offsets 12, 8, 4 and 0.
define void @memcpy_p5_p1_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 8. The expected output issues two overlapping
; global_load_dwordx4 (offsets 0 and 15). The first is written back as four
; buffer_store_dword. The second is split into sixteen single-byte stores
; covering destination offsets 15..30: per source register, the low byte via
; buffer_store_byte, bits 16..23 via buffer_store_byte_d16_hi, and the two
; remaining bytes via v_lshrrev_b32 by 8 and 24 followed by buffer_store_byte.
; NOTE(review): destination byte 15 is therefore written twice, and the tail
; is stored bytewise even though the destination is 8-byte aligned. This is
; what the autogenerated assertions record for current llc output; confirm it
; is intended before relying on it as the desired lowering.
define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 8. The expected output issues two global_load_dwordx4
; (offsets 0 and 16) in one s_clause and writes each back as four
; buffer_store_dword: the first group after s_waitcnt vmcnt(1), the second
; after s_waitcnt vmcnt(0).
define void @memcpy_p5_p1_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 16. The expected output reads all 16 bytes with a single
; global_load_dwordx4 and, after one s_waitcnt vmcnt(0), writes them back as
; four buffer_store_dword at offsets 12, 8, 4 and 0.
define void @memcpy_p5_p1_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 16. The expected output issues two overlapping
; global_load_dwordx4 (offsets 0 and 15). The first is written back as four
; buffer_store_dword. The second is split into sixteen single-byte stores
; covering destination offsets 15..30: per source register, the low byte via
; buffer_store_byte, bits 16..23 via buffer_store_byte_d16_hi, and the two
; remaining bytes via v_lshrrev_b32 by 8 and 24 followed by buffer_store_byte.
; NOTE(review): destination byte 15 is therefore written twice, and the tail
; is stored bytewise even though the destination is 16-byte aligned. This is
; what the autogenerated assertions record for current llc output; confirm it
; is intended before relying on it as the desired lowering.
define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(1) source to an addrspace(5) destination,
; both with align 16. The expected output issues two global_load_dwordx4
; (offsets 0 and 16) in one s_clause and writes each back as four
; buffer_store_dword: the first group after s_waitcnt vmcnt(1), the second
; after s_waitcnt vmcnt(0).
define void @memcpy_p5_p1_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p1_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(3) source to an addrspace(5) destination,
; both with align 1. The expected output is fully bytewise: sixteen
; ds_read_u8 (offsets 15 down to 0), then sixteen buffer_store_byte, each
; preceded by an s_waitcnt whose lgkmcnt counts down from 15 to 0. Unlike the
; addrspace(1) source variants, no s_clause is expected before the reads.
define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
; CHECK-NEXT:    ds_read_u8 v7, v1 offset:10
; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
; CHECK-NEXT:    ds_read_u8 v9, v1 offset:8
; CHECK-NEXT:    ds_read_u8 v10, v1 offset:7
; CHECK-NEXT:    ds_read_u8 v11, v1 offset:6
; CHECK-NEXT:    ds_read_u8 v12, v1 offset:5
; CHECK-NEXT:    ds_read_u8 v13, v1 offset:4
; CHECK-NEXT:    ds_read_u8 v14, v1 offset:3
; CHECK-NEXT:    ds_read_u8 v15, v1 offset:2
; CHECK-NEXT:    ds_read_u8 v16, v1 offset:1
; CHECK-NEXT:    ds_read_u8 v1, v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(3) source to an addrspace(5) destination,
; both with align 1. The expected output is fully bytewise (31 ds_read_u8 and
; 31 buffer_store_byte) but is not a simple load-all/store-all sequence:
;   - sixteen reads (offsets 0..15) are issued first;
;   - the stores for offsets 0..2 follow, freeing v2..v4 for reuse;
;   - the remaining fifteen reads (offsets 24..30, then 16..23) are issued;
;   - stores for offsets 3..14 each wait on an individual lgkmcnt (27 down to
;     16), while the stores for offsets 30..24 share one s_waitcnt lgkmcnt(8)
;     and the stores for offsets 23..15 share the final s_waitcnt lgkmcnt(0).
; NOTE(review): the interleaving and register reuse reflect the autogenerated
; llc schedule; regenerate the assertions rather than editing them by hand.
define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v2, v1
; CHECK-NEXT:    ds_read_u8 v3, v1 offset:1
; CHECK-NEXT:    ds_read_u8 v4, v1 offset:2
; CHECK-NEXT:    ds_read_u8 v5, v1 offset:3
; CHECK-NEXT:    ds_read_u8 v6, v1 offset:4
; CHECK-NEXT:    ds_read_u8 v7, v1 offset:5
; CHECK-NEXT:    ds_read_u8 v8, v1 offset:6
; CHECK-NEXT:    ds_read_u8 v9, v1 offset:7
; CHECK-NEXT:    ds_read_u8 v10, v1 offset:8
; CHECK-NEXT:    ds_read_u8 v11, v1 offset:9
; CHECK-NEXT:    ds_read_u8 v12, v1 offset:10
; CHECK-NEXT:    ds_read_u8 v13, v1 offset:11
; CHECK-NEXT:    ds_read_u8 v14, v1 offset:12
; CHECK-NEXT:    ds_read_u8 v15, v1 offset:13
; CHECK-NEXT:    ds_read_u8 v16, v1 offset:14
; CHECK-NEXT:    ds_read_u8 v17, v1 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
; CHECK-NEXT:    ds_read_u8 v18, v1 offset:27
; CHECK-NEXT:    ds_read_u8 v19, v1 offset:28
; CHECK-NEXT:    ds_read_u8 v20, v1 offset:29
; CHECK-NEXT:    ds_read_u8 v21, v1 offset:30
; CHECK-NEXT:    ds_read_u8 v22, v1 offset:16
; CHECK-NEXT:    ds_read_u8 v23, v1 offset:17
; CHECK-NEXT:    ds_read_u8 v24, v1 offset:18
; CHECK-NEXT:    ds_read_u8 v25, v1 offset:19
; CHECK-NEXT:    ds_read_u8 v26, v1 offset:20
; CHECK-NEXT:    ds_read_u8 v27, v1 offset:21
; CHECK-NEXT:    ds_read_u8 v28, v1 offset:22
; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(27)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(26)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(25)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(22)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(21)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt lgkmcnt(20)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(19)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt lgkmcnt(18)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers only 1-byte aligned.
; The autogenerated assertions below expect a fully byte-wise expansion:
; 32 ds_read_u8 loads (issued in two groups of 16) and 32 buffer_store_byte
; stores, with s_waitcnt lgkmcnt(N) placed ahead of the stores that consume
; the loaded bytes.
define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
; CHECK-NEXT:    ds_read_u8 v7, v1 offset:8
; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
; CHECK-NEXT:    ds_read_u8 v9, v1 offset:10
; CHECK-NEXT:    ds_read_u8 v10, v1
; CHECK-NEXT:    ds_read_u8 v11, v1 offset:1
; CHECK-NEXT:    ds_read_u8 v12, v1 offset:2
; CHECK-NEXT:    ds_read_u8 v13, v1 offset:3
; CHECK-NEXT:    ds_read_u8 v14, v1 offset:4
; CHECK-NEXT:    ds_read_u8 v15, v1 offset:5
; CHECK-NEXT:    ds_read_u8 v16, v1 offset:6
; CHECK-NEXT:    ds_read_u8 v17, v1 offset:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
; CHECK-NEXT:    ds_read_u8 v18, v1 offset:29
; CHECK-NEXT:    ds_read_u8 v19, v1 offset:30
; CHECK-NEXT:    ds_read_u8 v20, v1 offset:31
; CHECK-NEXT:    ds_read_u8 v21, v1 offset:16
; CHECK-NEXT:    ds_read_u8 v22, v1 offset:17
; CHECK-NEXT:    ds_read_u8 v23, v1 offset:18
; CHECK-NEXT:    ds_read_u8 v24, v1 offset:19
; CHECK-NEXT:    ds_read_u8 v25, v1 offset:20
; CHECK-NEXT:    ds_read_u8 v26, v1 offset:21
; CHECK-NEXT:    ds_read_u8 v27, v1 offset:22
; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:31
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 32 operand is the constant byte count under test (sz32).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 2-byte aligned.
; The autogenerated assertions below expect a 16-bit-wide expansion: eight
; ds_read_u16 loads followed by eight buffer_store_short stores, each store
; preceded by its own s_waitcnt lgkmcnt(N) counting down from 7 to 0.
define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u16 v2, v1
; CHECK-NEXT:    ds_read_u16 v3, v1 offset:2
; CHECK-NEXT:    ds_read_u16 v4, v1 offset:4
; CHECK-NEXT:    ds_read_u16 v5, v1 offset:6
; CHECK-NEXT:    ds_read_u16 v6, v1 offset:8
; CHECK-NEXT:    ds_read_u16 v7, v1 offset:10
; CHECK-NEXT:    ds_read_u16 v8, v1 offset:12
; CHECK-NEXT:    ds_read_u16 v1, v1 offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 16 operand is the constant byte count under test (sz16).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 2-byte aligned.
; The autogenerated assertions below expect the odd trailing byte (offset 30)
; to be moved with ds_read_u8 / buffer_store_byte and the remaining 30 bytes
; with fifteen ds_read_u16 / buffer_store_short pairs. All 16 loads are issued
; first; each store then follows its own s_waitcnt lgkmcnt(N), 15 down to 0.
define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u8 v2, v1 offset:30
; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
; CHECK-NEXT:    ds_read_u16 v1, v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 31 operand is the constant byte count under test (sz31).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 2-byte aligned.
; The autogenerated assertions below expect sixteen ds_read_u16 loads (issued
; from offset 30 down to 0) followed by sixteen buffer_store_short stores,
; each store preceded by its own s_waitcnt lgkmcnt(N) counting down 15 to 0.
define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_u16 v2, v1 offset:30
; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
; CHECK-NEXT:    ds_read_u16 v1, v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 32 operand is the constant byte count under test (sz32).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 8-byte aligned.
; The autogenerated assertions below expect a single ds_read2_b64 that fills
; v[1:4], one s_waitcnt lgkmcnt(0), and then four buffer_store_dword stores
; at offsets 12, 8, 4 and 0.
define void @memcpy_p5_p3_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 16 operand is the constant byte count under test (sz16).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 8-byte aligned.
; The autogenerated assertions below expect two loads: a ds_read2_b64 into
; v[2:5] and a ds_read_b128 into v[6:9] at offset 15. v[2:5] is written back
; with four buffer_store_dword stores (offsets 0..12). v[6:9] is written back
; one byte at a time to offsets 15..30, using buffer_store_byte,
; buffer_store_byte_d16_hi, and v_lshrrev_b32 shifts by 8 and 24.
; Destination offset 15 is therefore written by both groups of stores.
define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 31 operand is the constant byte count under test (sz31).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 8-byte aligned.
; The autogenerated assertions below expect two ds_read2_b64 loads (into
; v[2:5] and v[6:9]) followed by eight buffer_store_dword stores covering
; offsets 0..28: the first four after s_waitcnt lgkmcnt(1), the last four
; after s_waitcnt lgkmcnt(0).
define void @memcpy_p5_p3_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 32 operand is the constant byte count under test (sz32).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 16-byte aligned.
; The autogenerated assertions below expect a single ds_read_b128 that fills
; v[1:4], one s_waitcnt lgkmcnt(0), and then four buffer_store_dword stores
; at offsets 12, 8, 4 and 0.
define void @memcpy_p5_p3_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[1:4], v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 16 operand is the constant byte count under test (sz16).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 16-byte aligned.
; The autogenerated assertions below expect two ds_read_b128 loads: one at
; offset 0 into v[2:5] and one at offset 15 into v[6:9]. v[2:5] is written
; back with four buffer_store_dword stores (offsets 0..12). v[6:9] is written
; back one byte at a time to offsets 15..30, using buffer_store_byte,
; buffer_store_byte_d16_hi, and v_lshrrev_b32 shifts by 8 and 24.
; Destination offset 15 is therefore written by both groups of stores.
define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v1
; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 31 operand is the constant byte count under test (sz31).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; Copies 32 bytes from an addrspace(3) source to an addrspace(5) destination,
; with both pointers 16-byte aligned.
; The autogenerated assertions below expect two ds_read_b128 loads (offset 0
; into v[2:5], offset 16 into v[6:9]) followed by eight buffer_store_dword
; stores covering offsets 0..28: the first four after s_waitcnt lgkmcnt(1),
; the last four after s_waitcnt lgkmcnt(0).
define void @memcpy_p5_p3_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p3_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b128 v[2:5], v1
; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:16
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 32 operand is the constant byte count under test (sz32).
  tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Copies 16 bytes from an addrspace(4) source to an addrspace(5) destination,
; with both pointers only 1-byte aligned.
; The autogenerated assertions below expect a fully byte-wise expansion: an
; s_clause 0xf followed by sixteen global_load_ubyte loads (offset 15 down to
; 0), then sixteen buffer_store_byte stores, each preceded by its own
; s_waitcnt vmcnt(N) counting down from 15 to 0.
define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 16 operand is the constant byte count under test (sz16).
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; Copies 31 bytes from an addrspace(4) source to an addrspace(5) destination,
; with both pointers only 1-byte aligned.
; The autogenerated assertions below expect a fully byte-wise expansion: an
; s_clause 0x1e followed by 31 global_load_ubyte loads (offsets 0..14
; ascending, then 30..16 descending, then 15), and 31 buffer_store_byte
; stores in the same order, each preceded by its own s_waitcnt vmcnt(N)
; counting down from 30 to 0.
define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1e
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The i64 31 operand is the constant byte count under test (sz31).
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1f
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(31)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(30)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(29)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(28)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(27)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(26)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(25)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(24)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(23)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(5); both pointers are 2-byte
; aligned. The assertions below expect the copy to be expanded into eight
; 16-bit loads (global_load_ushort, offsets 0..14) grouped under one s_clause,
; followed by eight buffer_store_short instructions at the same offsets, each
; preceded by an s_waitcnt whose vmcnt counts down from 7 to 0.
define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(5); both pointers are 2-byte
; aligned. Because the size is odd, the assertions below expect one byte load
; for the trailing byte (global_load_ubyte at offset 30) plus fifteen 16-bit
; loads (global_load_ushort, offsets 28 down to 0), all under one s_clause.
; The stores mirror the loads: one buffer_store_byte at offset 30 and fifteen
; buffer_store_short, each preceded by an s_waitcnt counting vmcnt from 15 to 0.
define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(5); both pointers are 2-byte
; aligned. The assertions below expect sixteen 16-bit loads
; (global_load_ushort, offsets 30 down to 0) under one s_clause, followed by
; sixteen buffer_store_short instructions at the same offsets, each preceded
; by an s_waitcnt whose vmcnt counts down from 15 to 0.
define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(5); both pointers are 8-byte
; aligned. The assertions below expect a single 128-bit load
; (global_load_dwordx4), one s_waitcnt vmcnt(0), and then four
; buffer_store_dword instructions at offsets 12, 8, 4 and 0.
define void @memcpy_p5_p4_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(5); both pointers are 8-byte
; aligned. The assertions below expect two 128-bit loads: one at offset 0 and
; a second at offset 15, which overlaps the first by one byte so that together
; they cover bytes 0..30. The first 16 bytes are written with four
; buffer_store_dword instructions. The data from the second load starts at
; the odd offset 15 and is written one byte at a time (offsets 15..30) using
; buffer_store_byte, buffer_store_byte_d16_hi, and v_lshrrev_b32 shifts by 8
; and 24 to extract the remaining bytes of each dword.
define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(5); both pointers are 8-byte
; aligned. The assertions below expect two 128-bit loads (global_load_dwordx4
; at offsets 0 and 16) under one s_clause. After s_waitcnt vmcnt(1) the first
; 16 bytes are written with four buffer_store_dword instructions; after
; s_waitcnt vmcnt(0) the second 16 bytes are written with four more.
define void @memcpy_p5_p4_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy from addrspace(4) to addrspace(5); both pointers are 16-byte
; aligned. The assertions below expect a single 128-bit load
; (global_load_dwordx4), one s_waitcnt vmcnt(0), and then four
; buffer_store_dword instructions at offsets 12, 8, 4 and 0; the destination
; is still written in 32-bit pieces despite the 16-byte alignment.
define void @memcpy_p5_p4_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy from addrspace(4) to addrspace(5); both pointers are 16-byte
; aligned. The assertions below expect two 128-bit loads: one at offset 0 and
; a second at offset 15, which overlaps the first by one byte so that together
; they cover bytes 0..30. The first 16 bytes are written with four
; buffer_store_dword instructions. The data from the second load starts at
; the odd offset 15 and is written one byte at a time (offsets 15..30) using
; buffer_store_byte, buffer_store_byte_d16_hi, and v_lshrrev_b32 shifts by 8
; and 24 to extract the remaining bytes of each dword.
define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy from addrspace(4) to addrspace(5); both pointers are 16-byte
; aligned. The assertions below expect two 128-bit loads (global_load_dwordx4
; at offsets 0 and 16) under one s_clause. After s_waitcnt vmcnt(1) the first
; 16 bytes are written with four buffer_store_dword instructions; after
; s_waitcnt vmcnt(0) the second 16 bytes are written with four more.
define void @memcpy_p5_p4_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p4_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy where both source and destination are in addrspace(5) and
; only 1-byte aligned. The assertions below expect the copy to be expanded
; into sixteen byte loads (buffer_load_ubyte, offsets 15 down to 0) under one
; s_clause, followed by sixteen buffer_store_byte instructions at the same
; offsets, each preceded by an s_waitcnt whose vmcnt counts down from 15 to 0.
define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy where both source and destination are in addrspace(5) and
; only 1-byte aligned. The assertions below expect 31 byte loads split across
; two s_clause groups rather than one:
;   1. eighteen buffer_load_ubyte (offsets 14 down to 0, then 15, 16, 17),
;      followed by the stores for offsets 14 down to 0;
;   2. thirteen buffer_load_ubyte (offsets 30 down to 18) that reuse
;      v2..v13 and v1, followed by the stores for offsets 17, 16, 30 down
;      to 18, and finally 15.
; Every byte is written with buffer_store_byte, and s_waitcnt vmcnt values
; are interleaved with the stores.
define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x11
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_clause 0xc
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
  ret void
}

define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x11
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_clause 0xd
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy between two addrspace(5) pointers, both annotated align 2.
; Expected gfx1030 output: a single clause of eight 16-bit buffer loads
; (offsets 14 down to 0) followed by eight 16-bit buffer stores, each store
; preceded by an s_waitcnt whose vmcnt steps down from 7 to 0.
define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy between two addrspace(5) pointers, both annotated align 2.
; Expected gfx1030 output: one clause of sixteen loads (a single byte load for
; offset 30 plus fifteen 16-bit loads for offsets 28 down to 0), followed by
; the matching byte/short stores, each preceded by an s_waitcnt whose vmcnt
; steps down from 15 to 0.
define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy between two addrspace(5) pointers, both annotated align 2.
; Expected gfx1030 output: one clause of sixteen 16-bit buffer loads (offsets
; 30 down to 0) followed by sixteen 16-bit buffer stores, each preceded by an
; s_waitcnt whose vmcnt steps down from 15 to 0.
define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0xf
; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy between two addrspace(5) pointers, both annotated align 8.
; Expected gfx1030 output: one clause of four dword buffer loads (offsets 0,
; 4, 8, 12) followed by four dword buffer stores, each preceded by an
; s_waitcnt whose vmcnt steps down from 3 to 0.
define void @memcpy_p5_p5_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz16_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy between two addrspace(5) pointers, both annotated align 8.
; Expected gfx1030 output: one clause of twenty loads - four dword loads for
; offsets 0..12 and sixteen single-byte loads for offsets 15..30 - followed by
; the matching stores, each preceded by an s_waitcnt whose vmcnt steps down
; from 19 to 0.
; NOTE(review): byte offset 15 is covered twice (by the dword at offset 12 and
; by the byte access at offset 15), and the tail is copied bytewise. The
; assertions record current llc output rather than an ideal expansion -
; confirm before treating this sequence as intended.
define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy between two addrspace(5) pointers, both annotated align 8.
; Expected gfx1030 output: one clause of eight dword buffer loads (offsets
; 16..28 first, then 0..12) followed by eight dword buffer stores in the same
; order, each preceded by an s_waitcnt whose vmcnt steps down from 7 to 0.
define void @memcpy_p5_p5_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz32_align_8_8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
  ret void
}

; 16-byte memcpy between two addrspace(5) pointers, both annotated align 16.
; Expected gfx1030 output: one clause of four dword buffer loads (offsets 0,
; 4, 8, 12) followed by four dword buffer stores, each preceded by an
; s_waitcnt whose vmcnt steps down from 3 to 0. The instruction sequence is
; the same as the one expected for the align-8 variant of this size.
define void @memcpy_p5_p5_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz16_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
  ret void
}

; 31-byte memcpy between two addrspace(5) pointers, both annotated align 16.
; Expected gfx1030 output: one clause of twenty loads - four dword loads for
; offsets 0..12 and sixteen single-byte loads for offsets 15..30 - followed by
; the matching stores, each preceded by an s_waitcnt whose vmcnt steps down
; from 19 to 0.
; NOTE(review): byte offset 15 is covered twice (by the dword at offset 12 and
; by the byte access at offset 15), and the tail is copied bytewise. The
; assertions record current llc output rather than an ideal expansion -
; confirm before treating this sequence as intended.
define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x13
; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
; CHECK-NEXT:    s_waitcnt vmcnt(18)
; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
; CHECK-NEXT:    s_waitcnt vmcnt(17)
; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(15)
; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
; CHECK-NEXT:    s_waitcnt vmcnt(14)
; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT:    s_waitcnt vmcnt(12)
; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(11)
; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
; CHECK-NEXT:    s_waitcnt vmcnt(10)
; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT:    s_waitcnt vmcnt(8)
; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
  ret void
}

; 32-byte memcpy between two addrspace(5) pointers, both annotated align 16.
; Expected gfx1030 output: one clause of eight dword buffer loads (offsets
; 16..28 first, then 0..12) followed by eight dword buffer stores in the same
; order, each preceded by an s_waitcnt whose vmcnt steps down from 7 to 0.
define void @memcpy_p5_p5_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
; CHECK-LABEL: memcpy_p5_p5_sz32_align_16_16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_clause 0x7
; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(6)
; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
  ret void
}

; Declarations of the llvm.memcpy intrinsic overloads used by this test: one
; per (destination, source) address-space pair, with destinations 0, 1, 3, 5
; and sources 0, 1, 3, 4, 5. All take an i64 length and an immediate i1
; volatile flag.
declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2

; Attribute group applied to every llvm.memcpy declaration in this file; the
; declarations reference it by the ID #2, so the definition must use that ID.
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }