; llvm/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll

; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 --symbolize-operands - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s

; GFX8-NOT: s_inst_prefetch
; GFX8-NOT: .p2align 6

; GCN-LABEL: test_loop_64
; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-DIS-NEXT: {{^$}}
; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]:
; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>:
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc0 [[L1]]
; GFX10-NEXT:     s_endpgm
; Single-block counted loop (1024 iterations) whose body contains exactly one
; s.sleep call. The gfx1030 checks preceding this function expect the loop label
; to follow s_movk_i32 directly (no alignment directive or padding) and
; s_endpgm to follow the back-branch directly (no s_inst_prefetch).
; NOTE(review): going by the function name, this is presumably the "loop fits
; in 64 bytes" case - confirm against the prefetch heuristic in the backend.
define amdgpu_kernel void @test_loop_64(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  ; Induction variable: counts 1..1024, loop exits when it reaches 1024.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  ; Loop payload: one sleep, matched as `s_sleep 0` in the checks.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; GCN-LABEL: test_loop_128
; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L1:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L1:L[0-9]+]]>:
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc0 [[L1]]
; GFX10-NEXT:     s_endpgm
; Single-block counted loop (1024 iterations) whose body contains 16 s.sleep
; calls. The gfx1030 checks preceding this function expect the loop header to
; be aligned (`.p2align 6` in assembly, `s_nop 0` padding in the disassembly)
; right after s_movk_i32, with no s_inst_prefetch before the loop label and
; s_endpgm directly after the back-branch.
; NOTE(review): the exact byte size of the loop depends on instruction
; encodings not visible here; the name suggests the "up to 128 bytes" case.
define amdgpu_kernel void @test_loop_128(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  ; Induction variable: counts 1..1024, loop exits when it reaches 1024.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  ; Loop payload: 16 sleeps. The count is significant - it sets the loop size.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; GCN-LABEL: test_loop_192
; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-NEXT:     s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L1:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L1:L[0-9]+]]>:
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc0 [[L1]]
; GFX10-NEXT:     s_inst_prefetch 0x2
; GFX10-NEXT:     s_endpgm
; Single-block counted loop (1024 iterations) whose body contains 34 s.sleep
; calls. The gfx1030 checks preceding this function expect:
;   - `s_inst_prefetch 0x1` immediately after s_movk_i32,
;   - an aligned loop header (`.p2align 6` / `s_nop 0` padding),
;   - no further s_inst_prefetch before the loop label,
;   - `s_inst_prefetch 0x2` immediately after the back-branch, then s_endpgm.
; NOTE(review): the name suggests the "up to 192 bytes" case; the byte size
; itself is not derivable from this file alone.
define amdgpu_kernel void @test_loop_192(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  ; Induction variable: counts 1..1024, loop exits when it reaches 1024.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  ; Loop payload: 34 sleeps. The count is significant - it sets the loop size.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; GCN-LABEL: test_loop_256
; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-DIS-NEXT: {{^$}}
; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]:
; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>:
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc0 [[L1]]
; GFX10-NEXT:     s_endpgm
; Single-block counted loop (1024 iterations) whose body contains 50 s.sleep
; calls. The gfx1030 checks preceding this function expect the same shape as
; for the one-sleep loop: the loop label follows s_movk_i32 directly (no
; alignment or padding) and s_endpgm follows the back-branch directly (no
; s_inst_prefetch).
; NOTE(review): presumably the loop is too large for the prefetch/alignment
; treatment to apply - confirm the size threshold in the backend.
define amdgpu_kernel void @test_loop_256(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  ; Induction variable: counts 1..1024, loop exits when it reaches 1024.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  ; Loop payload: 50 sleeps. The count is significant - it sets the loop size.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; GCN-LABEL: test_loop_prefetch_inner_outer
; GFX10:          s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L1:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L1:L[0-9]+]]>:
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      .p2align 6
; GFX10-DIS:      s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L2:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L2:L[0-9]+]]>:
; GFX10-NOT:      s_inst_prefetch
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc{{[01]}} [[L2]]
; GFX10-NOT:      s_inst_prefetch
; GFX10:          s_cbranch_scc{{[01]}} [[L1]]
; GFX10-NEXT:     s_inst_prefetch 0x2
; GFX10-NEXT:     s_endpgm
; Two-level loop nest: the outer loop spans %bb2..%bb4 and the inner loop is
; the self-looping block %bb3, which contains 34 s.sleep calls. Both loops run
; 1024 iterations. The gfx1030 checks preceding this function expect:
;   - a single `s_inst_prefetch 0x1` placed before the first aligned loop
;     header (i.e. outside both loops),
;   - both loop headers aligned (`.p2align 6` / `s_nop 0` padding),
;   - no s_inst_prefetch anywhere inside the nest,
;   - `s_inst_prefetch 0x2` immediately after the outer back-branch, followed
;     by s_endpgm.
define amdgpu_kernel void @test_loop_prefetch_inner_outer(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb4
  ret void

bb2:                                              ; preds = %bb4, %bb
  ; Outer induction variable: counts 1..1024; tested in %bb4.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  br label %bb3

bb3:                                              ; preds = %bb3, %bb2
  ; Inner induction variable: restarts at 0 for every outer iteration.
  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
  %tmp5 = add nuw nsw i32 %tmp4, 1
  %tmp6 = icmp eq i32 %tmp5, 1024
  ; Inner loop payload: 34 sleeps. The count is significant - it sets the size.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp6, label %bb4, label %bb3

bb4:                                              ; preds = %bb3
  ; Outer loop latch: uses the exit condition computed in %bb2.
  br i1 %tmp3, label %bb1, label %bb2
}

; GCN-LABEL: test_loop_prefetch_inner_outer_noouter
; GFX10-NOT:      .p2align 6
; GFX10-NOT:      s_nop
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L0:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L0:L[0-9]+]]>:
; GFX10:          s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L1:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L1:L[0-9]+]]>:
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      .p2align 6
; GFX10-DIS:      s_nop 0
; GFX10-NOT:      s_inst_prefetch
; GFX10-ASM:      [[L2:.LBB[0-9_]+]]:
; GFX10-DIS:      <[[L2:L[0-9]+]]>:
; GFX10-NOT:      s_inst_prefetch
; GFX10:          s_sleep 0
; GFX10:          s_cbranch_scc{{[01]}} [[L2]]
; GFX10-NOT:      s_inst_prefetch
; GFX10:          s_cbranch_scc{{[01]}} [[L1]]
; GFX10-NEXT:     s_inst_prefetch 0x2
; GFX10:          s_cbranch_scc{{[01]}} [[L0]]
; GFX10-NEXT:     s_endpgm
; Three-level loop nest, each level running 1024 iterations:
;   - outermost: %bb2..%bb6, with 16 extra s.sleep calls in its latch %bb6,
;   - middle:    %bb3..%bb5,
;   - innermost: the self-looping block %bb4 with 34 s.sleep calls.
; The gfx1030 checks preceding this function expect the outermost loop header
; to get no alignment, padding or prefetch before it; `s_inst_prefetch 0x1` is
; expected inside the outermost loop, ahead of the aligned middle and innermost
; headers, and `s_inst_prefetch 0x2` right after the middle loop's back-branch,
; before the outermost back-branch and s_endpgm.
; NOTE(review): presumably the outermost loop is excluded because the extra
; sleeps in %bb6 make it too large - confirm against the backend heuristic.
define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(ptr addrspace(1) nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb6
  ret void

bb2:                                              ; preds = %bb6, %bb
  ; Outermost induction variable: counts 1..1024; tested in %bb6.
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  br label %bb3

bb3:                                              ; preds = %bb5, %bb2
  ; Middle induction variable: counts 1..1024; tested in %bb5.
  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ]
  %tmp5 = add nuw nsw i32 %tmp4, 1
  %tmp6 = icmp eq i32 %tmp5, 1024
  br label %bb4

bb4:                                              ; preds = %bb4, %bb3
  ; Innermost induction variable: restarts at 0 for every middle iteration.
  %tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ]
  %tmp8 = add nuw nsw i32 %tmp7, 1
  %tmp9 = icmp eq i32 %tmp8, 1024
  ; Innermost payload: 34 sleeps. The count is significant - it sets the size.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp9, label %bb5, label %bb4

bb5:                                              ; preds = %bb4
  ; Middle loop latch: uses the exit condition computed in %bb3.
  br i1 %tmp6, label %bb6, label %bb3

bb6:                                              ; preds = %bb5
  ; Outermost loop latch: 16 additional sleeps executed once per outermost
  ; iteration, after the middle loop has finished.
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; AMDGPU sleep intrinsic. Every loop body in this file calls it with operand 0,
; and the checks locate each loop body via the resulting `s_sleep 0`.
declare void @llvm.amdgcn.s.sleep(i32)