; llvm/llvm/test/Transforms/SimpleLoopUnswitch/AMDGPU/nontrivial-unswitch-divergent-target.ll

; RUN: opt -mtriple=amdgcn-- -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-- -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-- -passes='simple-loop-unswitch<nontrivial>' -verify-memoryssa -S < %s | FileCheck %s

; External callees without bodies. Each arm of the loops below calls a
; different one (@a in loop_a, @b in loop_b_a, @c in loop_b_b), which keeps
; the arms distinguishable in the transformed output.
declare i32 @a()
declare i32 @b()
declare i32 @c()

; Non-trivial loop unswitching candidate: the loop branches on two distinct
; loop-invariant conditions, %cond1 (in loop_begin) and %cond2 (in loop_b),
; both of which are plain function arguments. The conditions are divergent,
; so the loop must not be unswitched.
; NOTE(review): presumably the arguments count as divergent because this
; function is not an amdgpu_kernel entry point -- confirm against the AMDGPU
; divergence rules.
;
; The expectations below mirror the input block for block: entry must still
; jump straight to loop_begin, and both conditional branches must remain
; inside the loop.
define void @test1(ptr %ptr, i1 %cond1, i1 %cond2) {
; CHECK-LABEL: @test1(
entry:
  br label %loop_begin
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label %loop_begin

; First candidate condition: %cond1 selects between the @a arm and the
; nested branch in loop_b.
loop_begin:
  br i1 %cond1, label %loop_a, label %loop_b
; CHECK: loop_begin:
; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b

loop_a:
  %unused.a = call i32 @a()
  br label %latch
; CHECK: loop_a:
; CHECK-NEXT: %unused.a = call i32 @a()
; CHECK-NEXT: br label %latch

; Second candidate condition: %cond2 selects between the @b and @c arms.
loop_b:
  br i1 %cond2, label %loop_b_a, label %loop_b_b
; CHECK: loop_b:
; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b

loop_b_a:
  %unused.b = call i32 @b()
  br label %latch
; CHECK: loop_b_a:
; CHECK-NEXT: %unused.b = call i32 @b()
; CHECK-NEXT: br label %latch

loop_b_b:
  %unused.c = call i32 @c()
  br label %latch
; CHECK: loop_b_b:
; CHECK-NEXT: %unused.c = call i32 @c()
; CHECK-NEXT: br label %latch

; The back-edge condition is loaded from %ptr on every iteration.
latch:
  %v = load i1, ptr %ptr
  br i1 %v, label %loop_begin, label %loop_exit
; CHECK: latch:
; CHECK-NEXT: %v = load i1, ptr %ptr
; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit

loop_exit:
  ret void
; CHECK: loop_exit:
; CHECK-NEXT: ret void
}

; Non-trivial loop unswitching where there are two distinct loop-invariant
; conditions to unswitch within the loop (%cond1 in loop_begin, %cond2 in
; loop_b). The body is the same as @test1; the only difference is the
; amdgpu_kernel calling convention. The conditions are known to be uniform,
; so the loop should be unswitchable. However, unswitch currently does not
; make use of UniformityAnalysis, so the expectations below pin today's
; behaviour: the loop comes out unchanged.
; NOTE(review): if unswitch starts consulting UniformityAnalysis, these
; expectations are the ones that will need updating.
define amdgpu_kernel void @test1_uniform(ptr %ptr, i1 %cond1, i1 %cond2) {
; CHECK-LABEL: @test1_uniform(
entry:
  br label %loop_begin
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label %loop_begin

; First candidate condition: %cond1 selects between the @a arm and the
; nested branch in loop_b.
loop_begin:
  br i1 %cond1, label %loop_a, label %loop_b
; CHECK: loop_begin:
; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b

loop_a:
  %unused.a = call i32 @a()
  br label %latch
; CHECK: loop_a:
; CHECK-NEXT: %unused.a = call i32 @a()
; CHECK-NEXT: br label %latch

; Second candidate condition: %cond2 selects between the @b and @c arms.
loop_b:
  br i1 %cond2, label %loop_b_a, label %loop_b_b
; CHECK: loop_b:
; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b

loop_b_a:
  %unused.b = call i32 @b()
  br label %latch
; CHECK: loop_b_a:
; CHECK-NEXT: %unused.b = call i32 @b()
; CHECK-NEXT: br label %latch

loop_b_b:
  %unused.c = call i32 @c()
  br label %latch
; CHECK: loop_b_b:
; CHECK-NEXT: %unused.c = call i32 @c()
; CHECK-NEXT: br label %latch

; The back-edge condition is loaded from %ptr on every iteration.
latch:
  %v = load i1, ptr %ptr
  br i1 %v, label %loop_begin, label %loop_exit
; CHECK: latch:
; CHECK-NEXT: %v = load i1, ptr %ptr
; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit

loop_exit:
  ret void
; CHECK: loop_exit:
; CHECK-NEXT: ret void
}

; Non-trivial loop unswitching where there are two distinct loop-invariant
; conditions to unswitch within the loop (%cond1 in loop_begin, %cond2 in
; loop_b). There is no divergence because the function carries attribute
; group #0, so it is assumed it can only execute with a workgroup of size 1.
;
; Expected output shape, as pinned by the directives interleaved below:
;   * entry branches on %cond1 to entry.split.us / entry.split;
;   * entry.split branches on %cond2 to entry.split.split.us /
;     entry.split.split;
;   * three loop copies remain, each with unconditional branches only and
;     exactly one of the calls to @a, @b or @c;
;   * all three exits funnel back into loop_exit.
; The FileCheck variable V is captured afresh in every loop copy, so the
; name given to the cloned load is not pinned.
define void @test1_single_lane_execution(ptr %ptr, i1 %cond1, i1 %cond2) #0 {
; CHECK-LABEL: @test1_single_lane_execution(
entry:
  br label %loop_begin
; The branch on %cond1 is expected to be hoisted out of the loop into entry.
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 %cond1, label %entry.split.us, label %entry.split

loop_begin:
  br i1 %cond1, label %loop_a, label %loop_b

loop_a:
  call i32 @a()
  br label %latch
; The 'loop_a' unswitched loop.
;
; CHECK:       entry.split.us:
; CHECK-NEXT:    br label %loop_begin.us
;
; CHECK:       loop_begin.us:
; CHECK-NEXT:    br label %loop_a.us
;
; CHECK:       loop_a.us:
; CHECK-NEXT:    call i32 @a()
; CHECK-NEXT:    br label %latch.us
;
; CHECK:       latch.us:
; CHECK-NEXT:    %[[V:.*]] = load i1, ptr %ptr
; CHECK-NEXT:    br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us
;
; CHECK:       loop_exit.split.us:
; CHECK-NEXT:    br label %loop_exit

loop_b:
  br i1 %cond2, label %loop_b_a, label %loop_b_b
; The second unswitched condition.
;
; CHECK:       entry.split:
; CHECK-NEXT:    br i1 %cond2, label %entry.split.split.us, label %entry.split.split

loop_b_a:
  call i32 @b()
  br label %latch
; The 'loop_b_a' unswitched loop.
;
; CHECK:       entry.split.split.us:
; CHECK-NEXT:    br label %loop_begin.us1
;
; CHECK:       loop_begin.us1:
; CHECK-NEXT:    br label %loop_b.us
;
; CHECK:       loop_b.us:
; CHECK-NEXT:    br label %loop_b_a.us
;
; CHECK:       loop_b_a.us:
; CHECK-NEXT:    call i32 @b()
; CHECK-NEXT:    br label %latch.us2
;
; CHECK:       latch.us2:
; CHECK-NEXT:    %[[V:.*]] = load i1, ptr %ptr
; CHECK-NEXT:    br i1 %[[V]], label %loop_begin.us1, label %loop_exit.split.split.us
;
; CHECK:       loop_exit.split.split.us:
; CHECK-NEXT:    br label %loop_exit.split

loop_b_b:
  call i32 @c()
  br label %latch
; The 'loop_b_b' unswitched loop. This copy keeps the original block names.
;
; CHECK:       entry.split.split:
; CHECK-NEXT:    br label %loop_begin
;
; CHECK:       loop_begin:
; CHECK-NEXT:    br label %loop_b
;
; CHECK:       loop_b:
; CHECK-NEXT:    br label %loop_b_b
;
; CHECK:       loop_b_b:
; CHECK-NEXT:    call i32 @c()
; CHECK-NEXT:    br label %latch
;
; CHECK:       latch:
; CHECK-NEXT:    %[[V:.*]] = load i1, ptr %ptr
; CHECK-NEXT:    br i1 %[[V]], label %loop_begin, label %loop_exit.split.split
;
; CHECK:       loop_exit.split.split:
; CHECK-NEXT:    br label %loop_exit.split

; The back-edge condition is loaded from %ptr on every iteration.
latch:
  %v = load i1, ptr %ptr
  br i1 %v, label %loop_begin, label %loop_exit

loop_exit:
  ret void
; Both loops produced by the %cond2 split exit through loop_exit.split before
; reaching the common loop_exit.
; CHECK:       loop_exit.split:
; CHECK-NEXT:    br label %loop_exit
;
; CHECK:       loop_exit:
; CHECK-NEXT:    ret
}

; Attribute group used by @test1_single_lane_execution: restricts the flat
; work-group size to 1 (the "1,1" value is presumably a min,max pair --
; confirm against the AMDGPU usage documentation).
attributes #0 = { "amdgpu-flat-work-group-size"="1,1" }