llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/irreducible-2.ll

; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s

define amdgpu_kernel void @cycle_diverge_enter(i32 %n, i32 %a, i32 %b) #0 {
;      entry(div)
;      /   \
;     H <-> B
;           |
;           X
; CHECK-LABEL: for function 'cycle_diverge_enter':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.
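;
; A goto-based C sketch of the CFG above (illustrative only; tid stands in
; for the workitem id, and the local variable names are hypothetical):
;
;   void cycle_diverge_enter(int tid, int a, int b) {
;     int h = 0, v;
;     if (tid < 0) { v = 1; goto B; }  // divergent branch into the cycle
;   H:  // h is %div.merge.h: 0 from entry vs. b from B -> divergent merge
;     v = a;
;   B:  // v is %div.merge.b: a from H vs. 1 from entry -> divergent merge
;     if (tid < 0) goto X;             // divergent exit
;     h = b;
;     goto H;
;   X:
;     (void)(v + 1);                   // %div.use
;   }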

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %div.cond, label %B, label %H  ; divergent branch

H:
  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %B ]
  br label %B
; CHECK: DIVERGENT: %div.merge.h

B:
  %div.merge.b = phi i32 [ %a, %H ], [ 1, %entry ]
  %div.cond.b = icmp sgt i32 %div.merge.b, 0
  %div.b.inc = add i32 %b, 1
  br i1 %div.cond, label %X, label %H ; divergent branch
; CHECK: DIVERGENT: %div.merge.b

X:
  %div.use = add i32 %div.merge.b, 1
  ret void
; CHECK: DIVERGENT: %div.use =

}

define amdgpu_kernel void @cycle_diverge_exit(i32 %n, i32 %a, i32 %b) #0 {
;      entry
;      /   \
;     H <-> B(div)
;           |
;           X
;
; CHECK-LABEL: for function 'cycle_diverge_exit':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.
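;
; Divergence only appears at the exit branch in B, so the loop-carried
; values stay uniform inside the cycle. %div.use is still divergent: lanes
; leave through the divergent exit on different iterations, so they observe
; different values of %uni.merge.b in X (temporal divergence). A minimal
; sketch of that effect (illustrative only; lane_done() is a hypothetical
; divergent predicate):
;
;   int i = 0;
;   do {
;     i = i + 1;       // uniform on every iteration inside the loop
;   } while (!lane_done());
;   use(i + 1);        // divergent: each lane exited on its own iteration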

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %B, label %H

H:
  %uni.merge.h = phi i32 [ 0, %entry ], [ %b, %B ]
  br label %B

B:
  %uni.merge.b = phi i32 [ %a, %H ], [ 1, %entry ]
  %uni.cond.b = icmp sgt i32 %uni.merge.b, 0
  %uni.b.inc = add i32 %b, 1
  br i1 %div.cond, label %X, label %H ; divergent branch

X:
  %div.use = add i32 %uni.merge.b, 1
  ret void
; CHECK: DIVERGENT: %div.use =
}

define amdgpu_kernel void @cycle_reentrance(i32 %n, i32 %a, i32 %b) #0 {
; For this case, threads that enter the cycle through C take C->D->H. At H,
; diverged threads may keep looping in the cycle (H-B-D) until all threads
; have left it, which causes temporal divergence on the exiting edge H->C.
; We currently do not analyze this kind of inner-cycle temporal divergence.
; Instead, we conservatively mark all values in the cycle as divergent.
;      entry--\
;       |     |
;  ---> H(div)|
;  |   / \    /
;  |  B   C<--
;  ^   \ /
;  \----D
;       |
;       X
; CHECK-LABEL: for function 'cycle_reentrance':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.
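;
; A goto-based C sketch of the reentrant path described above (illustrative
; only; the local variable names are hypothetical):
;
;   void cycle_reentrance(int tid, int a, int b) {
;     int h = 0, c, d;
;     if (!(a < 0)) { c = 0; goto C; }  // uniform split at entry
;   H:
;     if (tid < 0) goto B;              // divergent branch
;     c = h;                            // edge H->C: temporal divergence
;   C:
;     d = c + 2;                        // %div.inc.c
;     goto D;
;   B:
;     d = h + 1;                        // %div.inc.b
;   D:
;     if (a < 0) return;                // X
;     h = b;
;     goto H;
;   }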

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %H, label %C

H:
  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
  br i1 %div.cond, label %B, label %C  ; divergent branch

B:
  %div.inc.b = add i32 %div.merge.h, 1
; CHECK: DIVERGENT: %div.inc.b
  br label %D

C:
  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
  %div.inc.c = add i32 %div.merge.c, 2
; CHECK: DIVERGENT: %div.inc.c
  br label %D

D:
  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H

X:
  ret void
}

define amdgpu_kernel void @cycle_reentrance2(i32 %n, i32 %a, i32 %b) #0 {
; This is mostly the same as cycle_reentrance; the only difference is the
; successor order, and thus the DFS visiting order. This makes sure the
; uniformity analysis is correct under a different DFS order.
;      entry--\
;       |     |
;  ---> H(div)|
;  |   / \    /
;  |  B   C<--
;  ^   \ /
;  \----D
;       |
;       X
; CHECK-LABEL: for function 'cycle_reentrance2':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %C, label %H

H:
  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
  br i1 %div.cond, label %B, label %C  ; divergent branch

B:
  %div.inc.b = add i32 %div.merge.h, 1
; CHECK: DIVERGENT: %div.inc.b
  br label %D

C:
  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
  %div.inc.c = add i32 %div.merge.c, 2
; CHECK: DIVERGENT: %div.inc.c
  br label %D

D:
  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H

X:
  ret void
}

define amdgpu_kernel void @cycle_join_dominated_by_diverge(i32 %n, i32 %a, i32 %b) #0 {
; The join node D is dominated by the divergence point H2.
;      entry
;       | |
;  --> H1 |
;  |     \|
;  |      H2(div)
;  |     / \
;  |    B   C
;  ^     \ /
;  \------D
;         |
;         X
; CHECK-LABEL: for function 'cycle_join_dominated_by_diverge':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.
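;
; Every path to the join D passes through the divergent branch in H2, so
; divergence is confined to the value merged at D; the loop-carried %uni.*
; values stay uniform. A structured sketch of the confined case
; (illustrative only):
;
;   int x = 0;        // models %uni.merge.h2: uniform
;   int y;
;   if (tid < 0)      // divergent branch that dominates the join
;     y = x + 1;      // B
;   else
;     y = x + 2;      // C
;   // y (%div.merge.d) is divergent at the join; x remains uniform.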

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %H1, label %H2

H1:
  %uni.merge.h1 = phi i32 [ 0, %entry ], [ %b, %D ]
  br label %H2

H2:
  %uni.merge.h2 = phi i32 [ 0, %entry ], [ %b, %H1 ]
  br i1 %div.cond, label %B, label %C  ; divergent branch

B:
  %uni.inc.b = add i32 %uni.merge.h2, 1
  br label %D

C:
  %uni.inc.c = add i32 %uni.merge.h2, 2
  br label %D

D:
  %div.merge.d = phi i32 [ %uni.inc.b, %B ], [ %uni.inc.c, %C ]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H1

X:
  ret void
}

define amdgpu_kernel void @cycle_join_dominated_by_entry(i32 %n, i32 %a, i32 %b) #0 {
; The join node D is dominated by the cycle entry H2.
;      entry
;       | |
;  --> H1 |
;  |     \|
;  |      H2 -----
;  |      |      |
;  |      A(div) |
;  |     / \     v
;  |    B   C   /
;  ^     \ /   /
;  \------D <-/
;         |
;         X
; CHECK-LABEL: for function 'cycle_join_dominated_by_entry':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.
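;
; Here the divergent branch in A does not dominate the join D (H2 reaches D
; directly), but the cycle entry H2 does, so divergence is still confined
; to the value merged at D. A structured sketch (illustrative only):
;
;   int x = 0, y;     // x models %uni.merge.h2
;   if (a < 0) {      // uniform branch at H2
;     if (tid < 0)    // divergent branch in A
;       y = x + 1;    // B
;     else
;       y = x + 2;    // C
;   } else {
;     y = x;          // direct edge H2->D
;   }
;   // y (%div.merge.d) is divergent; x stays uniform.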

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %H1, label %H2

H1:
  %uni.merge.h1 = phi i32 [ 0, %entry ], [ %b, %D ]
  br label %H2

H2:
  %uni.merge.h2 = phi i32 [ 0, %entry ], [ %b, %H1 ]
  br i1 %uni.cond, label %A, label %D

A:
  br i1 %div.cond, label %B, label %C  ; divergent branch

B:
  %uni.inc.b = add i32 %uni.merge.h2, 1
  br label %D

C:
  %uni.inc.c = add i32 %uni.merge.h2, 2
  br label %D

D:
  %div.merge.d = phi i32 [ %uni.inc.b, %B ], [ %uni.inc.c, %C ], [%uni.merge.h2, %H2]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H1

X:
  ret void
}

define amdgpu_kernel void @cycle_join_not_dominated(i32 %n, i32 %a, i32 %b) #0 {
; If H is the header, sync-label propagation may stop at the join node D.
; But the join node D is dominated neither by the divergence-starting block
; C nor by any of the entries (H/C), so for now we conservatively mark all
; values in the cycle as divergent.
;      entry
;       |  |
;  ---> H  |
;  |   / \ v
;  |  B<--C(div)
;  ^   \ /
;  \----D
;       |
;       X
; CHECK-LABEL: for function 'cycle_join_not_dominated':
; CHECK-NOT: DIVERGENT: %uni.
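;
; A goto-based C sketch (illustrative only). Both joins B and D merge paths
; coming out of the divergent branch in C, yet neither is dominated by C:
;
;   void cycle_join_not_dominated(int tid, int a, int b) {
;     int h = 0, bv, c, d;
;     if (a < 0) { c = 0; goto C; }  // uniform split at entry
;   H:
;     if (a < 0) { bv = 0; goto B; }
;     c = h;
;   C:
;     d = c + 2;                     // %div.inc.c
;     if (tid < 0) goto D;           // divergent branch
;     bv = b;
;   B:
;     d = bv + 1;                    // %div.inc.b
;   D:
;     if (a < 0) return;             // X
;     h = b;
;     goto H;
;   }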

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %C, label %H

H:
  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
  br i1 %uni.cond, label %B, label %C

B:
  %div.merge.b = phi i32 [ 0, %H ], [ %b, %C ]
  %div.inc.b = add i32 %div.merge.b, 1
; CHECK: DIVERGENT: %div.inc.b
  br label %D

C:
  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
  %div.inc.c = add i32 %div.merge.c, 2
; CHECK: DIVERGENT: %div.inc.c
  br i1 %div.cond, label %D, label %B  ; divergent branch

D:
  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H

X:
  ret void
}

define amdgpu_kernel void @cycle_join_not_dominated2(i32 %n, i32 %a, i32 %b) #0 {
; This is mostly the same as cycle_join_not_dominated; the only difference
; is the DFS visiting order, so the cycle analysis result is different.
;      entry
;       |  |
;  ---> H  |
;  |   / \ v
;  |  B<--C(div)
;  ^   \ /
;  \----D
;       |
;       X
; CHECK-LABEL: for function 'cycle_join_not_dominated2':
; CHECK-NOT: DIVERGENT: %uni.

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %H, label %C

H:
  %div.merge.h = phi i32 [ 0, %entry ], [ %b, %D ]
  br i1 %uni.cond, label %B, label %C

B:
  %div.merge.b = phi i32 [ 0, %H ], [ %b, %C ]
  %div.inc.b = add i32 %div.merge.b, 1
; CHECK: DIVERGENT: %div.inc.b
  br label %D

C:
  %div.merge.c = phi i32 [0, %entry], [%div.merge.h, %H]
  %div.inc.c = add i32 %div.merge.c, 2
; CHECK: DIVERGENT: %div.inc.c
  br i1 %div.cond, label %D, label %B  ; divergent branch

D:
  %div.merge.d = phi i32 [ %div.inc.b, %B ], [ %div.inc.c, %C ]
; CHECK: DIVERGENT: %div.merge.d
  br i1 %uni.cond, label %X, label %H

X:
  ret void
}

define amdgpu_kernel void @natural_loop_two_backedges(i32 %n, i32 %a, i32 %b) #0 {
; FIXME: %uni.merge.h could be recognized as uniform.
; CHECK-LABEL: for function 'natural_loop_two_backedges':
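;
; Both backedges (B->H and C->H) carry the same value %uni.inc, so every
; lane that executes iteration i of H sees the same counter value even if
; the lanes split between the two backedges. A sketch of that reasoning
; (illustrative fragment; tid and a are the divergent/uniform inputs):
;
;   int i = 0;
;   for (;;) {
;     i = i + 1;          // %uni.inc: identical in all lanes per iteration
;     if (tid < 0) {      // divergent choice between the two backedges
;       if (a < 0) break; // C: uniform exit
;     }
;     // both backedges re-enter H with the same i
;   }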

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br label %H

H:
  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ], [ %uni.inc, %C]
  %uni.inc = add i32 %uni.merge.h, 1
  br label %B

B:
  br i1 %div.cond, label %C, label %H

C:
  br i1 %uni.cond, label %X, label %H

X:
  ret void
}

define amdgpu_kernel void @natural_loop_two_backedges2(i32 %n, i32 %a, i32 %b) #0 {
; FIXME: %uni.merge.h could be recognized as uniform.
; CHECK-LABEL: for function 'natural_loop_two_backedges2':

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br label %H

H:
  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ], [ %uni.inc, %C]
  %uni.inc = add i32 %uni.merge.h, 1
  br i1 %uni.cond, label %B, label %D

B:
  br i1 %div.cond, label %C, label %H

C:
  br label %H

D:
  br i1 %uni.cond, label %B, label %X

X:
  ret void
}

define amdgpu_kernel void @cycle_enter_nested(i32 %n, i32 %a, i32 %b) #0 {
;
;   entry(div)
;       |   \
;   --> H1   |
;  /    |    |
;  | -> H2   |
;  | |  |    /
;  | \--B <--
;  ^    |
;  \----C
;       |
;       X
; CHECK-LABEL: for function 'cycle_enter_nested':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %div.cond, label %B, label %H1

H1:
  %div.merge.h1 = phi i32 [ 1, %entry ], [ %b, %C ]
  br label %H2
; CHECK: DIVERGENT: %div.merge.h1

H2:
  %div.merge.h2 = phi i32 [ 2, %B ], [ %a, %H1 ]
; CHECK: DIVERGENT: %div.merge.h2
  br label %B

B:
  %div.merge.b = phi i32 [0, %entry], [%a, %H2]
; CHECK: DIVERGENT: %div.merge.b
  br i1 %uni.cond, label %C, label %H2

C:
  br i1 %uni.cond, label %X, label %H1

X:
  ret void
}

define amdgpu_kernel void @cycle_inner_exit_enter(i32 %n, i32 %a, i32 %b) #0 {
;          entry
;        /      \
;       E1-> A-> E2
;       ^    |    \
;       |    E3-> E4
;       |    ^   /
;       |     \ /
;       C <----B
;       |
;       X
; CHECK-LABEL: for function 'cycle_inner_exit_enter':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %E2, label %E1

E1:
  %div.merge.e1 = phi i32 [ 1, %entry ], [ %b, %C ]
  br label %A
; CHECK: DIVERGENT: %div.merge.e1

A:
  br i1 %uni.cond, label %E2, label %E3

E2:
  %div.merge.e2 = phi i32 [ 2, %entry ], [ %a, %A ]
; CHECK: DIVERGENT: %div.merge.e2
  br label %E4

E3:
  %div.merge.e3 = phi i32 [ 0, %A ], [ %b, %B ]
; CHECK: DIVERGENT: %div.merge.e3
  br label %E4

E4:
  %div.merge.e4 = phi i32 [ 0, %E2 ], [ %a, %E3 ]
; CHECK: DIVERGENT: %div.merge.e4
  br label %B

B:
  br i1 %div.cond, label %C, label %E3

C:
  br i1 %uni.cond, label %X, label %E1

X:
  ret void
}

define amdgpu_kernel void @cycle_inner_exit_enter2(i32 %n, i32 %a, i32 %b) #0 {
; This case is almost the same as cycle_inner_exit_enter, with only a
; different DFS visiting order and thus a different cycle hierarchy.
;          entry
;        /      \
;       E1-> A-> E2
;       ^    |    \
;       |    E3-> E4
;       |    ^   /
;       |     \ /
;       C <----B
;       |
;       X
; CHECK-LABEL: for function 'cycle_inner_exit_enter2':
; CHECK-NOT: DIVERGENT: %uni.
; CHECK-NOT: DIVERGENT: br i1 %uni.

entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %div.cond = icmp slt i32 %tid, 0
  %uni.cond = icmp slt i32 %a, 0
  br i1 %uni.cond, label %E1, label %E2

E1:
  %div.merge.e1 = phi i32 [ 1, %entry ], [ %b, %C ]
  br label %A
; CHECK: DIVERGENT: %div.merge.e1

A:
  br i1 %uni.cond, label %E2, label %E3

E2:
  %div.merge.e2 = phi i32 [ 2, %entry ], [ %a, %A ]
; CHECK: DIVERGENT: %div.merge.e2
  br label %E4

E3:
  %div.merge.e3 = phi i32 [ 0, %A ], [ %b, %B ]
; CHECK: DIVERGENT: %div.merge.e3
  br label %E4

E4:
  %div.merge.e4 = phi i32 [ 0, %E2 ], [ %a, %E3 ]
; CHECK: DIVERGENT: %div.merge.e4
  br label %B

B:
  br i1 %div.cond, label %C, label %E3

C:
  br i1 %uni.cond, label %X, label %E1

X:
  ret void
}

define amdgpu_kernel void @always_uniform() {
; CHECK-LABEL: UniformityInfo for function 'always_uniform':
; CHECK: CYCLES ASSSUMED DIVERGENT:
; CHECK:   depth=1: entries(bb2 bb3)
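;
; llvm.amdgcn.icmp is a ballot-style cross-lane compare: it produces the
; same mask in every active lane, so it is treated as always uniform even
; inside a cycle that is assumed divergent; its users get no such
; exemption. A sketch of the ballot semantics (illustrative only;
; lane_pred() is a hypothetical per-lane predicate):
;
;   uint64_t ballot(void) {
;     uint64_t mask = 0;
;     for (int lane = 0; lane < 64; ++lane)  // conceptually over the wave
;       mask |= (uint64_t)lane_pred(lane) << lane;
;     return mask;                           // identical in every lane
;   }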

bb:
  %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
  %inst1 = icmp ugt i32 %inst, 0
  br i1 %inst1, label %bb3, label %bb2
; CHECK:   DIVERGENT:   %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
; CHECK:   DIVERGENT:   %inst1 = icmp ugt i32 %inst, 0
; CHECK:   DIVERGENT:   br i1 %inst1, label %bb3, label %bb2

bb2:                                              ; preds = %bb3, %bb
  br label %bb3

bb3:                                              ; preds = %bb2, %bb
  %inst4 = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 0)
  %inst5 = trunc i64 %inst4 to i32
  %inst6 = and i32 0, %inst5
  br label %bb2
; CHECK-LABEL: BLOCK bb3
; CHECK-NOT: DIVERGENT: {{.*}} call i64 @llvm.amdgcn.icmp.i64.i16
; CHECK:   DIVERGENT:   %inst5 = trunc i64 %inst4 to i32
; CHECK:   DIVERGENT:   %inst6 = and i32 0, %inst5
}

declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)

declare i64 @llvm.amdgcn.icmp.i64.i16(i16, i16, i32 immarg)

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }