# llvm/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -run-pass register-coalescer -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
#


---
# The COPY of %1.sub0 into %2 can be coalesced based on subregister liveness.
#
# %2 is redefined in bb.1 while %1 is still live out of that block (bb.2 reads
# %1.sub2). The only lane of %1 that the COPY involves is sub0, and bb.2 does
# not read %1.sub0. The expected output therefore has no COPY of sub0: every
# %2 is rewritten as %1.sub0.
name:            subrange_coalesce_liveout
tracksRegLiveness: true
body:             |
  ; GCN-LABEL: name: subrange_coalesce_liveout
  ; GCN: bb.0:
  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN-NEXT:   liveins: $vgpr0_vgpr1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec
  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.1:
  ; GCN-NEXT:   successors: %bb.2(0x80000000)
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.2
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.2:
  ; GCN-NEXT:   dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
  ; GCN-NEXT:   S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.2
    liveins: $vgpr0_vgpr1

    %0:vreg_64 = COPY $vgpr0_vgpr1
    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec
    ; The subregister COPY under test.
    %2:vgpr_32 = COPY %1.sub0
    S_CBRANCH_EXECZ %bb.2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    ; Redefines %2 on this path only (bb.0 can also branch straight to bb.2).
    %2:vgpr_32 = V_AND_B32_e64 %1.sub0, %1.sub1, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    ; Reads %2 together with %1.sub2; %1.sub0 itself is not read here.
    %4:vgpr_32 = V_ADD_U32_e32 %1.sub2, %2, implicit $exec
    S_ENDPGM 0
...

---
# An early-clobber def stops the coalescer from coalescing the COPY.
#
# The control flow matches the first test in this file, but the redefinition
# of %2 in bb.1 is marked early-clobber. The expected output keeps
# "COPY %1.sub0" in bb.0 and leaves %2 as a separate vgpr_32 register.
name:            subrange_coalesce_early_clobber
tracksRegLiveness: true
body:             |
  ; GCN-LABEL: name: subrange_coalesce_early_clobber
  ; GCN: bb.0:
  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN-NEXT:   liveins: $vgpr0_vgpr1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec
  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0
  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.1:
  ; GCN-NEXT:   successors: %bb.2(0x80000000)
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   early-clobber [[COPY1]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.2
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.2:
  ; GCN-NEXT:   dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[COPY1]], implicit $exec
  ; GCN-NEXT:   S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.2
    liveins: $vgpr0_vgpr1

    %0:vreg_64 = COPY $vgpr0_vgpr1
    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec
    ; The subregister COPY under test; it is expected to survive.
    %2:vgpr_32 = COPY %1.sub0
    S_CBRANCH_EXECZ %bb.2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    ; Early-clobber redefinition of %2 that also reads %1.sub0.
    early-clobber %2:vgpr_32 = V_AND_B32_e64 %1.sub0, %1.sub2, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    %4:vgpr_32 = V_ADD_U32_e32 %1.sub2, %2, implicit $exec
    S_ENDPGM 0
...

---
# A lane that is not involved in the conflict (sub1) is redefined; the COPY is
# still coalescable.
#
# bb.1 redefines %2 and then overwrites %1.sub1, a different lane from the
# copied %1.sub0. The expected output has no COPY of sub0: every %2 is
# rewritten as %1.sub0, and the write to %1.sub1 is kept.
name:            subrange_coalesce_unrelated_sub_redefined
tracksRegLiveness: true
body:             |
  ; GCN-LABEL: name: subrange_coalesce_unrelated_sub_redefined
  ; GCN: bb.0:
  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN-NEXT:   liveins: $vgpr0_vgpr1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec
  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.1:
  ; GCN-NEXT:   successors: %bb.2(0x80000000)
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub1:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.2
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.2:
  ; GCN-NEXT:   dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
  ; GCN-NEXT:   S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.2
    liveins: $vgpr0_vgpr1

    %0:vreg_64 = COPY $vgpr0_vgpr1
    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec
    ; The subregister COPY under test.
    %2:vgpr_32 = COPY %1.sub0
    S_CBRANCH_EXECZ %bb.2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    %2:vgpr_32 = V_AND_B32_e64 %1.sub0, %1.sub1, implicit $exec
    ; Partial redefinition of %1 that touches only sub1, not the copied sub0.
    %1.sub1:vreg_128 = V_AND_B32_e64 %2, %2, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    %4:vgpr_32 = V_ADD_U32_e32 %1.sub1, %2, implicit $exec
    S_ENDPGM 0
...

---
# A more complex example showing that a lane conflict can be resolved based on
# subranges.
#
# bb.1 is a loop (it lists itself as a successor). Inside the loop %2 is both
# read and redefined by the same instruction, and %1.sub2 is then redefined
# from %2. The expected output has no COPY of sub0: every %2 is rewritten as
# %1.sub0.
name:            subrange_coalesce_complex_pattern
tracksRegLiveness: true
body:             |
  ; GCN-LABEL: name: subrange_coalesce_complex_pattern
  ; GCN: bb.0:
  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN-NEXT:   liveins: $vgpr0_vgpr1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec
  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.1:
  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
  ; GCN-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
  ; GCN-NEXT:   S_BRANCH %bb.2
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.2:
  ; GCN-NEXT:   dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec
  ; GCN-NEXT:   S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.2
    liveins: $vgpr0_vgpr1

    %0:vreg_64 = COPY $vgpr0_vgpr1
    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec
    ; The subregister COPY under test.
    %2:vgpr_32 = COPY %1.sub0
    S_CBRANCH_EXECZ %bb.2, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    ; %2 is read and redefined here, so its value is carried around the loop.
    %2:vgpr_32 = V_AND_B32_e64 %1.sub1, %2, implicit $exec
    ; Partial redefinition of %1 that touches only sub2, not the copied sub0.
    %1.sub2:vreg_128 = V_AND_B32_e64 %2, %2, implicit $exec
    S_CBRANCH_EXECZ %bb.1, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    ; Only sub1 and sub2 of %1 are read after the loop; %2 is not read here.
    %4:vgpr_32 = V_ADD_U32_e32 %1.sub1, %1.sub2, implicit $exec
    S_ENDPGM 0

...