; llvm/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s

; This shouldn't be too different from wave32, so we'll only test one case.

; @basic uses the amdgpu_cs_chain calling convention. Its entry block calls
; llvm.amdgcn.init.whole.wave and branches on the result. The %shader block
; mixes an ordinary add with a set.inactive -> icmp -> ballot -> strict.wwm
; sequence; %tail merges both paths and transfers to %callee through
; llvm.amdgcn.cs.chain. The %y argument is not referenced in the body.
define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 inreg %exec, { i32, ptr addrspace(5), i32, i64 } %vgpr, i32 %x, i32 %y) {
; GISEL12-LABEL: basic:
; GISEL12:       ; %bb.0: ; %entry
; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT:    s_wait_expcnt 0x0
; GISEL12-NEXT:    s_wait_samplecnt 0x0
; GISEL12-NEXT:    s_wait_bvhcnt 0x0
; GISEL12-NEXT:    s_wait_kmcnt 0x0
; GISEL12-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GISEL12-NEXT:    s_mov_b32 s8, s3
; GISEL12-NEXT:    s_mov_b32 s9, s4
; GISEL12-NEXT:    s_mov_b32 s4, s5
; GISEL12-NEXT:    s_mov_b32 s5, s6
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_and_saveexec_b64 s[6:7], s[10:11]
; GISEL12-NEXT:  ; %bb.1: ; %shader
; GISEL12-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL12-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, v0
; GISEL12-NEXT:    v_mov_b32_e32 v0, s12
; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GISEL12-NEXT:    v_mov_b32_e32 v1, s13
; GISEL12-NEXT:    s_mov_b64 exec, s[10:11]
; GISEL12-NEXT:    v_mov_b32_e32 v11, v0
; GISEL12-NEXT:    v_add_nc_u32_e32 v10, 42, v13
; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GISEL12-NEXT:    v_mov_b32_e32 v12, v1
; GISEL12-NEXT:  ; %bb.2: ; %tail
; GISEL12-NEXT:    s_or_b64 exec, exec, s[6:7]
; GISEL12-NEXT:    s_mov_b64 exec, s[4:5]
; GISEL12-NEXT:    s_wait_alu 0xfffe
; GISEL12-NEXT:    s_setpc_b64 s[8:9]
;
; DAGISEL12-LABEL: basic:
; DAGISEL12:       ; %bb.0: ; %entry
; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT:    s_wait_expcnt 0x0
; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
; DAGISEL12-NEXT:    s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT:    s_mov_b32 s7, s6
; DAGISEL12-NEXT:    s_mov_b32 s6, s5
; DAGISEL12-NEXT:    s_mov_b32 s5, s4
; DAGISEL12-NEXT:    s_mov_b32 s4, s3
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; DAGISEL12-NEXT:  ; %bb.1: ; %shader
; DAGISEL12-NEXT:    s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL12-NEXT:    s_mov_b64 exec, s[10:11]
; DAGISEL12-NEXT:    v_mov_b32_e32 v11, s12
; DAGISEL12-NEXT:    v_add_nc_u32_e32 v10, 42, v13
; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; DAGISEL12-NEXT:    v_mov_b32_e32 v12, s13
; DAGISEL12-NEXT:  ; %bb.2: ; %tail
; DAGISEL12-NEXT:    s_or_b64 exec, exec, s[8:9]
; DAGISEL12-NEXT:    s_mov_b64 exec, s[6:7]
; DAGISEL12-NEXT:    s_wait_alu 0xfffe
; DAGISEL12-NEXT:    s_setpc_b64 s[4:5]
;
; GISEL10-LABEL: basic:
; GISEL10:       ; %bb.0: ; %entry
; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL10-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GISEL10-NEXT:    s_mov_b32 s8, s3
; GISEL10-NEXT:    s_mov_b32 s9, s4
; GISEL10-NEXT:    s_mov_b32 s4, s5
; GISEL10-NEXT:    s_mov_b32 s5, s6
; GISEL10-NEXT:    s_and_saveexec_b64 s[6:7], s[10:11]
; GISEL10-NEXT:  ; %bb.1: ; %shader
; GISEL10-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; GISEL10-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, v0
; GISEL10-NEXT:    v_mov_b32_e32 v0, s12
; GISEL10-NEXT:    v_mov_b32_e32 v1, s13
; GISEL10-NEXT:    s_mov_b64 exec, s[10:11]
; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v13
; GISEL10-NEXT:    v_mov_b32_e32 v12, v1
; GISEL10-NEXT:  ; %bb.2: ; %tail
; GISEL10-NEXT:    s_or_b64 exec, exec, s[6:7]
; GISEL10-NEXT:    s_mov_b64 exec, s[4:5]
; GISEL10-NEXT:    s_setpc_b64 s[8:9]
;
; DAGISEL10-LABEL: basic:
; DAGISEL10:       ; %bb.0: ; %entry
; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT:    s_or_saveexec_b64 s[10:11], -1
; DAGISEL10-NEXT:    s_mov_b32 s7, s6
; DAGISEL10-NEXT:    s_mov_b32 s6, s5
; DAGISEL10-NEXT:    s_mov_b32 s5, s4
; DAGISEL10-NEXT:    s_mov_b32 s4, s3
; DAGISEL10-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; DAGISEL10-NEXT:  ; %bb.1: ; %shader
; DAGISEL10-NEXT:    s_or_saveexec_b64 s[10:11], -1
; DAGISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; DAGISEL10-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL10-NEXT:    s_mov_b64 exec, s[10:11]
; DAGISEL10-NEXT:    v_mov_b32_e32 v11, s12
; DAGISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v13
; DAGISEL10-NEXT:    v_mov_b32_e32 v12, s13
; DAGISEL10-NEXT:  ; %bb.2: ; %tail
; DAGISEL10-NEXT:    s_or_b64 exec, exec, s[8:9]
; DAGISEL10-NEXT:    s_mov_b64 exec, s[6:7]
; DAGISEL10-NEXT:    s_setpc_b64 s[4:5]
entry:
  ; The i1 result decides whether %shader executes or control goes straight to
  ; %tail. In the expected output above this shows up as s_or_saveexec_b64 with
  ; -1 followed by s_and_saveexec_b64 on the mask that was just saved.
  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %entry_exec, label %shader, label %tail

shader:
  ; Ordinary (non-WWM) work: %x + 42 replaces field 2 of the %vgpr aggregate.
  %nonwwm = add i32 %x, 42
  %vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr, i32 %nonwwm, 2

  ; set.inactive takes %x and the constant 71 (0x47 in the expected
  ; v_cndmask_b32). The result is compared against zero, turned into a 64-bit
  ; ballot, and wrapped in strict.wwm; that i64 replaces field 3 of the aggregate.
  %full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
  %non.zero = icmp ne i32 %full.vgpr, 0
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %non.zero)
  %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %ballot)
  %vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr.1, i64 %wwm, 3

  br label %tail

tail:
  ; %vgpr.args is the untouched %vgpr when arriving from %entry, or the updated
  ; aggregate when arriving from %shader.
  %vgpr.args = phi { i32, ptr addrspace(5), i32, i64} [%vgpr, %entry], [%vgpr.2, %shader]
  ; The chain call receives %callee, %exec, the inreg %sgpr vector, the merged
  ; aggregate and a trailing i32 0. Nothing follows it, so the block ends in
  ; unreachable; every expected-output variant ends in s_setpc_b64.
  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i64 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i64 } %vgpr.args, i32 0)
  unreachable
}