; llvm/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s

; Pixel shader that calls llvm.amdgcn.wqm.demote with a constant false
; condition and then exports a select on %arg0 to export target 1.
; The autogenerated assertions below expect exec to be and-not'ed with itself,
; followed by a branch on scc0 to a block that zeroes exec and issues a null
; export before ending the program.
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_andn2_b64 exec, exec, exec
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: static_exact:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: static_exact:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB0_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: static_exact:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB0_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; %c1 has no users in this function; the demote below takes a constant.
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 false)
  ; The exported value depends only on %c0 (1.0 when %arg0 < 0.0, else 0.0).
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}

; Same shape as a constant demote followed by an export, except that the
; demote condition is the per-lane compare %c1 (%arg1 >= 0.0, ordered).
; The assertions below expect exec to be copied to a scalar register, the
; compare result to be xor'ed with exec, that value to be and-not'ed out of the
; copy, a branch on scc0 to the null-export exit block, and otherwise exec to be
; and'ed with the updated copy before the export.
define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; SI-LABEL: dynamic_exact:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; SI-NEXT:    s_mov_b64 s[2:3], exec
; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: dynamic_exact:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: dynamic_exact:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB1_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: dynamic_exact:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB1_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  ; %c0 feeds the exported select; %c1 is the demote condition.
  %c0 = fcmp olt float %arg0, 0.000000e+00
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 %c1)
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}

; Demote placed inside a conditionally executed block.
; Lanes take the %.demote block when bit 0 of (fptosi %arg0 | fptosi %arg1) is
; set; all lanes then reach %.continue and export a select on the same
; condition. The assertions below expect the demote block to and-not exec out
; of a saved copy of the entry exec mask, branch on scc0 to the null-export
; exit block, and otherwise zero exec before rejoining at %.continue.
define amdgpu_ps void @branch(float %arg0, float %arg1) {
; SI-LABEL: branch:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
; SI-NEXT:    s_mov_b64 s[2:3], exec
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; SI-NEXT:    s_cbranch_scc0 .LBB2_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:  .LBB2_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB2_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: branch:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:  .LBB2_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB2_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: branch:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
; GFX10-32-NEXT:    s_cbranch_execz .LBB2_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB2_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:  .LBB2_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB2_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: branch:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; GFX10-64-NEXT:    s_cbranch_execz .LBB2_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB2_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:  .LBB2_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB2_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %i0 = fptosi float %arg0 to i32
  %i1 = fptosi float %arg1 to i32
  %c0 = or i32 %i0, %i1
  %c1 = and i32 %c0, 1
  ; %c2 is true when bit 0 of (%i0 | %i1) is clear; those lanes skip %.demote.
  %c2 = icmp eq i32 %c1, 0
  br i1 %c2, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  ; %c2 is reused here, so the exported value is 1.0 only for lanes that did
  ; not pass through %.demote.
  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}


; Conditional constant-false demote followed by two image samples, where the
; second sample's coordinate is computed from the first sample's result.
; Lanes enter %.demote when %z is not ordered-less-than 0.0.
; The assertions below expect exec to be saved and widened with s_wqm at entry,
; the demote block to and-not exec out of the saved mask and then and exec with
; the s_wqm of that updated mask, and exec to be and'ed with the saved mask
; between the first and second image_sample.
define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_1:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; SI-NEXT:    s_cbranch_execz .LBB3_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; SI-NEXT:    s_cbranch_scc0 .LBB3_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
; SI-NEXT:  .LBB3_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch .LBB3_5
; SI-NEXT:  .LBB3_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB3_5:
;
; GFX9-LABEL: wqm_demote_1:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-NEXT:    s_cbranch_execz .LBB3_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB3_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX9-NEXT:  .LBB3_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch .LBB3_5
; GFX9-NEXT:  .LBB3_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB3_5:
;
; GFX10-32-LABEL: wqm_demote_1:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-32-NEXT:    s_cbranch_execz .LBB3_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB3_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_wqm_b32 s14, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT:  .LBB3_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch .LBB3_5
; GFX10-32-NEXT:  .LBB3_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB3_5:
;
; GFX10-64-LABEL: wqm_demote_1:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT:    s_cbranch_execz .LBB3_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB3_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT:  .LBB3_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch .LBB3_5
; GFX10-64-NEXT:  .LBB3_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB3_5:
.entry:
  ; The branch depends only on the %z argument, so the demote block is placed
  ; ahead of both samples.
  %z.cmp = fcmp olt float %z, 0.0
  br i1 %z.cmp, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0 of %tex, so %coord1 is that element added to
  ; itself; the assertions above accordingly expect a single-channel first
  ; sample (dmask:0x1).
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  %coord1 = fadd float %tex0, %tex1
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}

; Variant of a conditional constant-false demote in which the first image
; sample is issued before the branch and its result drives the branch:
; lanes enter %.demote when element 0 of the sample is not ordered-less-than
; 0.0. A second sample, using a coordinate computed from the first result,
; follows in %.continue.
; The assertions below expect the first image_sample to run under the
; s_wqm-widened exec, the demote block to update the saved mask and and exec
; with its s_wqm, and exec to be and'ed with the saved mask before the second
; image_sample.
define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_2:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; SI-NEXT:    s_cbranch_execz .LBB4_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; SI-NEXT:    s_cbranch_scc0 .LBB4_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
; SI-NEXT:  .LBB4_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch .LBB4_5
; SI-NEXT:  .LBB4_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB4_5:
;
; GFX9-LABEL: wqm_demote_2:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-NEXT:    s_cbranch_execz .LBB4_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB4_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX9-NEXT:  .LBB4_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch .LBB4_5
; GFX9-NEXT:  .LBB4_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB4_5:
;
; GFX10-32-LABEL: wqm_demote_2:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-32-NEXT:    s_cbranch_execz .LBB4_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB4_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_wqm_b32 s14, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT:  .LBB4_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch .LBB4_5
; GFX10-32-NEXT:  .LBB4_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB4_5:
;
; GFX10-64-LABEL: wqm_demote_2:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT:    s_cbranch_execz .LBB4_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB4_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT:  .LBB4_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch .LBB4_5
; GFX10-64-NEXT:  .LBB4_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB4_5:
.entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0 of %tex; %tex0 drives the branch and
  ; %tex0 + %tex1 becomes the second sample's coordinate.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  %z.cmp = fcmp olt float %tex0, 0.0
  br i1 %z.cmp, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  %coord1 = fadd float %tex0, %tex1
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}

; Branch-free variant: the result of the first image sample is compared against
; 0.0 and that compare (%z.cmp) is passed directly to llvm.amdgcn.wqm.demote,
; followed by a second sample whose coordinate is computed from the first.
; The assertions below expect the compare result to be xor'ed with exec and
; and-not'ed out of the saved entry mask, a branch on scc0 to the null-export
; exit block, exec to be and'ed with the s_wqm of the updated mask, and exec to
; be and'ed with the updated mask itself before the second image_sample.
define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_dynamic:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; SI-NEXT:    s_cbranch_scc0 .LBB5_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch .LBB5_3
; SI-NEXT:  .LBB5_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB5_3:
;
; GFX9-LABEL: wqm_demote_dynamic:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch .LBB5_3
; GFX9-NEXT:  .LBB5_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB5_3:
;
; GFX10-32-LABEL: wqm_demote_dynamic:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB5_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_wqm_b32 s13, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch .LBB5_3
; GFX10-32-NEXT:  .LBB5_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB5_3:
;
; GFX10-64-LABEL: wqm_demote_dynamic:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB5_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch .LBB5_3
; GFX10-64-NEXT:  .LBB5_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB5_3:
.entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0 of %tex.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  ; The compare is the demote condition itself; there is no control flow here.
  %z.cmp = fcmp olt float %tex0, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
  %coord1 = fadd float %tex0, %tex1
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}


define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
; SI-LABEL: wqm_deriv:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[0:1], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT:    s_cbranch_execz .LBB6_3
; SI-NEXT:  ; %bb.1: ; %.demote0
; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_cbranch_scc0 .LBB6_7
; SI-NEXT:  ; %bb.2: ; %.demote0
; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
; SI-NEXT:  .LBB6_3: ; %.continue0
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
; SI-NEXT:    s_cbranch_execz .LBB6_6
; SI-NEXT:  ; %bb.4: ; %.demote1
; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_cbranch_scc0 .LBB6_7
; SI-NEXT:  ; %bb.5: ; %.demote1
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:  .LBB6_6: ; %.continue1
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    v_bfrev_b32_e32 v0, 60
; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB6_7:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: wqm_deriv:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; GFX9-NEXT:    s_cbranch_execz .LBB6_3
; GFX9-NEXT:  ; %bb.1: ; %.demote0
; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX9-NEXT:  ; %bb.2: ; %.demote0
; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT:  .LBB6_3: ; %.continue0
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execz .LBB6_6
; GFX9-NEXT:  ; %bb.4: ; %.demote1
; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX9-NEXT:  ; %bb.5: ; %.demote1
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:  .LBB6_6: ; %.continue1
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB6_7:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: wqm_deriv:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
; GFX10-32-NEXT:    s_cbranch_execz .LBB6_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
; GFX10-32-NEXT:    s_wqm_b32 s2, s0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT:  .LBB6_3: ; %.continue0
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    s_mov_b32 s1, s0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
; GFX10-32-NEXT:    s_cbranch_execz .LBB6_6
; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:  .LBB6_6: ; %.continue1
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB6_7:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: wqm_deriv:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-64-NEXT:    s_cbranch_execz .LBB6_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT:  .LBB6_3: ; %.continue0
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
; GFX10-64-NEXT:    s_cbranch_execz .LBB6_6
; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:  .LBB6_6: ; %.continue1
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB6_7:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  %live = call i1 @llvm.amdgcn.live.mask()
  %live.cond = select i1 %live, i32 0, i32 1065353216
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; Pixel-shader test that calls llvm.amdgcn.wqm.demote both ahead of a loop
; (.demote0) and inside the loop body (.demote1). Every iteration reads
; llvm.amdgcn.live.mask, forms the difference of two DPP moves of a value
; derived from it, and passes that difference through llvm.amdgcn.wqm.f32.
;   %arg   - converted to i32; a non-zero result routes through .demote0.
;   %limit - signed upper bound tested against the loop counter.
;   %input / %index - only feed the interp.p1/interp.p2 calls.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[0:1], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT:    s_cbranch_execz .LBB7_3
; SI-NEXT:  ; %bb.1: ; %.demote0
; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_cbranch_scc0 .LBB7_9
; SI-NEXT:  ; %bb.2: ; %.demote0
; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
; SI-NEXT:  .LBB7_3: ; %.continue0.preheader
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    s_mov_b64 s[2:3], 0
; SI-NEXT:    s_branch .LBB7_5
; SI-NEXT:  .LBB7_4: ; %.continue1
; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
; SI-NEXT:    s_add_i32 s6, s6, 1
; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
; SI-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT:    s_cbranch_execz .LBB7_8
; SI-NEXT:  .LBB7_5: ; %.continue0
; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v2, v0
; SI-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
; SI-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
; SI-NEXT:    s_cbranch_execz .LBB7_4
; SI-NEXT:  ; %bb.6: ; %.demote1
; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_cbranch_scc0 .LBB7_9
; SI-NEXT:  ; %bb.7: ; %.demote1
; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
; SI-NEXT:    s_branch .LBB7_4
; SI-NEXT:  .LBB7_8: ; %.return
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
; SI-NEXT:    v_bfrev_b32_e32 v0, 60
; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB7_9:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT:    s_mov_b32 s6, 0
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; GFX9-NEXT:    s_cbranch_execz .LBB7_3
; GFX9-NEXT:  ; %bb.1: ; %.demote0
; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX9-NEXT:  ; %bb.2: ; %.demote0
; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT:  .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_branch .LBB7_5
; GFX9-NEXT:  .LBB7_4: ; %.continue1
; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_add_i32 s6, s6, 1
; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_cbranch_execz .LBB7_8
; GFX9-NEXT:  .LBB7_5: ; %.continue0
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
; GFX9-NEXT:    s_cbranch_execz .LBB7_4
; GFX9-NEXT:  ; %bb.6: ; %.demote1
; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX9-NEXT:  ; %bb.7: ; %.demote1
; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT:    s_wqm_b64 s[8:9], s[0:1]
; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_branch .LBB7_4
; GFX9-NEXT:  .LBB7_8: ; %.return
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB7_9:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT:    s_mov_b32 s1, 0
; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT:    s_cbranch_execz .LBB7_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
; GFX10-32-NEXT:    s_wqm_b32 s3, s0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT:  .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT:    s_mov_b32 s2, 0
; GFX10-32-NEXT:    s_branch .LBB7_5
; GFX10-32-NEXT:  .LBB7_4: ; %.continue1
; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    s_cbranch_execz .LBB7_8
; GFX10-32-NEXT:  .LBB7_5: ; %.continue0
; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT:    s_mov_b32 s3, s0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT:    s_cbranch_execz .LBB7_4
; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT:    s_wqm_b32 s4, s0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT:    s_branch .LBB7_4
; GFX10-32-NEXT:  .LBB7_8: ; %.return
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  .LBB7_9:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT:    s_mov_b32 s6, 0
; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-64-NEXT:    s_cbranch_execz .LBB7_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT:  .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
; GFX10-64-NEXT:    s_branch .LBB7_5
; GFX10-64-NEXT:  .LBB7_4: ; %.continue1
; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT:    s_add_i32 s6, s6, 1
; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    s_cbranch_execz .LBB7_8
; GFX10-64-NEXT:  .LBB7_5: ; %.continue0
; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s6, 0, s[4:5]
; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
; GFX10-64-NEXT:    s_cbranch_execz .LBB7_4
; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT:    s_wqm_b64 s[8:9], s[0:1]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT:    s_branch .LBB7_4
; GFX10-64-NEXT:  .LBB7_8: ; %.return
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  .LBB7_9:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  ; %x1 has no users in this function, and no interp instruction appears in
  ; the expected output above.
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
  ; Skip the pre-loop demote only when (i32)%arg is zero.
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  ; Demote with a literal false condition before entering the loop.
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header: %count starts at 0 from both out-of-loop predecessors and
  ; takes %next on the back edge from .continue1.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  ; Lanes reported live select 0; all others select the loop counter.
  %live = call i1 @llvm.amdgcn.live.mask()
  %live.cond = select i1 %live, i32 0, i32 %count
  ; dpp_ctrl 85 and 0 are printed as quad_perm:[1,1,1,1] and
  ; quad_perm:[0,0,0,0] in the expected output.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Bypass .demote1 only when the lane is live and the difference compares
  ; ordered-equal to 0.0.
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  ; Second demote, inside the loop body. The expected output follows it with
  ; s_wqm/s_and on exec and a branch back to the loop latch.
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch: keep iterating while the incremented counter is (signed)
  ; less than %limit.
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  ; Compressed v2f16 export to target 0 with enable mask 15; both trailing
  ; i1 flags are true (printed as "done" and "vm" in the expected output).
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; Pixel-shader test where llvm.amdgcn.wqm.demote is called with the constant
; i1 true. For every run configuration the expected output is just the
; compare, the select, the export and s_endpgm: no exec-mask update and no
; early-exit block.
define amdgpu_ps void @static_exact_nop(float %arg0, float %arg1) {
; SI-LABEL: static_exact_nop:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: static_exact_nop:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: static_exact_nop:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop_placeholder_not_used
; GFX10-64-LABEL: static_exact_nop:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; %c1 has no users in this function; the demote below takes a literal true.
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 true)
  ; Export 1.0 where %arg0 < 0.0 (ordered), otherwise 0.0, in all four
  ; channels of target 1.
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}


; Intrinsic declarations for the test functions in this file.
; (image.sample.1d has no call site in the last two functions; presumably it
; is used by tests earlier in the file - confirm before removing.)
declare void @llvm.amdgcn.wqm.demote(i1) #0
declare i1 @llvm.amdgcn.live.mask() #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.wqm.f32(float) #1
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

; Attribute groups referenced by the declarations above and by call sites in
; the test functions.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { inaccessiblememonly nounwind }
attributes #4 = { convergent nounwind readnone }