; llvm/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Basic i32 case: llvm.amdgcn.set.inactive is applied to the i32 kernel
; argument %in with the constant 42 as its second operand, the result is
; wrapped in llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA picks between 42 (as an inline constant) and the loaded
; argument with a single v_cndmask_b32 that runs while exec has been saved and
; set to -1 by s_or_saveexec_b64; exec is restored right after.
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  store i32 %tmp, ptr addrspace(1) %out
  ret void
}

; i32 case where the first operand is the immediate 1 and the second operand
; is poison. The result still goes through llvm.amdgcn.strict.wwm and is
; stored to %out.
; The expected ISA contains no v_cndmask and no exec save/restore: the
; constant 1 is simply moved into a VGPR and stored.
define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  store i32 %tmp, ptr addrspace(1) %out
  ret void
}

; i64 case: llvm.amdgcn.set.inactive on the i64 kernel argument %in with 0 as
; the second operand, wrapped in llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA handles the value as two 32-bit halves: each half gets its
; own v_cndmask_b32 (inline constant 0), and each v_cndmask is bracketed by
; its own s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
  store i64 %tmp, ptr addrspace(1) %out
  ret void
}

; i64 counterpart of the immediate/poison case: first operand is the
; immediate 1, second operand is poison; the result goes through
; llvm.amdgcn.strict.wwm and is stored to %out.
; The expected ISA has no v_cndmask and no exec save/restore: the low half (1)
; and high half (0) are materialized with plain moves and stored as a dwordx2.
define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison_64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
  store i64 %tmp, ptr addrspace(1) %out
  ret void
}

; Control-flow case: a scalar buffer load from %desc is compared against 56 in
; the IR *before* the set.inactive / strict.wwm pair, and the compare result
; then selects between two stores:
;   .zero  stores the strict.wwm result unchanged,
;   .one   stores the strict.wwm result plus 1.
; In the expected ISA the scalar compare (s_cmp_lg_u32) is emitted after the
; s_or_saveexec_b64 / v_cndmask_b32 / s_mov_b64 exec sequence, immediately
; ahead of the branches that consume it.
; NOTE(review): going by the function name, this presumably guards against
; the exec-save sequence clobbering SCC between the compare and the branch;
; confirm against the commit that introduced the test.
define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) {
; GCN-LABEL: set_inactive_scc:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-NEXT:    s_load_dword s8, s[2:3], 0x2c
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v1, s8
; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
; GCN-NEXT:    s_mov_b64 exec, s[2:3]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_lg_u32 s4, 56
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    s_mov_b64 s[2:3], -1
; GCN-NEXT:    s_cbranch_scc1 .LBB4_3
; GCN-NEXT:  ; %bb.1: ; %Flow
; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN-NEXT:    s_cbranch_vccz .LBB4_4
; GCN-NEXT:  .LBB4_2: ; %.exit
; GCN-NEXT:    s_endpgm
; GCN-NEXT:  .LBB4_3: ; %.one
; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT:    s_cbranch_execnz .LBB4_2
; GCN-NEXT:  .LBB4_4: ; %.zero
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
  %cmp = icmp eq i32 %val, 56
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  br i1 %cmp, label %.zero, label %.one

.zero:
  store i32 %tmp, ptr addrspace(1) %out
  br label %.exit

.one:
  %tmp.1 = add i32 %tmp, 1
  store i32 %tmp.1, ptr addrspace(1) %out
  br label %.exit

.exit:
  ret void
}

; float case: llvm.amdgcn.set.inactive on the float kernel argument %in with
; 3.0 as the second operand, wrapped in llvm.amdgcn.strict.wwm and stored to
; %out.
; Unlike the i32 test, the expected ISA first materializes the constant
; (0x40400000, the bit pattern of 3.0f) into a VGPR with v_mov_b32 inside the
; saved-exec region, and the v_cndmask_b32 then takes it as a register operand.
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
  %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
  store float %tmp, ptr addrspace(1) %out
  ret void
}

; double case: llvm.amdgcn.set.inactive on the double kernel argument %in with
; 4.2 as the second operand, wrapped in llvm.amdgcn.strict.wwm and stored to
; %out.
; The expected ISA splits the value into two 32-bit halves. For each half the
; matching half of the constant (0x4010cccc high, 0xcccccccd low) is moved
; into a VGPR and fed to a v_cndmask_b32, with a separate s_or_saveexec_b64 /
; s_mov_b64 exec pair around each half.
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x4010cccc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
  %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
  store double %tmp, ptr addrspace(1) %out
  ret void
}

; <2 x i16> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1, 1> as the second operand, wrapped in llvm.amdgcn.strict.wwm and
; stored to %out.
; The expected ISA treats the vector as one 32-bit value: the packed constant
; 0x10001 is moved into a VGPR and a single v_cndmask_b32 is emitted inside
; one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
  %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
  store <2 x i16> %tmp, ptr addrspace(1) %out
  ret void
}

; <2 x half> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1.0, 1.0> as the second operand, wrapped in llvm.amdgcn.strict.wwm and
; stored to %out.
; The expected ISA treats the vector as one 32-bit value: the packed constant
; 0x3c003c00 is moved into a VGPR and a single v_cndmask_b32 is emitted inside
; one s_or_saveexec_b64 / s_mov_b64 exec pair.
; Both intrinsic calls use the .v2f16 overload suffix so that the mangled name
; agrees with the <2 x half> operand type, as in the sibling tests.
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
  %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %tmp.0)
  store <2 x half> %tmp, ptr addrspace(1) %out
  ret void
}

; <2 x i32> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1, 1> as the second operand, wrapped in llvm.amdgcn.strict.wwm and
; stored to %out.
; The expected ISA handles each 32-bit element separately: one v_cndmask_b32
; per element with 1 as an inline constant, each bracketed by its own
; s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: set_inactive_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
  %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
  store <2 x i32> %tmp, ptr addrspace(1) %out
  ret void
}

; <2 x float> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1.0, 1.0> as the second operand, wrapped in llvm.amdgcn.strict.wwm and
; stored to %out.
; The expected ISA handles each 32-bit element separately: one v_cndmask_b32
; per element with 1.0 as an inline constant, each bracketed by its own
; s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GCN-LABEL: set_inactive_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
  %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
  store <2 x float> %tmp, ptr addrspace(1) %out
  ret void
}

; <2 x bfloat> case: llvm.amdgcn.set.inactive on the vector kernel argument
; %in with <1.0, 1.0> as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the vector as one 32-bit value: the packed constant
; 0x3f803f80 is moved into a VGPR and a single v_cndmask_b32 is emitted inside
; one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
  %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
  store <2 x bfloat> %tmp, ptr addrspace(1) %out
  ret void
}

; <4 x i16> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1, 1, 1, 1> as the second operand, wrapped in llvm.amdgcn.strict.wwm
; and stored to %out.
; The expected ISA processes the vector as two 32-bit halves, each with its
; own s_or_saveexec_b64 / v_cndmask_b32 / s_mov_b64 exec sequence. The packed
; constant 0x10001 is moved into v0 once, in the first sequence, and the
; second v_cndmask reads v0 again without a second constant move.
define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
; GCN-LABEL: set_inactive_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
  %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
  store <4 x i16> %tmp, ptr addrspace(1) %out
  ret void
}

; <4 x half> case: llvm.amdgcn.set.inactive on the vector kernel argument %in
; with <1.0, 1.0, 1.0, 1.0> as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA processes the vector as two 32-bit halves, each with its
; own s_or_saveexec_b64 / v_cndmask_b32 / s_mov_b64 exec sequence. The packed
; constant 0x3c003c00 is moved into v0 once, in the first sequence, and the
; second v_cndmask reads v0 again without a second constant move.
define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; GCN-LABEL: set_inactive_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
  %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
  store <4 x half> %tmp, ptr addrspace(1) %out
  ret void
}

; <4 x bfloat> case: llvm.amdgcn.set.inactive on the vector kernel argument
; %in with <1.0, 1.0, 1.0, 1.0> as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA processes the vector as two 32-bit halves, each with its
; own s_or_saveexec_b64 / v_cndmask_b32 / s_mov_b64 exec sequence. The packed
; constant 0x3f803f80 is moved into v0 once, in the first sequence, and the
; second v_cndmask reads v0 again without a second constant move.
define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
; GCN-LABEL: set_inactive_v4bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
  %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
  store <4 x bfloat> %tmp, ptr addrspace(1) %out
  ret void
}

; Default-address-space pointer case: llvm.amdgcn.set.inactive on the `ptr`
; kernel argument %in with null as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the pointer as a 64-bit value (dwordx4 argument
; load, dwordx2 store) and emits one v_cndmask_b32 with inline constant 0 per
; 32-bit half, each inside its own s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
  %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
  store ptr %tmp, ptr addrspace(1) %out
  ret void
}

; addrspace(2) pointer case: llvm.amdgcn.set.inactive on the pointer kernel
; argument %in with null as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the pointer as a single 32-bit value (one dword
; argument load, one dword store) and emits a single v_cndmask_b32 with inline
; constant 0 inside one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
  %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
  ret void
}

; addrspace(3) pointer case: llvm.amdgcn.set.inactive on the pointer kernel
; argument %in with null as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the pointer as a single 32-bit value (one dword
; argument load, one dword store) and emits a single v_cndmask_b32 with inline
; constant 0 inside one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
  %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
  ret void
}

; addrspace(5) pointer case: llvm.amdgcn.set.inactive on the pointer kernel
; argument %in with null as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the pointer as a single 32-bit value (one dword
; argument load, one dword store) and emits a single v_cndmask_b32 with inline
; constant 0 inside one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
  %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
  ret void
}

; addrspace(6) pointer case: llvm.amdgcn.set.inactive on the pointer kernel
; argument %in with null as the second operand, wrapped in
; llvm.amdgcn.strict.wwm and stored to %out.
; The expected ISA treats the pointer as a single 32-bit value (one dword
; argument load, one dword store) and emits a single v_cndmask_b32 with inline
; constant 0 inside one s_or_saveexec_b64 / s_mov_b64 exec pair.
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s4
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
  %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
  ret void
}

; Explicit intrinsic declarations. Only the i32/i64 overloads of set.inactive
; and strict.wwm, plus the scalar buffer load, are declared here; the other
; overloads called by the tests above (f32, f64, vector and pointer variants)
; have no explicit declaration in this file.
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)

; #0 is attached to the set.inactive declarations and to every set.inactive
; call site above; #1 is attached to the strict.wwm declarations.
attributes #0 = { convergent readnone }
attributes #1 = { convergent nounwind readnone speculatable willreturn }