llvm/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s

define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v0, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, s2
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv float 1.000000e+00, %load, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v0, -s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v0, -s2
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv float -1.000000e+00, %load, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v0, -s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v0, -s2
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fneg float %load
  %div = fdiv float 1.000000e+00, %neg, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v0, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, s2
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v2, s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v1, s0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v5, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v1, 0, v1
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v3, s3
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 0, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, s0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, s2
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v3, s3
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v2, -s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v1, s0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v5, -s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v1, 0, v1
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v3, -s3
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 0, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v0, -s0
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v1, -s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v2, -s2
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v3, -s3
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v2, -s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v1, s0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v5, -s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v1, 0, v1
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v3, -s3
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 0, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v0, -s0
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v1, -s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v2, -s2
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v3, -s3
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v2, s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v1, s0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v5, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v1, 0, v1
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v0, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v3, s3
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 0, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, s0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, s2
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v3, s3
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_c_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v1, s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v2, s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v1, v1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v5, -s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_mul_f32_e32 v7, 0.5, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v2, s3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v0, s0
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v0, 2, v0
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, -0.5, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 2, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v7, v0
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v0, 0x6f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x2f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |s0|, v0
; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |s3|, v0
; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v5, 1.0, v1, vcc
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, s0, v3
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, s3, v5
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v6, v1
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v2, -s2
; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v0
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v6, -2.0, v6
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v3, v0
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v5, v6
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v4, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v1, -s0
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v2, -s1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v1, v1
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v3, s1
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v5, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v3, 0, v3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-DENORM-NEXT:    v_mul_f32_e32 v7, 0.5, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v2, v3
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e64 v2, -s3
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v3, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v6, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, 0, v6
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v0, s0
; GCN-DENORM-NEXT:    v_ldexp_f32 v2, v5, v2
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v0, 2, v0
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, -0.5, v3
; GCN-DENORM-NEXT:    v_sub_u32_e32 v5, 2, v5
; GCN-DENORM-NEXT:    v_ldexp_f32 v0, v7, v0
; GCN-DENORM-NEXT:    v_ldexp_f32 v3, v3, v5
; GCN-DENORM-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v0, 0x6f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v2, 0x2f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |s0|, v0
; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v3, 1.0, v2, vcc
; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |s3|, v0
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v5, s0, v3
; GCN-FLUSH-NEXT:    v_mul_f32_e64 v6, -s0, v3
; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v7, 1.0, v2, vcc
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v5, v5
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v6, v6
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, s3, v7
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v8, v0
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v1, -s1
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, s2
; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v6, v5
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v3, v0
; GCN-FLUSH-NEXT:    v_add_f32_e32 v3, v8, v8
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v7, v3
; GCN-FLUSH-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load <4 x float>, ptr addrspace(1) %arg, align 16
  %neg = fneg <4 x float> %load
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
; GCN-DENORM-LABEL: div_v_by_x_25ulp:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-DENORM-NEXT:    v_mov_b32_e32 v0, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v2, s4
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v3, s4
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_frexp_mant_f32_e32 v1, s2
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v1, v1
; GCN-DENORM-NEXT:    v_frexp_exp_i32_f32_e32 v4, s2
; GCN-DENORM-NEXT:    v_sub_u32_e32 v2, v2, v4
; GCN-DENORM-NEXT:    v_mul_f32_e32 v1, v3, v1
; GCN-DENORM-NEXT:    v_ldexp_f32 v1, v1, v2
; GCN-DENORM-NEXT:    global_store_dword v0, v1, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_v_by_x_25ulp:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    s_load_dword s4, s[2:3], 0x2c
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v0, 0x6f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x2f800000
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v2, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |s2|, v0
; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, s2, v0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, s4, v1
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
; GCN-FLUSH-NEXT:    global_store_dword v2, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv float %num, %load, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_x_fast:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_rcp_f32_e32 v0, s2
; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_fast:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_rcp_f32_e64 v0, -s2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_fast:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, s2
; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_minus_x_fast:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_rcp_f32_e64 v0, -s2
; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fneg float %load, !fpmath !0
  %div = fdiv fast float 1.000000e+00, %neg
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v0, s2
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_rcp_f32_e64 v0, -s2
; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fsub float -0.000000e+00, %load, !fpmath !0
  %div = fdiv fast float -1.000000e+00, %neg
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GCN-DENORM-NEXT:    v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-DENORM-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-DENORM-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-DENORM-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-DENORM-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-DENORM-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    v_div_fixup_f32 v0, v0, s4, 1.0
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GCN-FLUSH-NEXT:    v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN-FLUSH-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-FLUSH-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-FLUSH-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-FLUSH-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-FLUSH-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN-FLUSH-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    v_div_fixup_f32 v0, v0, s4, 1.0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv float 1.000000e+00, %load
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
; GCN-DENORM-NEXT:    v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-DENORM-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-DENORM-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-DENORM-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-DENORM-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-DENORM-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    v_div_fixup_f32 v0, v0, s4, -1.0
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
; GCN-FLUSH-NEXT:    v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN-FLUSH-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-FLUSH-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-FLUSH-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-FLUSH-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-FLUSH-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN-FLUSH-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    v_div_fixup_f32 v0, v0, s4, -1.0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %div = fdiv float -1.000000e+00, %load
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
; GCN-DENORM-NEXT:    v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-DENORM-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-DENORM-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-DENORM-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-DENORM-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-DENORM-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    v_div_fixup_f32 v0, v0, s4, -1.0
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_div_scale_f32 v0, s[2:3], -s4, -s4, 1.0
; GCN-FLUSH-NEXT:    v_div_scale_f32 v1, vcc, 1.0, -s4, 1.0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN-FLUSH-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-FLUSH-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-FLUSH-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-FLUSH-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-FLUSH-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN-FLUSH-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    v_div_fixup_f32 v0, v0, -s4, 1.0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float 1.000000e+00, %neg
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM:       ; %bb.0:
; GCN-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DENORM-NEXT:    v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GCN-DENORM-NEXT:    v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-DENORM-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-DENORM-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-DENORM-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-DENORM-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-DENORM-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-DENORM-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT:    v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT:    v_div_fixup_f32 v0, v0, s4, 1.0
; GCN-DENORM-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-DENORM-NEXT:    s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-FLUSH:       ; %bb.0:
; GCN-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT:    v_div_scale_f32 v0, s[2:3], -s4, -s4, -1.0
; GCN-FLUSH-NEXT:    v_div_scale_f32 v1, vcc, -1.0, -s4, -1.0
; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN-FLUSH-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
; GCN-FLUSH-NEXT:    v_fma_f32 v2, v3, v2, v2
; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v1, v2
; GCN-FLUSH-NEXT:    v_fma_f32 v4, -v0, v3, v1
; GCN-FLUSH-NEXT:    v_fma_f32 v3, v4, v2, v3
; GCN-FLUSH-NEXT:    v_fma_f32 v0, -v0, v3, v1
; GCN-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN-FLUSH-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT:    v_div_fixup_f32 v0, v0, -s4, -1.0
; GCN-FLUSH-NEXT:    global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT:    s_endpgm
  %load = load float, ptr addrspace(1) %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg
  store float %div, ptr addrspace(1) %arg, align 4
  ret void
}

!0 = !{float 2.500000e+00}