//===-- A class to store high precision floating point numbers --*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H #include "FEnvImpl.h" #include "FPBits.h" #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "multiply_add.h" #include "rounding_mode.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/big_int.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/types.h" #include <stddef.h> namespace LIBC_NAMESPACE_DECL { namespace fputil { // A generic class to perform computations of high precision floating points. // We store the value in dyadic format, including 3 fields: // sign : boolean value - false means positive, true means negative // exponent: the exponent value of the least significant bit of the mantissa. // mantissa: unsigned integer of length `Bits`. // So the real value that is stored is: // real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer) // The stored data is normal if for non-zero mantissa, the leading bit is 1. // The outputs of the constructors and most functions will be normalized. // To simplify and improve the efficiency, many functions will assume that the // inputs are normal. template <size_t Bits> struct DyadicFloat { … }; // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the // output: // - Align the exponents so that: // new a.exponent = new b.exponent = max(a.exponent, b.exponent) // - Add or subtract the mantissas depending on the signs. // - Normalize the result. // The absolute errors compared to the mathematical sum is bounded by: // | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2), // i.e., errors are up to 2 ULPs. // Assume inputs are normalized (by constructors or other functions) so that we // don't need to normalize the inputs again in this function. If the inputs are // not normalized, the results might lose precision significantly. template <size_t Bits> LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a, DyadicFloat<Bits> b) { … } // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic // floats with rounding toward 0 and then normalize the output: // result.exponent = a.exponent + b.exponent + Bits, // result.mantissa = quick_mul_hi(a.mantissa + b.mantissa) // ~ (full product a.mantissa * b.mantissa) >> Bits. // The errors compared to the mathematical product is bounded by: // 2 * errors of quick_mul_hi = 2 * (UInt<Bits>::WORD_COUNT - 1) in ULPs. // Assume inputs are normalized (by constructors or other functions) so that we // don't need to normalize the inputs again in this function. If the inputs are // not normalized, the results might lose precision significantly. template <size_t Bits> LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) { … } // Simple polynomial approximation. template <size_t Bits> LIBC_INLINE constexpr DyadicFloat<Bits> multiply_add(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b, const DyadicFloat<Bits> &c) { … } // Simple exponentiation implementation for printf. Only handles positive // exponents, since division isn't implemented. template <size_t Bits> LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(const DyadicFloat<Bits> &a, uint32_t power) { … } template <size_t Bits> LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(const DyadicFloat<Bits> &a, int32_t pow_2) { … } } // namespace fputil } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H