// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2014 Pedro Gonnet ([email protected]) // Copyright (C) 2009-2019 Gael Guennebaud <[email protected]> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* The exp and log functions of this file initially come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H // IWYU pragma: private #include "../../InternalHeaderCheck.h" namespace Eigen { namespace internal { // Creates a Scalar integer type with same bit-width. template <typename T> struct make_integer; template <> struct make_integer<float> { … }; template <> struct make_integer<double> { … }; template <> struct make_integer<half> { … }; template <> struct make_integer<bfloat16> { … }; template <typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) { … } // Safely applies frexp, correctly handles denormals. // Assumes IEEE floating point format. template <typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) { … } // Safely applies ldexp, correctly handles overflows, underflows and denormals. // Assumes IEEE floating point format. template <typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) { … } // Explicitly multiplies // a * (2^e) // clamping e to the range // [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()] // // This is approx 7x faster than pldexp_impl, but will prematurely over/underflow // if 2^e doesn't fit into a normal floating-point Scalar. // // Assumes IEEE floating point format template <typename Packet> struct pldexp_fast_impl { … }; // Natural or base 2 logarithm. // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can // be easily approximated by a polynomial centered on m=1 for stability. // TODO(gonnet): Further reduce the interval allowing for lower-degree // polynomial interpolants -> ... -> profit! template <typename Packet, bool base2> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) { … } /* Returns the base e (2.718...) or base 2 logarithm of x. * The argument is separated into its exponent and fractional parts. * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)], * is approximated by * * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x). * * for more detail see: http://www.netlib.org/cephes/ */ template <typename Packet, bool base2> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) { … } /** \internal \returns log(1 + x) computed using W. Kahan's formula. See: http://www.plunk.org/~hatch/rightway.php */ template <typename Packet> Packet generic_plog1p(const Packet& x) { … } /** \internal \returns exp(x)-1 computed using W. Kahan's formula. See: http://www.plunk.org/~hatch/rightway.php */ template <typename Packet> Packet generic_expm1(const Packet& x) { … } // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). // exp(r) is computed using a 6th order minimax polynomial approximation. template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) { … } // The following code is inspired by the following stack-overflow answer: // https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 // It has been largely optimized: // - By-pass calls to frexp. // - Aligned loads of required 96 bits of 2/pi. This is accomplished by // (1) balancing the mantissa and exponent to the required bits of 2/pi are // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. // - Avoid a branch in rounding and extraction of the remaining fractional part. // Overall, I measured a speed up higher than x2 on x86-64. inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) { … } template <bool ComputeSine, typename Packet, bool ComputeBoth = false> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS #if EIGEN_COMP_GNUC_STRICT __attribute__((optimize("-fno-unsafe-math-optimizations"))) #endif Packet psincos_float(const Packet& _x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) { … } // Trigonometric argument reduction for double for inputs smaller than 15. // Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t. template <typename Packet> Packet trig_reduce_small_double(const Packet& x, const Packet& q) { … } // Trigonometric argument reduction for double for inputs smaller than 1e14. // Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t. template <typename Packet> Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) { … } template <bool ComputeSine, typename Packet, bool ComputeBoth = false> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS #if EIGEN_COMP_GNUC_STRICT __attribute__((optimize("-fno-unsafe-math-optimizations"))) #endif Packet psincos_double(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) { … } // Generic implementation of acos(x). template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) { … } // Generic implementation of asin(x). template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) { … } // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy. template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_float(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x_in) { … } // Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)] // with 2 ulp accuracy. template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_double(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x_in) { … } /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which is accurate up to a couple of ulps in the (approximate) range [-8, 8], outside of which tanh(x) = +/-1 in single precision. The input is clamped to the range [-c, c]. The value c is chosen as the smallest value where the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004] the approximation tanh(x) ~= x is used for better accuracy as x tends to zero. This implementation works on both scalars and packets. */ template <typename T> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) { … } template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) { … } // \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2). // Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481 template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) { … } psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex && !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>>; psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex && NumTraits<typename unpacket_traits<Packet>::type>::IsSigned && NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>>; psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex && !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned && NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>>; // \internal \returns the the sign of a complex number z, defined as z / abs(z). psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsComplex && unpacket_traits<Packet>::vectorizable>>; // TODO(rmlarsen): The following set of utilities for double word arithmetic // should perhaps be refactored as a separate file, since it would be generally // useful for special function implementation etc. Writing the algorithms in // terms if a double word type would also make the code more readable. // This function splits x into the nearest integer n and fractional part r, // such that x = n + r holds exactly. template <typename Packet> EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) { … } // This function computes the sum {s, r}, such that x + y = s_hi + s_lo // holds exactly, and s_hi = fl(x+y), if |x| >= |y|. template <typename Packet> EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { … } #ifdef EIGEN_VECTORIZE_FMA // This function implements the extended precision product of // a pair of floating point numbers. Given {x, y}, it computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and // p_hi = fl(x * y). template <typename Packet> EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { p_hi = pmul(x, y); p_lo = pmsub(x, y, p_hi); } #else // This function implements the Veltkamp splitting. Given a floating point // number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds // exactly and that half of the significant of x fits in x_hi. // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", // 3rd edition, Birkh\"auser, 2016. template <typename Packet> EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { … } // This function implements Dekker's algorithm for products x * y. // Given floating point numbers {x, y} computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and // p_hi = fl(x * y). template <typename Packet> EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { … } #endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. // It returns the result as a pair {s_hi, s_lo} such that // x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", // 3rd edition, Birkh\"auser, 2016. template <typename Packet> EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo, Packet& s_hi, Packet& s_lo) { … } // This is a version of twosum for double word numbers, // which assumes that |x_hi| >= |y_hi|. template <typename Packet> EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo, Packet& s_hi, Packet& s_lo) { … } // This is a version of twosum for adding a floating point number x to // double word number {y_hi, y_lo} number, with the assumption // that |x| >= |y_hi|. template <typename Packet> EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, Packet& s_hi, Packet& s_lo) { … } // This function implements the multiplication of a double word // number represented by {x_hi, x_lo} by a floating point number y. // It returns the result as a pair {p_hi, p_lo} such that // (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error // of less than 2*2^{-2p}, where p is the number of significand bit // in the floating point type. // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", // 3rd edition, Birkh\"auser, 2016. template <typename Packet> EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& p_hi, Packet& p_lo) { … } // This function implements the multiplication of two double word // numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. // It returns the result as a pair {p_hi, p_lo} such that // (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error // of less than 2*2^{-2p}, where p is the number of significand bit // in the floating point type. template <typename Packet> EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo, Packet& p_hi, Packet& p_lo) { … } // This function implements the division of double word {x_hi, x_lo} // by float y. This is Algorithm 15 from "Tight and rigourous error bounds // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu, // 2017. https://hal.archives-ouvertes.fr/hal-01351529 template <typename Packet> void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& z_hi, Packet& z_lo) { … } // This function computes log2(x) and returns the result as a double word. template <typename Scalar> struct accurate_log2 { … }; // This specialization uses a more accurate algorithm to compute log2(x) for // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10. // This additional accuracy is needed to counter the error-magnification // inherent in multiplying by a potentially large exponent in pow(x,y). // The minimax polynomial used was calculated using the Sollya tool. // See sollya.org. template <> struct accurate_log2<float> { … }; // This specialization uses a more accurate algorithm to compute log2(x) for // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18. // This additional accuracy is needed to counter the error-magnification // inherent in multiplying by a potentially large exponent in pow(x,y). // The minimax polynomial used was calculated using the Sollya tool. // See sollya.org. template <> struct accurate_log2<double> { … }; // This function computes exp2(x) (i.e. 2**x). template <typename Scalar> struct fast_accurate_exp2 { … }; // This specialization uses a faster algorithm to compute exp2(x) for floats // in [-0.5;0.5] with a relative accuracy of 1 ulp. // The minimax polynomial used was calculated using the Sollya tool. // See sollya.org. template <> struct fast_accurate_exp2<float> { … }; // in [-0.5;0.5] with a relative accuracy of 1 ulp. // The minimax polynomial used was calculated using the Sollya tool. // See sollya.org. template <> struct fast_accurate_exp2<double> { … }; // This function implements the non-trivial case of pow(x,y) where x is // positive and y is (possibly) non-integer. // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x. // TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it // easier to specialize or turn off for specific types and/or backends.x template <typename Packet> EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { … } // Generic implementation of pow(x,y). template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { … } /* polevl (modified for Eigen) * * Evaluate polynomial * * * * SYNOPSIS: * * int N; * Scalar x, y, coef[N+1]; * * y = polevl<decltype(x), N>( x, coef); * * * * DESCRIPTION: * * Evaluates polynomial of degree N: * * 2 N * y = C + C x + C x +...+ C x * 0 1 2 N * * Coefficients are stored in reverse order: * * coef[0] = C , ..., coef[N] = C . * N 0 * * The function p1evl() assumes that coef[N] = 1.0 and is * omitted from the array. Its calling arguments are * otherwise the same as polevl(). * * * The Eigen implementation is templatized. For best speed, store * coef as a const array (constexpr), e.g. * * const double coef[] = {1.0, 2.0, 3.0, ...}; * */ template <typename Packet, int N> struct ppolevl { … }; ppolevl<Packet, 0>; /* chbevl (modified for Eigen) * * Evaluate Chebyshev series * * * * SYNOPSIS: * * int N; * Scalar x, y, coef[N], chebevl(); * * y = chbevl( x, coef, N ); * * * * DESCRIPTION: * * Evaluates the series * * N-1 * - ' * y = > coef[i] T (x/2) * - i * i=0 * * of Chebyshev polynomials Ti at argument x/2. * * Coefficients are stored in reverse order, i.e. the zero * order term is last in the array. Note N is the number of * coefficients, not the order. * * If coefficients are for the interval a to b, x must * have been transformed to x -> 2(2x - b - a)/(b-a) before * entering the routine. This maps x from (a, b) to (-1, 1), * over which the Chebyshev polynomials are defined. * * If the coefficients are for the inverted interval, in * which (a, b) is mapped to (1/b, 1/a), the transformation * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, * this becomes x -> 4a/x - 1. * * * * SPEED: * * Taking advantage of the recurrence properties of the * Chebyshev polynomials, the routine requires one more * addition per loop than evaluating a nested polynomial of * the same degree. * */ template <typename Packet, int N> struct pchebevl { … }; namespace unary_pow { template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger> struct exponent_helper { … }; exponent_helper<ScalarExponent, true>; template <typename Packet, typename ScalarExponent, bool ReciprocateIfExponentIsNegative = !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned> struct reciprocate { … }; reciprocate<Packet, ScalarExponent, false>; template <typename Packet, typename ScalarExponent> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) { … } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x, const typename unpacket_traits<Packet>::type& exponent) { … } template <typename Packet, typename ScalarExponent> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx, const ScalarExponent& exponent) { … } template <typename Packet, typename ScalarExponent, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) { … } template <typename Packet, typename ScalarExponent, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) { … } } // end namespace unary_pow template <typename Packet, typename ScalarExponent, bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger, bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger, bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned> struct unary_pow_impl; unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned>; unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned>; unary_pow_impl<Packet, ScalarExponent, true, true, true>; unary_pow_impl<Packet, ScalarExponent, true, true, false>; template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) { … } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) { … } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) { … } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) { … } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) { … } nearest_integer_packetop_impl<Packet, false, false>; nearest_integer_packetop_impl<Packet, false, true>; } // end namespace internal } // end namespace Eigen #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H