// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code,
 * via the vint, vfloat, and vmask types. These will take on the widest SIMD
 * vector width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debug of
 * VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are backed by a reference C++ fallback for
 * use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C++ implementation
 * is provided on platforms without the underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference.
 *     * 4-wide for Armv8-A NEON.
 *     * 4-wide for x86-64 SSE2.
 *     * 4-wide for x86-64 SSE4.1.
 *     * 8-wide for x86-64 AVX2.
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
	#include <immintrin.h>
#elif ASTCENC_NEON != 0
	#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE …
#else
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE …
#endif

#if ASTCENC_AVX >= 2
	/* If we have AVX2 expose 8-wide VLA. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"
	#include "astcenc_vecmathlib_avx2_8.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat8;

	#if defined(ASTCENC_NO_INVARIANCE)
		using vfloatacc = vfloat8;
	#else
		using vfloatacc = vfloat4;
	#endif

	using vint = vint8;
	using vmask = vmask8;

	constexpr auto loada = vfloat8::loada;
	constexpr auto load1 = vfloat8::load1;

#elif ASTCENC_SSE >= 20
	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = …;
	constexpr auto load1 = …;

#elif ASTCENC_NEON > 0
	/* If we have NEON expose 4-wide VLA. */
	#include "astcenc_vecmathlib_neon_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#else
	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.

	// Note: We no longer expose the 1-wide scalar fallback because it is not
	// invariant with the 4-wide path due to algorithms that use horizontal
	// operations that accumulate a local vector sum before accumulating into
	// a running sum.
	//
	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
	//
	//     result = ((((sum + l0) + l1) + l2) + l3)
	//
	// ... whereas the accumulator for a 4-wide vector sum is:
	//
	//     result = sum + ((l0 + l2) + (l1 + l3))
	//
	// In "normal maths" this is the same, but the floating point
	// reassociation differences mean that these will not produce the same
	// result.
	#include "astcenc_vecmathlib_none_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;
#endif
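// Illustrative sketch only, not part of this header's API: the accumulator
// pattern that motivates the vfloatacc type above. Keeping the running sum in
// vfloatacc (which stays 4-wide on AVX2 unless ASTCENC_NO_INVARIANCE is set)
// keeps reductions bit-identical across the 4-wide and 8-wide builds. The
// zero() factory, the haccumulate() fold, and the hadd_s() horizontal sum are
// assumed to be provided by the per-ISA headers included above; they are not
// declared at this level.
ASTCENC_SIMD_INLINE float example_sum_vla(const float* data, unsigned int count)
{
	// Assumes count is padded to a multiple of ASTCENC_SIMD_WIDTH
	vfloatacc sum = vfloatacc::zero();
	for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
	{
		// Fold one full-width vector into the invariant-width accumulator
		haccumulate(sum, loada(data + i));
	}

	// Reduce the lane-wise partial sums to a single scalar
	return hadd_s(sum);
}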
/**
 * @brief Round a count down to the largest multiple of 8.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
	…
}

/**
 * @brief Round a count down to the largest multiple of 4.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
	…
}

/**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *
 * This implementation assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
	…
}

/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
 *
 * This implementation assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
	…
}
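// Illustrative sketch only, not part of this header's API: using the VLA
// rounding helper above to split a loop into a full-vector body and a scalar
// tail. The vfloat(float) broadcast constructor, operator+, and the storea()
// aligned store are assumed to be provided by the per-ISA headers.
ASTCENC_SIMD_INLINE void example_add_offset(float* data, unsigned int count, float offset)
{
	unsigned int full_count = round_down_to_simd_multiple_vla(count);

	// Process complete vectors, ASTCENC_SIMD_WIDTH lanes at a time
	for (unsigned int i = 0; i < full_count; i += ASTCENC_SIMD_WIDTH)
	{
		storea(loada(data + i) + vfloat(offset), data + i);
	}

	// Process any leftover scalar tail
	for (unsigned int i = full_count; i < count; i++)
	{
		data[i] += offset;
	}
}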
/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	…
}

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	…
}

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	…
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	…
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	…
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	…
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	…
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	…
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	…
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	…
}

// Helper macros for Horner-form evaluation of a degree-N polynomial
#define POLY0(x, c0) …
#define POLY1(x, c0, c1) …
#define POLY2(x, c0, c1, c2) …
#define POLY3(x, c0, c1, c2, c3) …
#define POLY4(x, c0, c1, c2, c3, c4) …
#define POLY5(x, c0, c1, c2, c3, c4, c5) …

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	…
}

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	…
}

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	…
}

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	…
}

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 31].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	…
}

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
	…
}

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	…
}

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param      a   The input value.
 * @param[out] exp The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	…
}

/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
	…
}

namespace astc
{

/**
 * @brief Compute an approximate pow(x, y) for scalar inputs.
 */
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	…
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
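// Illustrative sketch only, kept outside the main include guard purely for
// exposition: the exp2(log2(x) * y) transform that the approximate pow()
// above is documented to use, written out with the exp2() and log2() declared
// in this header. Assumes operator* on vfloat4 from the per-ISA headers, and
// is only meaningful for positive x, since log2() of a non-positive input is
// undefined. The guard macro name is hypothetical.
#ifndef ASTCENC_VECMATHLIB_EXAMPLE_SKETCH
#define ASTCENC_VECMATHLIB_EXAMPLE_SKETCH

static ASTCENC_SIMD_INLINE vfloat4 example_pow(vfloat4 x, vfloat4 y)
{
	// Rewrite x^y as 2^(log2(x) * y), evaluated per lane
	return exp2(log2(x) * y);
}

#endif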