// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code,
 * via the vint, vfloat, and vmask types. These will take on the widest SIMD
 * vector width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debug of
 * VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are backed by a reference C++ fallback for
 * use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C++ implementation
 * is provided on platforms without the underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference.
 *     * 4-wide for Armv8-A NEON.
 *     * 4-wide for x86-64 SSE2.
 *     * 4-wide for x86-64 SSE4.1.
 *     * 8-wide for x86-64 AVX2.
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
	#include <immintrin.h>
#elif ASTCENC_NEON != 0
	#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE …
#else
	#define ASTCENC_SIMD_INLINE …
	#define ASTCENC_NO_INLINE …
#endif

#if ASTCENC_AVX >= 2
	/* If we have AVX2 expose 8-wide VLA. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"
	#include "astcenc_vecmathlib_avx2_8.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat8;

	#if defined(ASTCENC_NO_INVARIANCE)
		using vfloatacc = vfloat8;
	#else
		using vfloatacc = vfloat4;
	#endif

	using vint = vint8;
	using vmask = vmask8;

	constexpr auto loada = vfloat8::loada;
	constexpr auto load1 = vfloat8::load1;

#elif ASTCENC_SSE >= 20
	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = …;
	constexpr auto load1 = …;

#elif ASTCENC_NEON > 0
	/* If we have NEON expose 4-wide VLA. */
	#include "astcenc_vecmathlib_neon_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#else
	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.

	// Note: We no longer expose the 1-wide scalar fallback because it is not
	// invariant with the 4-wide path due to algorithms that use horizontal
	// operations that accumulate a local vector sum before accumulating into
	// a running sum.
	//
	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
	//
	//     result = ((((sum + l0) + l1) + l2) + l3)
	//
	// ... whereas the accumulator for a 4-wide vector sum is:
	//
	//     result = sum + ((l0 + l2) + (l1 + l3))
	//
	// In "normal maths" this is the same, but the floating point
	// reassociation differences mean that these will not produce the same
	// result.
	#include "astcenc_vecmathlib_none_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH …

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;
#endif
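// Illustrative sketch only, not part of this header's API: the accumulator
// pattern that motivates the vfloatacc type above. Keeping the running sum in
// vfloatacc (which stays 4-wide on AVX2 unless ASTCENC_NO_INVARIANCE is set)
// keeps reductions bit-identical across the 4-wide and 8-wide builds. The
// zero() factory, the haccumulate() fold, and the hadd_s() horizontal sum are
// assumed to be provided by the per-ISA headers included above; they are not
// declared at this level.
ASTCENC_SIMD_INLINE float example_sum_vla(const float* data, unsigned int count)
{
	// Assumes count is padded to a multiple of ASTCENC_SIMD_WIDTH
	vfloatacc sum = vfloatacc::zero();
	for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
	{
		// Fold one full-width vector into the invariant-width accumulator
		haccumulate(sum, loada(data + i));
	}

	// Reduce the lane-wise partial sums to a single scalar
	return hadd_s(sum);
}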
/**
 * @brief Round a count down to the largest multiple of 8.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
	…
}

/**
 * @brief Round a count down to the largest multiple of 4.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
	…
}

/**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *
 * This implementation assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
	…
}

/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
 *
 * This implementation assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
	…
}
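// Illustrative sketch only, not part of this header's API: using the VLA
// rounding helper above to split a loop into a full-vector body and a scalar
// tail. The vfloat(float) broadcast constructor, operator+, and the storea()
// aligned store are assumed to be provided by the per-ISA headers.
ASTCENC_SIMD_INLINE void example_add_offset(float* data, unsigned int count, float offset)
{
	unsigned int full_count = round_down_to_simd_multiple_vla(count);

	// Process complete vectors, ASTCENC_SIMD_WIDTH lanes at a time
	for (unsigned int i = 0; i < full_count; i += ASTCENC_SIMD_WIDTH)
	{
		storea(loada(data + i) + vfloat(offset), data + i);
	}

	// Process any leftover scalar tail
	for (unsigned int i = full_count; i < count; i++)
	{
		data[i] += offset;
	}
}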
/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	…
}

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	…
}

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	…
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	…
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	…
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	…
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	…
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	…
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	…
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	…
}

// Helper macros for Horner-form evaluation of a degree-N polynomial
#define POLY0(x, c0) …
#define POLY1(x, c0, c1) …
#define POLY2(x, c0, c1, c2) …
#define POLY3(x, c0, c1, c2, c3) …
#define POLY4(x, c0, c1, c2, c3, c4) …
#define POLY5(x, c0, c1, c2, c3, c4, c5) …

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	…
}

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	…
}

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	…
}

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	…
}

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 31].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	…
}

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
	…
}

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	…
}

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param      a   The input value.
 * @param[out] exp The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	…
}

/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
	…
}

namespace astc
{

/**
 * @brief Compute an approximate pow(x, y) for scalar inputs.
 */
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	…
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
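// Illustrative sketch only, kept outside the main include guard purely for
// exposition: the exp2(log2(x) * y) transform that the approximate pow()
// above is documented to use, written out with the exp2() and log2() declared
// in this header. Assumes operator* on vfloat4 from the per-ISA headers, and
// is only meaningful for positive x, since log2() of a non-positive input is
// undefined. The guard macro name is hypothetical.
#ifndef ASTCENC_VECMATHLIB_EXAMPLE_SKETCH
#define ASTCENC_VECMATHLIB_EXAMPLE_SKETCH

static ASTCENC_SIMD_INLINE vfloat4 example_pow(vfloat4 x, vfloat4 y)
{
	// Rewrite x^y as 2^(log2(x) * y), evaluated per lane
	return exp2(log2(x) * y);
}

#endif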