chromium/third_party/highway/src/hwy/base.h

// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_

// Target-independent definitions.

// IWYU pragma: begin_exports
#include <stddef.h>
#include <stdint.h>

#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
// Major, minor and patch components are exposed as three separate macros.
#define HWY_MAJOR
#define HWY_MINOR
#define HWY_PATCH

// True if the Highway version >= major.minor.0. Added in 1.2.0.
// Only major and minor are taken, so the patch level cannot be queried here.
#define HWY_VERSION_GE(major, minor)
// True if the Highway version < major.minor.0. Added in 1.2.0.
// Logical complement of HWY_VERSION_GE for the same arguments.
#define HWY_VERSION_LT(major, minor)

// "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
#if !HWY_IDE

// <inttypes.h> is only pulled in when the build has not defined
// HWY_NO_LIBCXX.
#if !defined(HWY_NO_LIBCXX)
// Some older C library headers only expose the PRI*/SCN* format macros to C++
// code when __STDC_FORMAT_MACROS is defined before <inttypes.h> is included.
// The #ifndef avoids a redefinition if the build already set it.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#endif

// <atomic> is included under the same x86 && !HWY_NO_LIBCXX condition that
// selects the atomic-fence flavor of HWY_FENCE further below, and always for
// MSVC.
#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
#include <atomic>
#endif

#endif  // !HWY_IDE

// Decides whether the operator<=> overloads for float16_t/bfloat16_t further
// below (guarded by `#if HWY_HAVE_CXX20_THREE_WAY_COMPARE`) are compiled.
// A build may predefine the macro, in which case no detection is performed
// and <compare> is not included from here.
#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE  // allow opt-out
// All of the following must hold: HWY_NO_LIBCXX is not defined, the compiler
// reports the language feature at level 201907L or newer, and the <compare>
// header can be found.
#if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
    __cpp_impl_three_way_comparison >= 201907L && HWY_HAS_INCLUDE(<compare>)
#include <compare>
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE
#else
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE
#endif
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

// IWYU pragma: end_exports

#if HWY_COMPILER_MSVC
#include <string.h>  // memcpy
#endif

//------------------------------------------------------------------------------
// Compiler-specific definitions

// HWY_STR is the public entry point; HWY_STR_IMPL is its helper.
// NOTE(review): presumably the usual two-level stringification idiom, so that
// macro arguments are expanded before being stringified - confirm.
#define HWY_STR_IMPL(macro)
#define HWY_STR(macro)

// The two branches below define the same set of wrapper names (function name,
// restrict, inlining control, noreturn, branch hints, pragma/diagnostic
// helpers, unused/must-use annotations), so the rest of the library can use
// them without further compiler checks. HWY_HAS_ASSUME_ALIGNED is only defined
// in the MSVC branch.
#if HWY_COMPILER_MSVC

// MSVC intrinsics header.
#include <intrin.h>

#define HWY_FUNCTION
#define HWY_RESTRICT
#define HWY_INLINE
#define HWY_NOINLINE
#define HWY_FLATTEN
#define HWY_NORETURN
#define HWY_LIKELY
#define HWY_UNLIKELY
#define HWY_PRAGMA
#define HWY_DIAGNOSTICS
#define HWY_DIAGNOSTICS_OFF
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED
// _MSC_VER 1700 corresponds to Visual Studio 2012; older versions get a
// different definition of HWY_MUST_USE_RESULT.
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT
#else
#define HWY_MUST_USE_RESULT
#endif

#else

// All compilers other than MSVC.
#define HWY_FUNCTION
#define HWY_RESTRICT
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
// Hence HWY_INLINE is chosen depending on whether __OPTIMIZE__ is defined.
#ifdef __OPTIMIZE__
#define HWY_INLINE
#else
#define HWY_INLINE
#endif
#define HWY_NOINLINE
#define HWY_FLATTEN
#define HWY_NORETURN
// Branch-prediction hints wrapping a condition expression.
#define HWY_LIKELY(expr)
#define HWY_UNLIKELY(expr)
#define HWY_PRAGMA(tokens)
#define HWY_DIAGNOSTICS(tokens)
// Takes one argument per compiler family: `msc` for MSVC and `gcc` for
// GCC-style compilers (see the use next to IsConvertibleT, which passes an
// MSVC warning number and a -W flag).
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED
#define HWY_MUST_USE_RESULT

#endif  // !HWY_COMPILER_MSVC

//------------------------------------------------------------------------------
// Builtin/attributes (no more #include after this point due to namespace!)

namespace hwy {

// Enables error-checking of format strings.
// Only active when the compiler supports the __format__ attribute; otherwise
// the annotation is dropped.
// idx_fmt and idx_arg are 1-based parameter positions: the format string and
// the first variadic argument. Example from this file: Abort(file, line,
// format, ...) is annotated with HWY_FORMAT(3, 4).
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg)
#else
#define HWY_FORMAT
#endif

// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
//
// The hint is only emitted when __builtin_assume_aligned is available; the
// #else branch is the fallback for all other compilers.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align)
#else
#define HWY_ASSUME_ALIGNED
#endif

// Returns a pointer whose type is `type` (T*), while allowing the compiler to
// assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
// NOTE(review): passing a pointer that is not actually aligned presumably
// results in undefined behavior - confirm against the full definition.
#define HWY_RCAST_ALIGNED(type, ptr)

// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
//
// HWY_PUSH_ATTRIBUTES opens a region and HWY_POP_ATTRIBUTES closes it; one
// pair is defined per compiler family below, with a final catch-all branch for
// any other compiler.
#if HWY_COMPILER_ICC
// As of ICC 2021.{1-9} the pragma is neither implemented nor required.
#define HWY_PUSH_ATTRIBUTES
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_CLANG
// targets_str: string naming the target features to enable for the region.
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_GCC_ACTUAL
#define HWY_PUSH_ATTRIBUTES
#define HWY_POP_ATTRIBUTES
#else
#define HWY_PUSH_ATTRIBUTES
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Macros

// Decoration placed in front of the inline functions defined in this header
// (e.g. CopyBytes, BitCastScalar).
#define HWY_API

// HWY_CONCAT is the public entry point; HWY_CONCAT_IMPL is its helper.
// NOTE(review): presumably the two-level token-pasting idiom, so that macro
// arguments are expanded before being concatenated - confirm.
#define HWY_CONCAT_IMPL(a, b)
#define HWY_CONCAT(a, b)

// NOTE(review): as macros, these presumably evaluate an argument more than
// once; avoid arguments with side effects - confirm.
#define HWY_MIN(a, b)
#define HWY_MAX(a, b)

// Loop-unrolling hints, selected per compiler family. HWY_DEFAULT_UNROLL is
// the variant without an explicit factor.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL
#define HWY_DEFAULT_UNROLL
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#else
// Any other compiler.
#define HWY_UNROLL
#define HWY_DEFAULT_UNROLL
#endif

// Tell a compiler that the expression always evaluates to true.
// The expression should be free from any side effects.
// Some older compilers may have trouble with complex expressions, therefore
// it is advisable to split multiple conditions into separate assume statements,
// and manually check the generated code.
// OK but could fail:
//   HWY_ASSUME(x == 2 && y == 3);
// Better:
//   HWY_ASSUME(x == 2);
//   HWY_ASSUME(y == 3);
//
// The implementation is chosen in this order of preference: the [[assume]]
// attribute if the compiler reports it, then the MSVC/ICC mechanism, then
// Clang's __builtin_assume, then GCC >= 4.5 (__builtin_unreachable), and
// finally a fallback for everything else.
#if HWY_HAS_CPP_ATTRIBUTE(assume)
#define HWY_ASSUME
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME
// __builtin_assume() was added in clang 3.6.
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr)
// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
// later, so check for the compiler version directly.
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME
#else
#define HWY_ASSUME
#endif

// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
// The atomic-fence flavor requires <atomic>, which is included near the top of
// this header under the same x86 && !HWY_NO_LIBCXX condition.
#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
#define HWY_FENCE
#else
// TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
// NOTE(review): presumably expands to a comma-separated list suitable for an
// array initializer - confirm.
#define HWY_REP4(literal)

// Reports a fatal error; declared HWY_NORETURN, so control never returns to
// the caller. `file` and `line` identify the reporting location. `format` is
// a format string followed by its arguments; HWY_FORMAT(3, 4) lets the
// compiler check them where supported (format is parameter 3, the variadic
// arguments start at 4).
// NOTE(review): presumably printf-style formatting - confirm.
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

// Convenience wrapper around Abort taking only the format string and its
// arguments. NOTE(review): presumably supplies the current file and line
// itself - confirm.
#define HWY_ABORT(format, ...)

// Always enabled.
// In contrast, HWY_DASSERT below depends on HWY_IS_DEBUG_BUILD.
#define HWY_ASSERT(condition)

// Sanitizer detection. Each HWY_IS_*SAN macro is defined in both branches, so
// it can be used directly in #if. MSAN, ASAN, HWASAN and TSAN are recognized
// via any of three signals: HWY_HAS_FEATURE(...), an upper-case macro such as
// MEMORY_SANITIZER, or the corresponding __SANITIZE_*__ macro. UBSAN is
// recognized via the first two only.

// MemorySanitizer.
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
    defined(__SANITIZE_MEMORY__)
#define HWY_IS_MSAN
#else
#define HWY_IS_MSAN
#endif

// AddressSanitizer.
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
    defined(__SANITIZE_ADDRESS__)
#define HWY_IS_ASAN
#else
#define HWY_IS_ASAN
#endif

// Hardware-assisted AddressSanitizer.
#if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
    defined(__SANITIZE_HWADDRESS__)
#define HWY_IS_HWASAN
#else
#define HWY_IS_HWASAN
#endif

// ThreadSanitizer.
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
    defined(__SANITIZE_THREAD__)
#define HWY_IS_TSAN
#else
#define HWY_IS_TSAN
#endif

// UndefinedBehaviorSanitizer.
#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
    defined(UNDEFINED_BEHAVIOR_SANITIZER)
#define HWY_IS_UBSAN
#else
#define HWY_IS_UBSAN
#endif

// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
// Defined in both branches so it can be applied unconditionally.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN
#else
#define HWY_ATTR_NO_MSAN
#endif

// For enabling HWY_DASSERT and shortening tests in slower debug builds
// A build may predefine HWY_IS_DEBUG_BUILD to bypass the detection below.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
// Treated as a debug build when neither __OPTIMIZE__ nor NDEBUG is defined, or
// when any of the sanitizers detected above is active, or when running under
// the Clang static analyzer.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
    defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD
#else
#define HWY_IS_DEBUG_BUILD
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only counterpart of HWY_ASSERT: its definition depends on
// HWY_IS_DEBUG_BUILD.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition)
#else
#define HWY_DASSERT
#endif

//------------------------------------------------------------------------------
// CopyBytes / ZeroBytes

// Ask MSVC to treat memcpy/memset as intrinsics (<string.h> is included for
// MSVC near the top of this header).
#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif

// Copies from `from` to `to`; the byte count kBytes is a compile-time
// constant. Both pointers are HWY_RESTRICT-qualified.
// NOTE(review): the ranges presumably must not overlap - confirm.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {}

// Overload for a byte count only known at run time (num_of_bytes_to_copy),
// operating on untyped pointers.
HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
                       size_t num_of_bytes_to_copy) {}

// Same as CopyBytes, but for same-sized objects; avoids a size argument.
// Used by the fallback BitCastScalar implementation in this header.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {}

// Zero-fill counterpart of CopyBytes with a compile-time byte count kBytes.
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {}

// Zero-fill overload with a run-time byte count.
HWY_API void ZeroBytes(void* to, size_t num_bytes) {}

//------------------------------------------------------------------------------
// kMaxVectorSize (undocumented, pending removal)

// Three-way selection: x86, RISC-V with vector intrinsics
// (__riscv_v_intrinsic >= 11000), and everything else. HWY_ALIGN_MAX below is
// selected by exactly the same conditions.
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize =;  // AVX-512
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
// 16 bytes, i.e. a 128-bit vector.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif

//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// Same platform split as kMaxVectorSize above.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX
#else
#define HWY_ALIGN_MAX
#endif

//------------------------------------------------------------------------------
// Lane types

// hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
// BitCastScalar to be implemented before the implementations of the
// hwy::float16_t and hwy::bfloat16_t types
struct float16_t;
struct bfloat16_t;

float32_t;
float64_t;

// The three structs below are declared with 1-byte packing; their alignment
// is given explicitly via alignas. The matching pop follows K32V32.
#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {};

// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {};

// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {};

#pragma pack(pop)

// Ordering and equality for uint128_t. All operators in this section are
// `static inline` free functions taking their operands by const reference.
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                              const uint128_t& b) {}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
                                              const uint128_t& b) {}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
                                               const uint128_t& b) {}

// Ordering and equality for K64V64.
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
                                               const K64V64& b) {}

// Ordering and equality for K32V32.
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {}

//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// The helpers in this section are defined locally instead of coming from a
// standard header. NOTE(review): presumably so that the header also works
// when HWY_NO_LIBCXX is defined - confirm.

// Primary template plus a specialization for `true`. EnableIf is the
// shorthand used in template parameter lists, e.g.
// `hwy::EnableIf<condition>* = nullptr` in detail::BuiltinBitCastScalar.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {};

EnableIf;

// Type-equality trait: primary template for two arbitrary types plus a
// specialization for the case where both are the same type T.
template <typename T, typename U>
struct IsSameT {};

IsSameT<T, T>;

// Function form of IsSameT, usable in constant expressions.
template <typename T, typename U>
HWY_API constexpr bool IsSame() {}

// Returns whether T matches either of U1 or U2
template <typename T, typename U1, typename U2>
HWY_API constexpr bool IsSameEither() {}

// Compile-time type selection between Then and Else based on Condition; the
// `false` case is handled by a specialization. If is the shorthand alias.
template <bool Condition, typename Then, typename Else>
struct IfT {};

IfT<false, Then, Else>;

If;

// Detects a top-level const qualifier via a specialization for `const T`.
template <typename T>
struct IsConstT {};

IsConstT<const T>;

// Function form of IsConstT.
template <typename T>
HWY_API constexpr bool IsConst() {}

// Const-removal trait, specialized for `const T`; RemoveConst is the
// shorthand.
template <class T>
struct RemoveConstT {};
RemoveConstT<const T>;

RemoveConst;

// Volatile-removal trait, specialized for `volatile T`.
template <class T>
struct RemoveVolatileT {};
RemoveVolatileT<volatile T>;

RemoveVolatile;

// Reference-removal trait, specialized for both lvalue (T&) and rvalue (T&&)
// references.
template <class T>
struct RemoveRefT {};
RemoveRefT<T &>;
RemoveRefT<T &&>;

RemoveRef;

// Combination of the reference and cv-qualifier removal above; used e.g. as
// RemoveCvRef<To> when constraining detail::BuiltinBitCastScalar.
RemoveCvRef;

// Pointer-removal trait, specialized for plain, const, volatile and
// const volatile pointees.
template <class T>
struct RemovePtrT {};
RemovePtrT<T *>;
RemovePtrT<const T *>;
RemovePtrT<volatile T *>;
RemovePtrT<const volatile T *>;

RemovePtr;

// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes)
#define HWY_IF_V_SIZE_LE(T, kN, bytes)
#define HWY_IF_V_SIZE_GT(T, kN, bytes)

// Same idea, but constraining the number of lanes kN rather than the total
// size in bytes.
#define HWY_IF_LANES(kN, lanes)
#define HWY_IF_LANES_LE(kN, lanes)
#define HWY_IF_LANES_GT(kN, lanes)

// Constraints on the category of the lane type T. Usage example from this
// header: `template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>`.
#define HWY_IF_UNSIGNED(T)
#define HWY_IF_NOT_UNSIGNED(T)
#define HWY_IF_SIGNED(T)
#define HWY_IF_FLOAT(T)
#define HWY_IF_NOT_FLOAT(T)
#define HWY_IF_FLOAT3264(T)
#define HWY_IF_NOT_FLOAT3264(T)
#define HWY_IF_SPECIAL_FLOAT(T)
#define HWY_IF_NOT_SPECIAL_FLOAT(T)
#define HWY_IF_FLOAT_OR_SPECIAL(T)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
#define HWY_IF_INTEGER(T)

// Constraints on sizeof(T) in bytes.
#define HWY_IF_T_SIZE(T, bytes)
#define HWY_IF_NOT_T_SIZE(T, bytes)
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
// In other words, bit i of bit_array is set when a size of i bytes is
// accepted: 0x102 = (1 << 1) | (1 << 8), and 0x14 = (1 << 2) | (1 << 4).
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
#define HWY_IF_T_SIZE_LE(T, bytes)
#define HWY_IF_T_SIZE_GT(T, bytes)

// Constraints on T being (or not being) exactly the type `expected`.
#define HWY_IF_SAME(T, expected)
#define HWY_IF_NOT_SAME(T, expected)

// One of two expected types
#define HWY_IF_SAME2(T, expected1, expected2)

// Shorthands for one specific lane type each: unsigned integers, ...
#define HWY_IF_U8(T)
#define HWY_IF_U16(T)
#define HWY_IF_U32(T)
#define HWY_IF_U64(T)

// ... signed integers, ...
#define HWY_IF_I8(T)
#define HWY_IF_I16(T)
#define HWY_IF_I32(T)
#define HWY_IF_I64(T)

// ... bfloat16_t, ...
#define HWY_IF_BF16(T)
#define HWY_IF_NOT_BF16(T)

// ... float16_t, ...
#define HWY_IF_F16(T)
#define HWY_IF_NOT_F16(T)

// ... and float/double.
#define HWY_IF_F32(T)
#define HWY_IF_F64(T)

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8(T)
#define HWY_IF_UI16(T)
#define HWY_IF_UI32(T)
#define HWY_IF_UI64(T)

// Constrains the number of lanes of type T per block to LANES, given N.
// NOTE(review): "block" presumably refers to a 128-bit block - confirm.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES)

// Empty struct used as a size tag type.
// Also used with non-size values for type-category tags, see
// TypeTag/IsFloatTag further below.
template <size_t N>
struct SizeTag {};

// Helper providing the return type (`type`) of DeclVal.
template <class T>
class DeclValT {};

// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
// expression of a decltype specifier.

// hwy::DeclVal<T>() does not require that T have a public default constructor
template <class T>
HWY_API typename DeclValT<T>::type DeclVal() noexcept {}

// Array detection: primary template plus specializations for arrays of
// unknown bound (T[]) and of known bound (T[N]).
template <class T>
struct IsArrayT {};

IsArrayT<T[]>;

IsArrayT<T[N]>;

// Function form of IsArrayT.
template <class T>
static constexpr bool IsArray() {}

// Under MSVC, warning 4180 is suppressed around IsConvertibleT and restored
// afterwards.
#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
#endif

// Trait backing IsConvertible<From, To>().
template <class From, class To>
class IsConvertibleT {};

#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(pop)
#endif

// Function form of IsConvertibleT.
// NOTE(review): presumably reports implicit convertibility, as opposed to
// IsStaticCastable below - confirm.
template <class From, class To>
HWY_API constexpr bool IsConvertible() {}

// Trait backing IsStaticCastable<From, To>().
template <class From, class To>
class IsStaticCastableT {};

// Function form of IsStaticCastableT.
template <class From, class To>
static constexpr bool IsStaticCastable() {}

// SFINAE constraint built on the castability check above.
#define HWY_IF_CASTABLE(From, To)

// Variant that additionally takes an operator `op` and a `Native` type.
// NOTE(review): presumably checks that the result of applying `op` to the
// involved types is castable - confirm against the full definition.
#define HWY_IF_OP_CASTABLE(op, T, Native)

// Trait backing IsAssignable<T, From>().
template <class T, class From>
class IsAssignableT {};

// Function form of IsAssignableT.
template <class T, class From>
static constexpr bool IsAssignable() {}

// SFINAE constraint built on IsAssignable.
#define HWY_IF_ASSIGNABLE(T, From)

// ----------------------------------------------------------------------------
// IsSpecialFloat

// These types are often special-cased and not supported in all ops.
// NOTE(review): presumably refers to float16_t and bfloat16_t, the two lane
// types forward-declared above - confirm.
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {}

// -----------------------------------------------------------------------------
// IsIntegerLaneType and IsInteger

// Generic case, followed by explicit specializations for exactly the eight
// fixed-width types [u]int{8,16,32,64}_t.
template <class T>
HWY_API constexpr bool IsIntegerLaneType() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {}

// Generic case, followed by explicit specializations for the built-in
// integral types spelled by keyword (bool, the character types, and
// short/int/long/long long in both signednesses). This is a wider set than
// IsIntegerLaneType; MakeLaneTypeIfIntegerT further below keys on
// `IsInteger<T>() && !IsIntegerLaneType<T>()`.
template <class T>
HWY_API constexpr bool IsInteger() {}
template <>
HWY_INLINE constexpr bool IsInteger<bool>() {}
template <>
HWY_INLINE constexpr bool IsInteger<char>() {}
template <>
HWY_INLINE constexpr bool IsInteger<signed char>() {}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned char>() {}
template <>
HWY_INLINE constexpr bool IsInteger<short>() {}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned short>() {}
template <>
HWY_INLINE constexpr bool IsInteger<int>() {}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned>() {}
template <>
HWY_INLINE constexpr bool IsInteger<long>() {}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned long>() {}
template <>
HWY_INLINE constexpr bool IsInteger<long long>() {}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned long long>() {}
// char8_t only exists as a distinct type when the compiler reports
// __cpp_char8_t.
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
template <>
HWY_INLINE constexpr bool IsInteger<char8_t>() {}
#endif
template <>
HWY_INLINE constexpr bool IsInteger<char16_t>() {}
template <>
HWY_INLINE constexpr bool IsInteger<char32_t>() {}

// -----------------------------------------------------------------------------
// BitCastScalar

// Specifier placed on BitCastScalar and on functions built on top of it. Its
// definition depends on whether a compiler bit-cast builtin is available:
// __builtin_bit_cast, or MSVC 19.26 or newer.
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
#define HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CONSTEXPR
#endif

// Companion macro whose definition depends on C++14 relaxed constexpr
// (__cpp_constexpr >= 201304L).
#if __cpp_constexpr >= 201304L
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Customization point keyed on the source type of a bit cast; specialized
// later in this header for hwy::float16_t and hwy::bfloat16_t.
template <class From>
struct BitCastScalarSrcCastHelper {};

#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
// Workaround for Clang 9 constexpr __builtin_bit_cast bug
//
// Two mutually exclusive overloads, selected via EnableIf on whether BOTH To
// and From (after removing cv/ref qualifiers) satisfy IsInteger.

// Integer-to-integer case: avoids the builtin entirely and converts with
// static_cast. The static_assert rejects differently sized types at compile
// time.
template <class To, class From,
          hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
                        hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
  static_assert(sizeof(To) == sizeof(From),
                "sizeof(To) == sizeof(From) must be true");
  return static_cast<To>(val);
}

// All remaining combinations (at least one side is not an integer): forwards
// to __builtin_bit_cast.
template <class To, class From,
          hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
                          hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
  return __builtin_bit_cast(To, val);
}
#endif  // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000

}  // namespace detail

// Reinterprets the bits of `val` as type To (builtin-based implementation).
// Two overloads, distinguished by whether the destination type To is a
// special float: HWY_IF_NOT_SPECIAL_FLOAT(To) selects the first,
// HWY_IF_SPECIAL_FLOAT(To) the second.
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {}
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {}
#else
// Fallback used when neither __builtin_bit_cast nor a sufficiently recent MSVC
// is available: produces a `To` from `val` by handing both objects to
// CopySameSize (the same-size variant of CopyBytes).
template <class To, class From>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  To converted;  // not initialized here; CopySameSize writes into it
  CopySameSize(&val, &converted);
  return converted;
}
#endif

//------------------------------------------------------------------------------
// F16 lane type

#pragma pack(push, 1)

// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
// The three alternatives below correspond to: AArch64 without MSVC, Clang with
// bit 1 of __ARM_FP set, and GCC with __ARM_FP16_FORMAT_IEEE defined.
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_F16C
#else
#define HWY_NEON_HAVE_F16C
#endif

// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
// HWY_HAVE_FLOAT16.
// Requires the __riscv_zvfh macro and Clang 16 or newer.
#if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
#define HWY_RVV_HAVE_F16_VEC
#else
#define HWY_RVV_HAVE_F16_VEC
#endif

// x86 compiler supports _Float16, not necessarily with operators.
// Avoid clang-cl because it lacks __extendhfsf2.
// Requires SSE2, the __FLT16_MAX__ macro, and either Clang 15+ (not clang-cl)
// or GCC 12+.
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
     HWY_COMPILER_GCC_ACTUAL >= 1200)
#define HWY_SSE2_HAVE_F16_TYPE
#else
#define HWY_SSE2_HAVE_F16_TYPE
#endif

// May be predefined by the build to override the detection.
#ifndef HWY_HAVE_SCALAR_F16_TYPE
// Compiler supports _Float16, not necessarily with operators.
// Set if any of the three platform checks above succeeded.
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
#define HWY_HAVE_SCALAR_F16_TYPE
#else
#define HWY_HAVE_SCALAR_F16_TYPE
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE

// May be predefined by the build to override the detection.
#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
// Recent enough compiler also has operators.
// In addition to a scalar f16 type, one of these must hold: Clang 18+,
// GCC 12+, Clang 15+ outside of clang-cl/_WIN32, or on Arm Clang 9+/GCC 8+.
#if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
      !defined(_WIN32)) ||                                            \
     (HWY_ARCH_ARM &&                                                 \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
#define HWY_HAVE_SCALAR_F16_OPERATORS
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

namespace detail {

// Maps an operand type T to the type to use in arithmetic. TVal is T without
// cv/ref qualifiers; the unnamed bool parameter defaults to
// IsSpecialFloat<TVal>(), and a partial specialization handles the `false`
// case. Specializations for float16_t/bfloat16_t follow after those types are
// defined.
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
struct SpecialFloatUnwrapArithOpOperandT {};

SpecialFloatUnwrapArithOpOperandT<T, TVal, false>;

SpecialFloatUnwrapArithOpOperand;

// Maps a type to its wrapper class where one exists. In this primary template
// `type` is simply T (identity); specializations for float16_t::Native and
// bfloat16_t::Native follow after those types are defined.
template <class T, class TVal = RemoveCvRef<T>>
struct NativeSpecialFloatToWrapperT {
  using type = T;
};

NativeSpecialFloatToWrapper;

}  // namespace detail

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits. We use a wrapper class instead of a
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
// are generated regardless of F16 support; see #1684.
// Declared with 2-byte alignment, within the surrounding 1-byte pack region.
struct alignas(2) float16_t {};
static_assert;

// Hooks float16_t into the detail:: helpers declared earlier. Only compiled
// when the compiler has a scalar f16 type, because both specializations refer
// to the native type.
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

// Operand unwrapping is only specialized when the native type also supports
// operators.
#if HWY_HAVE_SCALAR_F16_OPERATORS
SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true>;
#endif

// Specialization of the native-to-wrapper mapping, keyed on
// float16_t::Native.
NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native>;

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Source-side customization of the builtin-based BitCastScalar for float16_t.
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926

// Specifier used on the float16_t conversion functions and comparison
// operators below; its definition depends on whether native f16 operators are
// available.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR
#else
#define HWY_F16_CONSTEXPR
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

// Converts a float16_t to float.
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {}

// HWY_F16_FROM_F32_DASSERT: debug assertion for use inside F16FromF32. It is
// defined in one of four ways, depending on whether the function might be
// evaluated at compile time and on which means exist to detect that.
#if HWY_IS_DEBUG_BUILD && \
    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
// If C++23 if !consteval support is available, only execute
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
// context to avoid compilation errors.
#define HWY_F16_FROM_F32_DASSERT
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
    HWY_COMPILER_MSVC >= 1926
// If the __builtin_is_constant_evaluated() intrinsic is available,
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
// false to avoid compilation errors if F16FromF32 is called from a
// constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition)
#else
// If C++23 if !consteval support is not available,
// the __builtin_is_constant_evaluated() intrinsic is not available,
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
// called from a constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT
#endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
#else
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
// HWY_DASSERT(condition)
#define HWY_F16_FROM_F32_DASSERT
#endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
        // HWY_COMPILER_MSVC >= 1926)

// Converts a float to float16_t.
// NOTE(review): rounding mode and overflow handling should be documented
// here - confirm against the implementation.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {}

// Converts a double to float16_t.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {}

// More convenient to define outside float16_t because these may use
// F32FromF16, which is defined after the struct.
// All operators take their operands by value and are noexcept.
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
                                         float16_t rhs) noexcept {}
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
                                         float16_t rhs) noexcept {}
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {}
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
                                         float16_t rhs) noexcept {}
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {}
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
                                         float16_t rhs) noexcept {}
// Three-way comparison, only when <compare> was detected at the top of this
// header. The result type is std::partial_ordering rather than a strong
// ordering. NOTE(review): presumably because NaN operands are unordered -
// confirm.
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
    float16_t lhs, float16_t rhs) noexcept {}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

//------------------------------------------------------------------------------
// BF16 lane type

// Compiler supports ACLE __bf16, not necessarily with operators.

// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
// in GCC 13 and earlier that sometimes causes BF16 constant values to be
// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.

// Hence: AArch64 with Clang 17+ or GCC 14+.
#if HWY_ARCH_ARM_A64 && \
    (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE
#else
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE
#endif

// x86 compiler supports __bf16, not necessarily with operators.
// May be predefined by the build; otherwise requires SSE2 and either
// Clang 17+ (not clang-cl) or GCC 13+.
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#if HWY_ARCH_X86 && defined(__SSE2__) &&                      \
    ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
     HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#else
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#endif
#endif  // HWY_SSE2_HAVE_SCALAR_BF16_TYPE

// Compiler supports __bf16, not necessarily with operators.
// Set if either the Arm or the x86 check above succeeded.
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE
#else
#define HWY_HAVE_SCALAR_BF16_TYPE
#endif

// May be predefined by the build to override the detection.
#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
// compiler errors on bf16 ToString, hence only enable on GCC for now.
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_HAVE_SCALAR_BF16_OPERATORS
#else
#define HWY_HAVE_SCALAR_BF16_OPERATORS
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

// Specifier used on the bfloat16_t conversion functions and comparison
// operators below; its definition depends on whether native bf16 operators
// are available.
#if HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_BF16_CONSTEXPR
#else
#define HWY_BF16_CONSTEXPR
#endif

// bfloat16 lane type; like float16_t a struct with 2-byte alignment rather
// than a typedef to a native type.
struct alignas(2) bfloat16_t {};
static_assert;

// Ends the pack(push, 1) region opened before the F16 section.
#pragma pack(pop)

// Hooks bfloat16_t into the detail:: helpers declared earlier. Only compiled
// when the compiler has a scalar bf16 type, because both specializations
// refer to bfloat16_t::Native.
#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {

// Only when the native type also supports operators: arithmetic on a
// bfloat16_t operand is carried out on bfloat16_t::Native.
#if HWY_HAVE_SCALAR_BF16_OPERATORS
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
  using type = hwy::bfloat16_t::Native;
};
#endif

// Specialization of the native-to-wrapper mapping, keyed on
// bfloat16_t::Native.
NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native>;

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_BF16_TYPE

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Source-side customization of the builtin-based BitCastScalar for
// bfloat16_t.
template <>
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926

// Converts a bfloat16_t to float.
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {}

namespace detail {

// Returns the increment to add to the bits of a finite F32 value to round a
// finite F32 to the nearest BF16 value
// Operates purely on the 32-bit integer representation, so it is constexpr
// regardless of HWY_BF16_CONSTEXPR.
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
    const uint32_t f32_bits) {}

// Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
// rounded to the nearest BF16 value
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
    const uint32_t f32_bits) {}

}  // namespace detail

// Converts a float to bfloat16_t.
// NOTE(review): presumably rounds to nearest using the detail helpers above -
// confirm against the implementation.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {}

// Converts a double to bfloat16_t.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {}

// More convenient to define outside bfloat16_t because these may use
// F32FromBF16, which is defined after the struct.
// All operators take their operands by value and are noexcept.

HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {}

HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {}
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {}
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {}
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {}
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {}
// Three-way comparison, only when <compare> was detected at the top of this
// header. The result type is std::partial_ordering rather than a strong
// ordering. NOTE(review): presumably because NaN operands are unordered -
// confirm.
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
    bfloat16_t lhs, bfloat16_t rhs) noexcept {}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

//------------------------------------------------------------------------------
// Type relations

namespace detail {

// Per-lane-type table of related types and category flags. Only declared in
// the primary template, so using an unsupported T fails to compile. The
// members read elsewhere in this header are is_signed, is_float and is_bf16
// (see TypeTag and IsFloatTag).
template <typename T>
struct Relations;
// Integer lane types.
template <>
struct Relations<uint8_t> {};
template <>
struct Relations<int8_t> {};
template <>
struct Relations<uint16_t> {};
template <>
struct Relations<int16_t> {};
template <>
struct Relations<uint32_t> {};
template <>
struct Relations<int32_t> {};
template <>
struct Relations<uint64_t> {};
template <>
struct Relations<int64_t> {};
template <>
struct Relations<uint128_t> {};
// Floating-point lane types, including the two special floats.
template <>
struct Relations<float16_t> {};
template <>
struct Relations<bfloat16_t> {};
template <>
struct Relations<float> {};
template <>
struct Relations<double> {};

// Maps a size in bytes to types of that size. Only declared in the primary
// template; specializations exist for 1, 2, 4, 8 and 16 bytes.
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {};
template <>
struct TypeFromSize<2> {};
template <>
struct TypeFromSize<4> {};
template <>
struct TypeFromSize<8> {};
template <>
struct TypeFromSize<16> {};

}  // namespace detail

// Aliases for types of a different category, but the same size.
MakeUnsigned;
MakeSigned;
MakeFloat;

// Aliases for types of the same category, but different size.
MakeWide;
MakeNarrow;

// Obtain type from its size [bytes].
UnsignedFromSize;
SignedFromSize;
FloatFromSize;

// Avoid confusion with SizeTag where the parameter is a lane size.
UnsignedTag;
SignedTag;  // integer
FloatTag;
SpecialTag;

// Returns a tag type describing the category of T, for use in tag dispatch:
// hwy::SizeTag<V> with V = (is_signed + is_float + is_bf16) << 8, where the
// three flags are read from R, which defaults to detail::Relations<T>.
// NOTE(review): the possible values presumably coincide with UnsignedTag,
// SignedTag, FloatTag and SpecialTag - confirm against their definitions.
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {}

// For when we only want to distinguish FloatTag from everything else.
NonFloatTag;

// Two-way variant of TypeTag: yields hwy::SizeTag<0x200> when R::is_float is
// nonzero and hwy::SizeTag<0x400> otherwise. R defaults to
// detail::Relations<T>.
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {}

//------------------------------------------------------------------------------
// Type traits

// Compile-time predicate on T.
// NOTE(review): judging by the name, true only for the 32/64-bit floating-point
// types (float, double) - confirm against the implementation.
template <typename T>
HWY_API constexpr bool IsFloat3264() {}

// Compile-time predicate on T.
// NOTE(review): presumably true for any floating-point type, including
// float16_t and bfloat16_t - confirm against the implementation.
template <typename T>
HWY_API constexpr bool IsFloat() {}

// Compile-time predicate: whether T is a signed type. The generic version is
// followed by explicit specializations for types that need special handling:
// the 16-bit special floats, hwy::uint128_t and the key/value pair types
// hwy::K64V64 and hwy::K32V32.
// NOTE(review): the values returned by the specializations are not visible in
// this excerpt - confirm before relying on them.
template <typename T>
HWY_API constexpr bool IsSigned() {}
template <>
constexpr bool IsSigned<float16_t>() {}
template <>
constexpr bool IsSigned<bfloat16_t>() {}
template <>
constexpr bool IsSigned<hwy::uint128_t>() {}
template <>
constexpr bool IsSigned<hwy::K64V64>() {}
template <>
constexpr bool IsSigned<hwy::K32V32>() {}

// Primary template of a type mapping. The unnamed bool parameter defaults to
// "T is an integer but not an integer lane type"
// (IsInteger<T>() && !IsIntegerLaneType<T>()); a specialization for `true`
// follows.
// NOTE(review): presumably the primary template leaves T unchanged and the
// `true` case substitutes a lane type of the same size - confirm.
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
struct MakeLaneTypeIfIntegerT {};

MakeLaneTypeIfIntegerT<T, true>;

MakeLaneTypeIfInteger;

// Largest/smallest representable integer values. Both are constexpr and return
// the limit as a value of type T.
template <typename T>
HWY_API constexpr T LimitsMax() {}
template <typename T>
HWY_API constexpr T LimitsMin() {}

// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Declared HWY_BITCASTSCALAR_CONSTEXPR rather than plain constexpr because
// CopySameSize is used for [b]float16_t.
// NOTE(review): whether these are usable in constant expressions depends on
// how HWY_BITCASTSCALAR_CONSTEXPR expands, which is not visible here.
//
// LowestValue: generic version plus one specialization per floating-point
// type.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {}

// Counterpart of LowestValue for the largest representable value: generic
// version plus one specialization per floating-point type (bfloat16_t,
// float16_t, float, double).
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {}

// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
// Specialized for bfloat16_t, float16_t, float and double.
// NOTE(review): the result of the generic version (reached for any other T)
// is not visible in this excerpt - confirm.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {}

// Returns width in bits of the mantissa field of T: IEEE binary16/32/64
// (float16_t, float, double) and, via its own specialization, bfloat16_t.
// NOTE(review): the result of the generic version (reached for any other T)
// is not visible in this excerpt - confirm.
template <typename T>
constexpr int MantissaBits() {}
template <>
constexpr int MantissaBits<bfloat16_t>() {}
template <>
constexpr int MantissaBits<float16_t>() {}
template <>
constexpr int MantissaBits<float>() {}
template <>
constexpr int MantissaBits<double>() {}

// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
// The result is returned as MakeSigned<T>, the signed integer alias for T.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {}

// Returns bitmask of the sign bit in IEEE binary16/32/64, expressed as
// MakeUnsigned<T> (the unsigned integer alias for T).
template <typename T>
constexpr MakeUnsigned<T> SignMask() {}

// Returns bitmask of the exponent field in IEEE binary16/32/64, expressed as
// MakeUnsigned<T> (the unsigned integer alias for T).
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {}

// Returns bitmask of the mantissa field in IEEE binary16/32/64, expressed as
// MakeUnsigned<T> (the unsigned integer alias for T).
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly.
// Specialized for bfloat16_t, float16_t, float and double.
// NOTE(review): the result of the generic version (reached for any other T)
// is not visible in this excerpt - confirm.
template <typename T>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {}

// Returns width in bits of the exponent field in IEEE binary16/32/64.
// NOTE(review): presumably derived from sizeof(T) and MantissaBits<T>() (all
// bits except the sign and the mantissa) - confirm against the implementation.
template <typename T>
constexpr int ExponentBits() {}

// Returns largest value of the biased exponent field in IEEE binary16/32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer (MakeSigned<T>) for more efficient
// comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {}

//------------------------------------------------------------------------------
// Additional F16/BF16 operators

#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS

// Helper macros local to this section (both are #undef'd at its end). Each
// invocation below passes three arguments: the operator token, the operator
// function name and the special float type (float16_t or bfloat16_t).
// NOTE(review): judging by the names, the first generates arithmetic operators
// with the special float on the right-hand side and the second generates
// comparisons between a special float and a non-special type - confirm against
// the macro bodies.
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP

#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP

// float16_t: four arithmetic and six comparison operators, plus <=> when
// three-way comparison is available.
#if HWY_HAVE_SCALAR_F16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

// bfloat16_t: the same set of operators as for float16_t above.
#if HWY_HAVE_SCALAR_BF16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

// The helpers are implementation details; do not leak them to includers.
#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS

//------------------------------------------------------------------------------
// Type conversions (after IsSpecialFloat)

// Returns a float obtained from the 16-bit value that `ptr` points to.
// NOTE(review): presumably loads a float16_t without requiring alignment and
// widens it to float - confirm against the implementation.
HWY_API float F32FromF16Mem(const void* ptr) {}

// Same as F32FromF16Mem, but for a bfloat16_t stored at `ptr`.
// NOTE(review): alignment requirements are not visible here - confirm.
HWY_API float F32FromBF16Mem(const void* ptr) {}

// Qualifier applied to the ConvertScalarTo overloads that convert between
// float16_t and bfloat16_t. Its expansion is chosen according to
// HWY_HAVE_SCALAR_F16_OPERATORS.
// NOTE(review): the two expansions are not visible in this excerpt - confirm
// which constexpr macro each branch forwards to.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_BF16_TO_F16_CONSTEXPR
#else
#define HWY_BF16_TO_F16_CONSTEXPR
#endif

// For casting from TFrom to TTo. The destination type TTo must be given
// explicitly; the overload is then selected from the source/destination
// categories via the HWY_IF_* constraints.

// Neither type is a special float and the types differ.
template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {}
// To F16 from any non-special type other than double.
template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {}
// To F16 from bfloat16_t.
template <typename TTo, HWY_IF_F16(TTo)>
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
ConvertScalarTo(const hwy::bfloat16_t in) {}
// To F16 from double (handled separately from the other non-special types).
template <typename TTo, HWY_IF_F16(TTo)>
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {}
// To BF16 from any non-special type other than double.
template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {}
// To BF16 from float16_t.
template <typename TTo, HWY_IF_BF16(TTo)>
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {}
// To BF16 from double (handled separately from the other non-special types).
template <typename TTo, HWY_IF_BF16(TTo)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {}
// From F16 to any non-special type.
template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {}
// From BF16 to any non-special type.
template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {}
// Same: return unchanged
template <typename TTo>
HWY_API constexpr TTo ConvertScalarTo(TTo in) {}

//------------------------------------------------------------------------------
// Helper functions

// Returns `a / b` rounded up to the next integer, i.e. the number of chunks of
// size `b` required to cover `a` items.
//
// Preconditions: `a >= 0`, `b > 0`, and `a + b - 1` must be representable in
// the common type of T1 and T2 (no overflow check is performed).
// The quotient is converted to T1, the type of the dividend.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  // Adding b - 1 before the truncating division rounds any nonzero remainder
  // up to the next multiple.
  return static_cast<T1>((a + b - 1) / b);
}

// Returns the smallest multiple of `align` that is >= `what`.
// Works for any nonzero `align`; if a power of two, compiler emits ADD+AND.
// `what + align - 1` must not exceed the range of size_t (no overflow check).
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  // Ceiling division followed by multiplication snaps to the next multiple.
  return ((what + align - 1) / align) * align;
}

// Returns the largest multiple of `align` that is <= `what`.
// Works for any nonzero `align`; if a power of two, compiler emits AND.
constexpr inline size_t RoundDownTo(size_t what, size_t align) {
  // Dropping the remainder cannot overflow or underflow.
  return what - (what % align);
}

namespace detail {

// Tag-dispatched helpers for the public ScalarShr: the first parameter is an
// unnamed tag that only selects the overload.

// T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
// shift
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
                                        int shift_amt) {}

// T is signed and (val >> shift_amt) is a non-arithmetic right shift
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
                                        int shift_amt) {}

}  // namespace detail

// Shifts `val` right by `shift_amt` bits. Only enabled for integer types; the
// result has type RemoveCvRef<T>.
//
// If T is a signed integer type, ScalarShr is guaranteed to perform an
// arithmetic right shift.
//
// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
// perform a logical right shift.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {}

// As the name says: number of 0-bits below the least-significant 1-bit of x.
// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {}

// 64-bit variant.
// NOTE(review): presumably also undefined for x == 0, as the _Nonzero suffix
// suggests - confirm.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {}

// As the name says: number of 0-bits above the most-significant 1-bit of x.
// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {}

// 64-bit variant.
// NOTE(review): presumably also undefined for x == 0, as the _Nonzero suffix
// suggests - confirm.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {}

// PopCount for integer types whose size is 1, 2 or 4 bytes (the size set is
// passed as a bitmask with one bit per allowed size).
// NOTE(review): presumably returns the number of set bits in x - confirm
// against the implementation.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API size_t PopCount(T x) {}

// PopCount for 8-byte integer types.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
HWY_API size_t PopCount(T x) {}

// Returns floor(log2(x)), i.e. the index of the most-significant set bit of x.
// Intended for x >= 1; since log2 is undefined for smaller values, x <= 1
// yields 0 instead of recursing without end.
//
// Written as a single return statement so that it remains a valid constexpr
// function under C++11 rules; the recursion depth is at most the bit width of
// TI.
//
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  // The cast undoes integer promotion of `x >> 1` for types narrower than int
  // so that the recursive call deduces the same TI.
  return x <= TI{1} ? size_t{0}
                    : size_t{1} + FloorLog2(static_cast<TI>(x >> 1));
}

// Returns ceil(log2(x)), i.e. the smallest n such that (1 << n) >= x.
// Intended for x >= 1; since log2 is undefined for smaller values, x <= 1
// yields 0 instead of recursing without end.
//
// Uses ceil(log2(x)) == 1 + ceil(log2(ceil(x / 2))) for x > 1. Written as a
// single return statement so that it remains a valid constexpr function under
// C++11 rules. HWY_API is omitted as for FloorLog2.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  // x / 2 + x % 2 is ceil(x / 2) without the overflow risk of (x + 1) / 2; the
  // cast undoes integer promotion for types narrower than int.
  return x <= TI{1}
             ? size_t{0}
             : size_t{1} + CeilLog2(static_cast<TI>(x / 2 + x % 2));
}

// Returns `t` plus an increment of a possibly different type T2. Three
// overloads are selected by the category of T.
// NOTE(review): judging by the name, the integer version wraps around instead
// of triggering signed-overflow UB - confirm against the implementation.

// T is a floating-point type that is not a special (16-bit) float.
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {}

// T is a special float (float16_t or bfloat16_t).
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {}

// T is not a floating-point type.
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {}

// On MSVC for x86-64, ask the compiler to treat _mul128/_umul128 as
// intrinsics.
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_mul128)
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication
// Unsigned version.
// NOTE(review): presumably returns the lower 64 bits of the product and
// stores the upper 64 bits to *upper - confirm against the implementation.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {}

// Signed version with the same calling convention.
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {}

// Precomputation for fast n / divisor and n % divisor, where n is a variable
// and divisor is unchanging but unknown at compile-time.
// NOTE(review): the members of this class are not visible in this excerpt;
// see the full definition for the supported operand widths and operations.
class Divisor {};

namespace detail {

// Tag-dispatched helpers for the public ScalarAbs: one overload per type
// category. The first parameter is an unnamed tag that only selects the
// overload.

// float/double (FloatTag).
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
                                                          T val) {}

// Special floats (SpecialTag).
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SpecialTag /*tag*/, T val) {}

// Signed integers (SignedTag).
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SignedTag /*tag*/, T val) {}

// Unsigned integers (UnsignedTag).
// NOTE(review): presumably returns val unchanged - confirm.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {}

}  // namespace detail

// Returns the absolute value of `val` as RemoveCvRef<T>.
// NOTE(review): presumably forwards to the detail::ScalarAbs overload selected
// by the type tag of T; the result for the most negative signed integer is not
// visible here - confirm.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {}

// Returns whether `val` is NaN.
// NOTE(review): behavior for non-floating-point T is not visible in this
// excerpt - confirm against the implementation.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {}

// Returns whether `val` is an infinity.
// NOTE(review): presumably true for both +inf and -inf; behavior for
// non-floating-point T is not visible here - confirm.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {}

namespace detail {

// Tag-dispatched helpers for the public ScalarIsFinite.

// Floating-point types: the result depends on `val`.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
    hwy::FloatTag /*tag*/, T val) {}

// All other types: the value is ignored (the parameter is unnamed).
// NOTE(review): presumably always returns true - confirm.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
    hwy::NonFloatTag /*tag*/, T /*val*/) {}

}  // namespace detail

// Returns whether `val` is finite.
// NOTE(review): presumably dispatches to detail::ScalarIsFinite via
// IsFloatTag, so that NaN and infinities yield false - confirm.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {}

// Combines two values of the same type T and returns RemoveCvRef<T>.
// NOTE(review): as the parameter names suggest, presumably the magnitude of
// `magn` with the sign of `sign` - confirm against the implementation.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
                                                                  T sign) {}

// Returns the state of the sign bit of `val` as a bool.
// NOTE(review): the result for unsigned T is not visible in this excerpt -
// confirm against the implementation.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {}

// Prevents the compiler from eliding the computations that led to "output".
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    !defined(_SOFT_FLOAT)
// Workaround to avoid test failures on PPC if compiled with Clang
//
// PPC with GCC/Clang and without _SOFT_FLOAT: three overloads, each passing
// `output` to an empty asm statement as a read-write ("+") operand with a
// "memory" clobber. Only the operand constraint differs between them.

// Selected by HWY_IF_F32(T): constraint "f".
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+f"(output)::"memory");
}
// Selected by HWY_IF_F64(T): constraint "d".
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+d"(output)::"memory");
}
// Selected by HWY_IF_NOT_FLOAT3264(T): constraint "r".
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+r"(output)::"memory");
}
#else
// Every other configuration: a single overload for all T.
template <class T>
HWY_API void PreventElision(T&& output) {}
#endif

}  // namespace hwy

#endif  // HIGHWAY_HWY_BASE_H_