// chromium/third_party/highway/src/hwy/ops/set_macros-inl.h

// Copyright 2020 Google LLC
// Copyright 2024 Arm Limited and/or its affiliates <[email protected]>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Sets macros based on HWY_TARGET.

// Per-target include guard, flipped by foreach_target: the guard macro changes
// state whenever its defined-ness equals that of HWY_TARGET_TOGGLE. The usual
// _H_ suffix is avoided to prevent copybara from renaming it.
// The guard closes immediately; nothing below is protected by it, so the
// macros are re-derived from the current HWY_TARGET on every include.
#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifndef HWY_SET_MACROS_PER_TARGET
#define HWY_SET_MACROS_PER_TARGET
#else
#undef HWY_SET_MACROS_PER_TARGET
#endif

#endif  // HWY_SET_MACROS_PER_TARGET

#include "hwy/detect_compiler_arch.h"  // IWYU: export
#include "hwy/detect_targets.h"        // IWYU: export

// Drop whatever a previous inclusion (for another target) defined, so that
// the per-target section below can define everything afresh.

// Namespace and vector geometry.
#undef HWY_LANES
#undef HWY_MAX_BYTES
#undef HWY_ALIGN
#undef HWY_NAMESPACE

// Type/feature availability.
#undef HWY_HAVE_FLOAT64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_TUPLE
#undef HWY_HAVE_SCALABLE

// Behavior and capability flags.
#undef HWY_CAP_GE512
#undef HWY_CAP_GE256
#undef HWY_NATIVE_DOT_BF16
#undef HWY_NATIVE_FMA
#undef HWY_MEM_OPS_MIGHT_FAULT

// 1 if the current target's bit is part of HWY_ALL_SVE, otherwise 0.
// Must expand to an integer: it is used as an #elif condition further down.
#undef HWY_TARGET_IS_SVE
#if HWY_TARGET & HWY_ALL_SVE
#define HWY_TARGET_IS_SVE 1
#else
#define HWY_TARGET_IS_SVE 0
#endif

// 1 if the current target's bit is part of HWY_ALL_NEON, otherwise 0.
// Must expand to an integer: it is used as an #elif condition further down.
#undef HWY_TARGET_IS_NEON
#if HWY_TARGET & HWY_ALL_NEON
#define HWY_TARGET_IS_NEON 1
#else
#define HWY_TARGET_IS_NEON 0
#endif

// 1 if the current target's bit is part of HWY_ALL_PPC, otherwise 0.
// Must expand to an integer: it is used as an #elif condition further down.
#undef HWY_TARGET_IS_PPC
#if HWY_TARGET & HWY_ALL_PPC
#define HWY_TARGET_IS_PPC 1
#else
#define HWY_TARGET_IS_PPC 0
#endif

// HWY_HAVE_TUPLE: 1 if tuples are supported, else 0. Supported on all targets
// except RVV (requires GCC 14 or upcoming Clang), hence 0 only for RVV built
// with GCC older than 14 or with any Clang.
#if HWY_TARGET == HWY_RVV &&                                        \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
     (HWY_COMPILER_CLANG))
#define HWY_HAVE_TUPLE 0
#else
#define HWY_HAVE_TUPLE 1
#endif

// For internal use (clamping/validating N for Simd<>).
// The scalar target has exactly one lane. Elsewhere the bound is 65536, which
// is the lane count of the largest vector this file allows (2^16 bytes on
// RVV) when lanes are one byte wide.
#undef HWY_MAX_N
#if HWY_TARGET == HWY_SCALAR
#define HWY_MAX_N 1
#else
#define HWY_MAX_N 65536
#endif

// For internal use (clamping kPow2 for Simd<>)
#undef HWY_MAX_POW2
// For HWY_TARGET == HWY_RVV, LMUL <= 8, i.e. kPow2 <= 3. Even on other
// targets, we want to support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d;
// whose kPow2 is also 3. However, those other targets do not actually support
// multiple vectors, and thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
#define HWY_MAX_POW2 3

// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
// Parenthesized so the negative literal is safe inside larger expressions.
#undef HWY_MIN_POW2
#if HWY_TARGET == HWY_RVV
// RVV vectors may span 2^16 bytes, and 65536 >> 16 == 1.
#define HWY_MIN_POW2 (-16)
#else
// Tighter bound for other targets, whose vectors are smaller, to potentially
// save compile time. Sufficient for vectors of up to 256 bytes.
#define HWY_MIN_POW2 (-8)
#endif  // HWY_TARGET == HWY_RVV

// HWY_TARGET_STR (defined per target further down) is the comma-separated
// feature list passed to the compiler's target attribute/pragma. The
// HWY_TARGET_STR_* helpers below are string literals that build on each other
// via adjacent-literal concatenation.
#undef HWY_TARGET_STR

// Optional feature groups. Each expands to "" when the user disabled it, so
// the concatenations below remain valid string literals.
#if defined(HWY_DISABLE_PCLMUL_AES)
#define HWY_TARGET_STR_PCLMUL_AES ""
#else
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
#endif

#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR_BMI2_FMA ""
#else
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
#endif

#if defined(HWY_DISABLE_F16C)
#define HWY_TARGET_STR_F16C ""
#else
#define HWY_TARGET_STR_F16C ",f16c"
#endif

#define HWY_TARGET_STR_SSE2 "sse2"

#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"

#define HWY_TARGET_STR_SSE4 \
  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
// Include previous targets, which are the half-vectors of the next target.
#define HWY_TARGET_STR_AVX2 \
  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
#define HWY_TARGET_STR_AVX3 \
  HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw"
#define HWY_TARGET_STR_AVX3_DL                                       \
  HWY_TARGET_STR_AVX3                                                \
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
  "avx512vpopcntdq,gfni"

// Force-disable for compilers that do not properly support avx512bf16.
#if !defined(HWY_AVX3_DISABLE_AVX512BF16) &&                        \
    (HWY_COMPILER_CLANGCL ||                                        \
     (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
#define HWY_AVX3_DISABLE_AVX512BF16
#endif

// AVX3_ZEN4 adds avx512bf16 on top of AVX3_DL unless it was disabled above.
#if !defined(HWY_AVX3_DISABLE_AVX512BF16)
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
#else
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
#endif

#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"

// Target strings for POWER and IBM Z. As for x86, each level is built from the
// previous one via string-literal concatenation.
#if defined(HWY_DISABLE_PPC8_CRYPTO)
#define HWY_TARGET_STR_PPC8_CRYPTO ""
#else
#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
#endif

#define HWY_TARGET_STR_PPC8 \
  "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"

#if HWY_COMPILER_CLANG
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
#else
// See #1707 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102059#c35.
// When the baseline is PPC 8 or 9, inlining functions such as PreventElision
// into PPC10 code fails because PPC10 defaults to no-htm and is thus worse than
// the baseline, which has htm. We cannot have pragma target on functions
// outside HWY_NAMESPACE such as those in base.h. It would be possible for users
// to set -mno-htm globally, but we can also work around this at the library
// level by claiming that PPC10 still has HTM, thus avoiding the mismatch. This
// seems to be safe because HTM uses builtins rather than modifying codegen, see
// https://gcc.gnu.org/legacy-ml/gcc-patches/2013-07/msg00167.html.
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10,htm"
#endif

#define HWY_TARGET_STR_Z14 "arch=z14"
#define HWY_TARGET_STR_Z15 "arch=z15"

// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.

//-----------------------------------------------------------------------------
// SSE2
// 128-bit vectors; no FMA, f16 or bf16-dot support.
#if HWY_TARGET == HWY_SSE2

#define HWY_NAMESPACE N_SSE2
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSE2

//-----------------------------------------------------------------------------
// SSSE3
// 128-bit vectors; no FMA, f16 or bf16-dot support.
#elif HWY_TARGET == HWY_SSSE3

#define HWY_NAMESPACE N_SSSE3
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3

//-----------------------------------------------------------------------------
// SSE4
// 128-bit vectors; no FMA, f16 or bf16-dot support.
#elif HWY_TARGET == HWY_SSE4

#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSE4

//-----------------------------------------------------------------------------
// AVX2
// 256-bit vectors.
#elif HWY_TARGET == HWY_AVX2

#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1

// FMA is part of the optional BMI2/FMA group, which the user may disable.
#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif
#define HWY_NATIVE_DOT_BF16 0

#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_AVX2

//-----------------------------------------------------------------------------
// AVX3, AVX3_DL, AVX3_ZEN4, AVX3_SPR
// 512-bit vectors. Settings shared by all four come first; the namespace and
// target string are then chosen per target.
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
    HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR

#define HWY_ALIGN alignas(64)
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if HWY_TARGET == HWY_AVX3_SPR && HWY_COMPILER_GCC_ACTUAL && \
    HWY_HAVE_SCALAR_F16_TYPE
// TODO: enable F16 for AVX3_SPR target with Clang once compilation issues are
// fixed
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
// NOTE(review): presumably more capable targets have smaller HWY_TARGET
// values, so this selects AVX3_ZEN4 and AVX3_SPR; confirm in detect_targets.h.
#if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16)
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1

#if HWY_TARGET == HWY_AVX3

#define HWY_NAMESPACE N_AVX3
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3

#elif HWY_TARGET == HWY_AVX3_DL

#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL

#elif HWY_TARGET == HWY_AVX3_ZEN4

#define HWY_NAMESPACE N_AVX3_ZEN4
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4

#elif HWY_TARGET == HWY_AVX3_SPR

#define HWY_NAMESPACE N_AVX3_SPR
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR

#else
#error "Logic error"
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------
// PPC8, PPC9, PPC10
// 128-bit vectors with native FMA. Shared settings first, then the namespace
// and target string per target.
#elif HWY_TARGET_IS_PPC

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_PPC8

#define HWY_NAMESPACE N_PPC8
#define HWY_TARGET_STR HWY_TARGET_STR_PPC8

#elif HWY_TARGET == HWY_PPC9

#define HWY_NAMESPACE N_PPC9
#define HWY_TARGET_STR HWY_TARGET_STR_PPC9

#elif HWY_TARGET == HWY_PPC10

#define HWY_NAMESPACE N_PPC10
#define HWY_TARGET_STR HWY_TARGET_STR_PPC10

#else
#error "Logic error"
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------
// Z14, Z15
// 128-bit vectors with native FMA. Shared settings first, then the namespace
// and target string per target.
#elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_Z14

#define HWY_NAMESPACE N_Z14
#define HWY_TARGET_STR HWY_TARGET_STR_Z14

#elif HWY_TARGET == HWY_Z15

#define HWY_NAMESPACE N_Z15
#define HWY_TARGET_STR HWY_TARGET_STR_Z15

#else
#error "Logic error"
#endif  // HWY_TARGET == HWY_Z15

//-----------------------------------------------------------------------------
// NEON
// 128-bit vectors. Several capabilities depend on the architecture level and
// on compiler-provided feature macros.
#elif HWY_TARGET_IS_NEON

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif

// f64 vectors are only enabled on AArch64.
#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#endif

#define HWY_MEM_OPS_MIGHT_FAULT 1

#if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif
#if HWY_NEON_HAVE_F32_TO_BF16C || HWY_TARGET == HWY_NEON_BF16
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif

#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_NAMESPACE N_NEON_WITHOUT_AES
#elif HWY_TARGET == HWY_NEON
#define HWY_NAMESPACE N_NEON
#elif HWY_TARGET == HWY_NEON_BF16
#define HWY_NAMESPACE N_NEON_BF16
#else
#error "Logic error, missing case"
#endif  // HWY_TARGET

// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7

// The __attribute__((target(+neon-vfpv4)) was introduced in gcc >= 8.
#if HWY_COMPILER_GCC_ACTUAL >= 800
#define HWY_TARGET_STR "+neon-vfpv4"
#else   // GCC < 8
// Do not define HWY_TARGET_STR (no pragma).
#endif  // HWY_COMPILER_GCC_ACTUAL

#else  // !HWY_ARCH_ARM_V7

#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
// GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
// target string to enable AArch64 AES intrinsics
#define HWY_TARGET_STR_NEON "+crypto"
#else
#define HWY_TARGET_STR_NEON "+aes"
#endif

// Only request +fp16 where the compiler is recent enough (Clang 16+);
// otherwise contribute nothing to the target string.
#if HWY_COMPILER_CLANG >= 1600
#define HWY_TARGET_STR_FP16 "+fp16"
#else
#define HWY_TARGET_STR_FP16 ""
#endif

#if HWY_TARGET == HWY_NEON_WITHOUT_AES
// Do not define HWY_TARGET_STR (no pragma).
#elif HWY_TARGET == HWY_NEON
#define HWY_TARGET_STR HWY_TARGET_STR_NEON
#elif HWY_TARGET == HWY_NEON_BF16
#define HWY_TARGET_STR HWY_TARGET_STR_FP16 "+bf16+dotprod" HWY_TARGET_STR_NEON
#else
#error "Logic error, missing case"
#endif  // HWY_TARGET

#endif  // !HWY_ARCH_ARM_V7
#else   // !HWY_HAVE_RUNTIME_DISPATCH
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// SVE[2]
#elif HWY_TARGET_IS_SVE

// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)

// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))

#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#if HWY_SVE_HAVE_BF16_FEATURE
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

// The fixed-size variants (SVE_256, SVE2_128) have a known vector size and are
// therefore not scalable; the generic variants allow up to 256 bytes.
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#define HWY_HAVE_SCALABLE 0
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#define HWY_HAVE_SCALABLE 0
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#endif

// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
// Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
// dispatch, which checks for AES support at runtime.
#if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
#define HWY_TARGET_STR "+sve2-aes,+sve"
#else  // SVE2 without AES
#define HWY_TARGET_STR "+sve2,+sve"
#endif
#else  // not SVE2 target
#define HWY_TARGET_STR "+sve"
#endif
#else  // !HWY_HAVE_RUNTIME_DISPATCH
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// WASM
// 128-bit vectors; no FMA, f16 or bf16-dot support.
#elif HWY_TARGET == HWY_WASM

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM

#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// WASM_EMU256
// 256-bit vectors built on the same simd128 feature as WASM.
#elif HWY_TARGET == HWY_WASM_EMU256

#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM_EMU256

#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV

// RVV only requires lane alignment, not natural alignment of the entire vector,
// and the compiler already aligns builtin types, so nothing to do here.
// (Intentionally defined as empty.)
#define HWY_ALIGN

// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536

// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_RVV_HAVE_F16_VEC
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif

#define HWY_NAMESPACE N_RVV

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)

//-----------------------------------------------------------------------------
// EMU128
// 128-bit vectors without any special instructions.
#elif HWY_TARGET == HWY_EMU128

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_EMU128

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

//-----------------------------------------------------------------------------
// SCALAR
// One lane per "vector", so no extra alignment is requested and HWY_MAX_BYTES
// is the size of the widest lane type (8 bytes).
#elif HWY_TARGET == HWY_SCALAR

#define HWY_ALIGN
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_SCALAR

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

#else
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------

// Consistency check between this header and base.h: a target may only claim
// f16 vector support when base.h also uses a built-in type for f16 scalars.
#if !HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_FLOAT16
#error "Logic error: f16 vectors but no scalars"
#endif

// Override this to 1 in asan/msan builds, which will still fault, regardless
// of what the per-target section above chose.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif

// Clang <9 requires this be invoked at file scope, before any namespace.
// Always function-like, so callers can uniformly write
// `HWY_BEFORE_NAMESPACE();`; the trailing static_assert consumes that
// semicolon. With a target string it also pushes the target attributes.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_BEFORE_NAMESPACE()        \
  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
  static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_BEFORE_NAMESPACE() \
  static_assert(true, "For requiring trailing semicolon")
#endif

// Clang <9 requires any namespaces be closed before this macro.
// Always function-like, so callers can uniformly write
// `HWY_AFTER_NAMESPACE();`; the trailing static_assert consumes that
// semicolon. With a target string it also pops the attributes pushed by
// HWY_BEFORE_NAMESPACE.
#undef HWY_AFTER_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_AFTER_NAMESPACE() \
  HWY_POP_ATTRIBUTES          \
  static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_AFTER_NAMESPACE() \
  static_assert(true, "For requiring trailing semicolon")
#endif

// Per-function target attribute. Expands to the `target` attribute carrying
// HWY_TARGET_STR when both the string and compiler support exist; otherwise
// it is intentionally empty (a no-op).
#undef HWY_ATTR
#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
#else
#define HWY_ATTR
#endif