chromium/third_party/highway/src/hwy/ops/shared-inl.h

// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target definitions shared by ops/*.h and user code.

// IWYU pragma: begin_exports
// Export does not seem to be recursive, so re-export these (also in base.h)
#include <stddef.h>

#include "hwy/base.h"
// "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
#if !HWY_IDE
#include <stdint.h>
#endif

#include "hwy/detect_compiler_arch.h"
#include "hwy/detect_targets.h"

// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"

// IWYU pragma: end_exports

#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif

// We are covered by the highway.h include guard, but generic_ops-inl.h
// includes this again #if HWY_IDE.
// clang-format off
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE)  // NOLINT
// clang-format on
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
// Affected GCC configurations: vector/mask arguments travel by const
// reference to avoid the miscompilations described above.
template <class V>
using VecArg = const V&;
#else
// Everywhere else, vector/mask arguments are passed by value.
template <class V>
using VecArg = V;
#endif

namespace detail {

// Maps a Highway lane type to the type expected by intrinsics. The primary
// template is the identity mapping, which covers every lane type that is not
// one of our wrapper structs.
template <typename T>
struct NativeLaneTypeT {
  using type = T;
};
// NOTE(review): the [b]float16_t specializations are empty in this view, so
// NativeLaneType<hwy::float16_t> and NativeLaneType<hwy::bfloat16_t> have no
// `type` to resolve to. The native (or u16) type each should name is not
// visible here - restore from the full file.
template <>
struct NativeLaneTypeT<hwy::float16_t> {};
template <>
struct NativeLaneTypeT<hwy::bfloat16_t> {};

// The type expected by intrinsics for the given Highway lane type T. This
// usually matches T, but differs for our wrapper types [b]float16_t. Use this
// only when defining intrinsic wrappers, and NOT for casting, which is UB.
template <typename T>
using NativeLaneType = typename NativeLaneTypeT<T>::type;

// Returns the same pointer after changing type to NativeLaneType. Use this only
// for wrapper functions that call intrinsics (e.g. load/store) where some of
// the overloads expect _Float16* or __bf16* arguments. For non-special floats,
// this returns the same pointer and type.
//
// This makes use of the fact that a wrapper struct is pointer-interconvertible
// with its first member (a union), thus also with the union members. Do NOT
// call both this and U16LanePointer on the same object - they access different
// union members, and this is not guaranteed to be safe.
// Non-special lane types: the pointer is handed back unchanged (same address,
// same type), so callers can use NativeLanePointer uniformly for all lanes.
template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE T* NativeLanePointer(T* p) {
  return p;
}
// Overload constrained by HWY_IF_F16(T). NT defaults to NativeLaneType of the
// non-const T; the return type is `const NT*` or `NT*` as chosen by
// If<IsConst<T>(), ...>.
// NOTE(review): the body is empty in this view, so no pointer is returned;
// flowing off the end of a value-returning function is undefined behavior.
// The union member whose address should be returned is not visible here.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
          HWY_IF_F16(T)>
HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {}
// Overload constrained by HWY_IF_BF16(T); same signature shape as the
// HWY_IF_F16 overload, with NT defaulting to NativeLaneType of the non-const T.
// NOTE(review): the body is empty in this view, so no pointer is returned
// (undefined behavior for a value-returning function). The union member whose
// address should be returned is not visible here.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
          HWY_IF_BF16(T)>
HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {}

// Returns a pointer to the u16 member of our [b]float16_t wrapper structs.
// Use this in Highway targets that lack __bf16 intrinsics; for storing to
// memory, we BitCast vectors to u16 and write to the pointer returned here.
// Do NOT call both this and NativeLanePointer on the same object - they access
// different union members, and this is not guaranteed to be safe.
// Constrained by HWY_IF_SPECIAL_FLOAT(T). The return type is `const uint16_t*`
// or `uint16_t*` as chosen by If<IsConst<T>(), ...>.
// NOTE(review): the body is empty in this view although a pointer must be
// returned (undefined behavior as written). The name of the u16 union member
// is not visible here.
template <typename T, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE If<IsConst<T>(), const uint16_t*, uint16_t*> U16LanePointer(T* p) {}

// Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the
// desired fraction or multiple of it, see Simd<>. `pow2` is most often in
// [-3, 3] but can also be lower for user-specified fractions.
// A non-negative pow2 multiplies via left shift. A negative pow2 requests a
// fraction and is applied as a right shift by -pow2, because a negative shift
// count is not valid. The result truncates toward zero (e.g. 1 * 2^-3 == 0).
// Single return statement so this remains usable in constant expressions.
constexpr size_t ScaleByPower(size_t N, int pow2) {
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
}

// Marks `count` elements of type T starting at `unaligned` as initialized for
// MemorySanitizer. Only has an effect when HWY_IS_MSAN is set (which is also
// when <sanitizer/msan_interface.h> is included above); otherwise this is a
// no-op and the arguments are only referenced to avoid unused warnings.
template <typename T>
HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
#if HWY_IS_MSAN
  // The MSAN interface takes a size in bytes, not elements.
  __msan_unpoison(unaligned, count * sizeof(T));
#else
  (void)unaligned;
  (void)count;
#endif
}

}  // namespace detail

// Highway operations are implemented as overloaded functions selected using a
// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
//
// N defines how many lanes are in a 'full' vector, typically equal to
// HWY_LANES(T) (which is the actual count on targets with vectors of known
// size, and an upper bound in case of scalable vectors), otherwise a
// user-specified limit at most that large.
//
// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
// means two/four/eight full vectors ganged together. The largest supported
// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
// have the same `MaxLanes` and `Lanes`.
//
// We can theoretically keep halving Lanes(), but recursive instantiations of
// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
//
// WARNING: do not use N directly because it may be a special representation of
// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
// but we want MaxLanes to be the same in both cases. Hence ?? is a
// fixed-point encoding of 1/4.
//
// Instead of referring to Simd<> directly, users create D via aliases:
// - ScalableTag<T> for a full vector;
// - ScalableTag<T, kPow2>() for a fraction/group, where `kPow2` is
//   interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
//
// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
// NOTE(review): the struct body is empty in this view. Other parts of this
// file expect members that are not declared here: the comment above mentions
// D().MaxLanes(), the HWY_POW2_D comment mentions D().Pow2(), and
// detail::BlockDFromD_t calls D::template NewN<kNewPow2, kMaxLpb>().
// Restore the member definitions from the full file.
template <typename Lane, size_t N, int kPow2>
struct Simd {};

namespace detail {

// Returns whether the tag describes exactly one full vector: N equals
// HWY_LANES(T) and no fraction/multiple is applied (kPow2 == 0). The argument
// only carries the tag type; its value is unused.
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  return N == HWY_LANES(T) && kPow2 == 0;
}

// Struct wrappers enable validation of arguments via static_assert.
// NOTE(review): empty in this view. Going by the name and by the alias
// comments (user-specified kPow2 is interpreted as
// HWY_MIN(kPow2, HWY_MAX_POW2)), this presumably yields the tag with N and
// kPow2 clamped - confirm against the full file.
template <typename T, size_t N, int kPow2>
struct ClampNAndPow2 {};

// NOTE(review): empty in this view. The parameter list matches
// ScalableTag<T, kPow2>, so this is presumably its argument-validating
// wrapper - confirm against the full file.
template <typename T, int kPow2>
struct ScalableTagChecker {};

// NOTE(review): empty in this view. The parameter list matches CappedTag
// (lane type, kLimit, kPow2), so this is presumably where kLimit is validated
// - confirm against the full file.
template <typename T, size_t kLimit, int kPow2>
struct CappedTagChecker {};

// NOTE(review): empty in this view. FixedTag is documented as requiring
// kNumLanes to be a power of two not exceeding HWY_LANES(T); presumably that
// requirement is enforced here - confirm against the full file.
template <typename T, size_t kNumLanes>
struct FixedTagChecker {};

}  // namespace detail

// ------------------------------ Aliases for Simd<>

// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
// loops where the application does not care about the vector size) or a
// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
// return values of type promotion and demotion. User-specified kPow2 is
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
ScalableTag;

// Tag describing a vector with *up to* kLimit active lanes, even on targets
// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
// DCTs. However, it is better if data structures are designed to be
// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
CappedTag;

#if !HWY_HAVE_SCALABLE
// If the vector size is known, and the app knows it does not want more than
// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
CappedTagIfFixed;
#else  // HWY_HAVE_SCALABLE
// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = ScalableTag<T, kPow2>;
#endif

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g. if the baseline is known to support SIMD, or the application requires
//   ops such as TableLookupBytes not supported by HWY_SCALAR.
FixedTag;

// Convenience form for fixed sizes.
Full16;

Full32;

Full64;

Full128;

// ------------------------------ Accessors for Simd<>

// Lane type.
// NOTE(review): only the alias name is left on this line; its
// `template <...> using TFromD = ...;` declaration is not visible in this
// view and must be restored from the full file.
TFromD;

// Upper bound on the number of lanes, typically used for SFINAE conditions and
// to allocate storage for targets with known vector sizes. Note: this may be a
// loose bound, instead use Lanes() as the actual size for AllocateAligned.
// MSVC workaround: use static constant directly instead of a function.
// NOTE(review): the replacement list is empty in this view, yet
// detail::BlockDFromD_t passes HWY_MAX_LANES_D(D) as the second argument of
// HWY_MIN. The static constant this should name is not visible here.
#define HWY_MAX_LANES_D(D)

// Same as D().Pow2(), but this is too complex for SFINAE with MSVC, so we use a
// static constant directly.
// NOTE(review): the replacement list is empty in this view; the static
// constant this should name is not visible here.
#define HWY_POW2_D(D)

// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
// macro form may be required for MSVC, which has limitations on deducing
// arguments.
// Returns the tag's constexpr upper bound on the lane count by forwarding to
// its MaxLanes() member. The (unnamed) argument only selects the tag type.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return D().MaxLanes();
}

#if !HWY_HAVE_SCALABLE

// If non-scalable, this is constexpr; otherwise the target's header defines a
// non-constexpr version of this function. This is the actual vector length,
// used when advancing loop counters.
// With a vector size known at compile time, the lane count is a property of
// the tag type alone, so this returns the tag's constexpr MaxLanes(). The
// (unnamed) argument only selects the tag type.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
  return D().MaxLanes();
}

#endif  // !HWY_HAVE_SCALABLE

// Tag for the same number of lanes as D, but with the LaneType T.
// NOTE(review): only the alias names remain below; the
// `template <...> using X = ...;` declarations for Rebind and its
// RebindTo{Signed,Unsigned,Float} shorthands are not visible in this view.
Rebind;

RebindToSigned;
RebindToUnsigned;
RebindToFloat;

// Tag for the same total size as D, but with the LaneType T.
// NOTE(review): only the alias names remain below; the
// `template <...> using X = ...;` declarations for Repartition and its
// RepartitionTo{Wide,Narrow} shorthands are not visible in this view.
Repartition;

RepartitionToWide;
RepartitionToNarrow;

// Shorthand for applying RepartitionToWide twice (for 8/16-bit types).
RepartitionToWideX2;
// Shorthand for applying RepartitionToWide three times (for 8-bit types).
RepartitionToWideX3;

// Tag for the same lane type as D, but half the lanes.
// NOTE(review): only the alias name remains; its declaration is not visible
// in this view.
Half;

// Tag for the same lane type as D, but twice the lanes.
// NOTE(review): only the alias name remains; its declaration is not visible
// in this view.
Twice;

// Tag for a 16-byte block with the same lane type as D
#if HWY_HAVE_SCALABLE
namespace detail {

// Primary template: has no `type` member; only the Simd<> partial
// specialization below provides one.
template <class D>
class BlockDFromD_t {};

// Computes the tag for one 16-byte block of T from Simd<T, N, kPow2>.
template <typename T, size_t N, int kPow2>
class BlockDFromD_t<Simd<T, N, kPow2>> {
 private:
  using SrcD = Simd<T, N, kPow2>;

  // Positive kPow2 is clamped to zero; fractions (kPow2 < 0) are kept.
  static constexpr int kBlockPow2 = HWY_MIN(kPow2, 0);

  // Lanes per block: as many T as fit in 16 bytes, but no more than the
  // source tag's HWY_MAX_LANES_D.
  static constexpr size_t kBlockLanes =
      HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(SrcD));

  // N for the result tag, obtained from the source tag's NewN helper.
  static constexpr size_t kBlockN =
      SrcD::template NewN<kBlockPow2, kBlockLanes>();

 public:
  using type = Simd<T, kBlockN, kBlockPow2>;
};

}  // namespace detail

// Resolves to the `type` member of detail::BlockDFromD_t instantiated with
// RemoveConst<D>.
template <class D>
using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
#else
// NOTE(review): only the alias name remains for the non-scalable case; its
// `template <...> using BlockDFromD = ...;` declaration is not visible in
// this view.
BlockDFromD;
#endif

// Returns whether `ptr` is a multiple of `Lanes(d)` elements.
// The check is done on the numeric address: it must be divisible by the size
// in bytes of Lanes(d) elements of type T.
template <class D, typename T>
HWY_API bool IsAligned(D d, T* ptr) {
  const size_t bytes_per_vector = Lanes(d) * sizeof(T);
  return reinterpret_cast<uintptr_t>(ptr) % bytes_per_vector == 0;
}

// ------------------------------ Choosing overloads (SFINAE)

// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// NOTE(review): every macro in this group has an empty replacement list in
// this view. Used as a template parameter (the way the base.h macros are used
// in `template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)>` earlier in this
// file), an empty expansion would leave a dangling comma. The intended
// expansions are not visible here.
#define HWY_IF_UNSIGNED_D(D)
#define HWY_IF_NOT_UNSIGNED_D(D)
#define HWY_IF_SIGNED_D(D)
#define HWY_IF_FLOAT_D(D)
#define HWY_IF_NOT_FLOAT_D(D)
#define HWY_IF_FLOAT3264_D(D)
#define HWY_IF_NOT_FLOAT3264_D(D)
#define HWY_IF_SPECIAL_FLOAT_D(D)
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D)
#define HWY_IF_FLOAT_OR_SPECIAL_D(D)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)

// Variants that additionally take a `bytes` or `bit_array` argument.
#define HWY_IF_T_SIZE_D(D, bytes)
#define HWY_IF_NOT_T_SIZE_D(D, bytes)
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
#define HWY_IF_T_SIZE_LE_D(D, bytes)
#define HWY_IF_T_SIZE_GT_D(D, bytes)

// Variants that additionally take a `lanes` argument.
#define HWY_IF_LANES_D(D, lanes)
#define HWY_IF_LANES_LE_D(D, lanes)
#define HWY_IF_LANES_GT_D(D, lanes)
#define HWY_IF_LANES_PER_BLOCK_D(D, lanes)

// NOTE(review): under HWY_COMPILER_MSVC these two are object-like macros (no
// parameter list), whereas the non-MSVC definitions take (D, pow2). An
// invocation such as HWY_IF_POW2_LE_D(D, 0) would therefore leave `(D, 0)`
// behind on MSVC. All four replacement lists are empty in this view - confirm
// the intended definitions against the full file.
#if HWY_COMPILER_MSVC
#define HWY_IF_POW2_LE_D
#define HWY_IF_POW2_GT_D
#else
#define HWY_IF_POW2_LE_D(D, pow2)
#define HWY_IF_POW2_GT_D(D, pow2)
#endif  // HWY_COMPILER_MSVC

// NOTE(review): all macros from here through HWY_IF_V_SIZE_GT_D have empty
// replacement lists in this view; the intended expansions are not visible.
#define HWY_IF_U8_D(D)
#define HWY_IF_U16_D(D)
#define HWY_IF_U32_D(D)
#define HWY_IF_U64_D(D)

#define HWY_IF_I8_D(D)
#define HWY_IF_I16_D(D)
#define HWY_IF_I32_D(D)
#define HWY_IF_I64_D(D)

// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8_D(D)
#define HWY_IF_UI16_D(D)
#define HWY_IF_UI32_D(D)
#define HWY_IF_UI64_D(D)

#define HWY_IF_BF16_D(D)
#define HWY_IF_NOT_BF16_D(D)

#define HWY_IF_F16_D(D)
#define HWY_IF_NOT_F16_D(D)

#define HWY_IF_F32_D(D)
#define HWY_IF_F64_D(D)

// Variants concerning the vector size; the HWY_IF_* forms additionally take
// a `bytes` argument.
#define HWY_V_SIZE_D(D)
#define HWY_IF_V_SIZE_D(D, bytes)
#define HWY_IF_V_SIZE_LE_D(D, bytes)
#define HWY_IF_V_SIZE_GT_D(D, bytes)

// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
// NOTE(review): all macros in this group have empty replacement lists in this
// view; the intended expansions are not visible here.
#define HWY_IF_UNSIGNED_V(V)
#define HWY_IF_NOT_UNSIGNED_V(V)
#define HWY_IF_SIGNED_V(V)
#define HWY_IF_FLOAT_V(V)
#define HWY_IF_NOT_FLOAT_V(V)
#define HWY_IF_SPECIAL_FLOAT_V(V)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)

#define HWY_IF_T_SIZE_V(V, bytes)
#define HWY_IF_NOT_T_SIZE_V(V, bytes)
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)

#define HWY_MAX_LANES_V(V)
#define HWY_IF_V_SIZE_V(V, bytes)
#define HWY_IF_V_SIZE_LE_V(V, bytes)
#define HWY_IF_V_SIZE_GT_V(V, bytes)

// Use in implementations of ReduceSum etc. to avoid conflicts with the N=1 and
// N=4 8-bit specializations in generic_ops-inl.
// Each macro below is #undef'd immediately before it is (re)defined.
// NOTE(review): presumably this is because the header body can be compiled
// more than once (see the toggle guard near the top of this file) - confirm.
// All replacement lists are empty in this view.
#undef HWY_IF_REDUCE_D
#define HWY_IF_REDUCE_D(D)

#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D)

#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D)

#undef HWY_IF_ADDSUB_V
#define HWY_IF_ADDSUB_V(V)

#undef HWY_IF_MULADDSUB_V
#define HWY_IF_MULADDSUB_V(V)

// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
// generic_ops-inl.h for at least some of the unsigned to signed demotions on
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2

#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V)

// Old names (deprecated)
// NOTE(review): both replacement lists are empty in this view. Presumably
// these forward to HWY_IF_T_SIZE_D / HWY_IF_NOT_T_SIZE_D, which take the same
// (D, bytes) arguments - confirm against the full file.
#define HWY_IF_LANE_SIZE_D(D, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes)

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_OPS_SHARED_TOGGLE