#include "hwy/base.h"
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif
HWY_BEFORE_NAMESPACE(…);
namespace hwy {
namespace HWY_NAMESPACE {
LaneType;
Vec;
Mask;
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) { … }
#if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE
template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) { … }
#endif
template <class D>
HWY_API Vec<D> SignBit(D d) { … }
template <class D>
HWY_API Vec<D> NaN(D d) { … }
template <class D>
HWY_API Vec<D> Inf(D d) { … }
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
namespace detail {
#if HWY_HAVE_SCALABLE
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> ,
    hwy::SizeTag<kToVectSize> , DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  // Bit-cast v into a byte vector of the destination width; bytes past the
  // source vector's size are unspecified at this point.
  const Repartition<uint8_t, DTo> du8_to;
  const auto v_u8 = ResizeBitCast(du8_to, v);
  // Number of bytes actually provided by the source vector.
  const size_t from_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
  // Keep the source bytes, force everything past them to zero, cast back.
  const auto zero_padded = IfThenElseZero(FirstN(du8_to, from_bytes), v_u8);
  return BitCast(d_to, zero_padded);
}
#else
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> ,
hwy::SizeTag<kToVectSize> , DTo d_to, DFrom ,
VFromD<DFrom> v) { … }
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> ,
hwy::SizeTag<kToVectSize> , DTo d_to, DFrom d_from,
VFromD<DFrom> v) { … }
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize> ,
hwy::SizeTag<kToVectSize> , DTo d_to, DFrom ,
VFromD<DFrom> v) { … }
#endif
}
#endif
template <class DTo, class DFrom>
HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
VFromD<DFrom> v) { … }
template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
T* HWY_RESTRICT to) { … }
template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
T* HWY_RESTRICT to) { … }
#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API Mask<DFromV<V>> IsNegative(V v) { … }
#endif
#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif
template <class D>
HWY_API Mask<D> MaskFalse(D d) { … }
#endif
#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenElseZero(V v, V yes) { … }
#endif
#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenZeroElse(V v, V no) { … }
#endif
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V ZeroIfNegative(V v) { … }
#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { … }
#endif
#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif
template <class DTo, class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) { … }
#endif
#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif
template <class DTo, class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) { … }
#endif
#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) { … }
#endif
#endif
#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif
template <class D>
HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) { … }
#endif
#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) { … }
#endif
#endif
#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
Mask<DFrom> b) { … }
#endif
#endif
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateLeft(V v) { … }
#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
#undef HWY_NATIVE_INTERLEAVE_WHOLE
#else
#define HWY_NATIVE_INTERLEAVE_WHOLE
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) { … }
#endif
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class V>
HWY_API V InterleaveWholeLower(V a, V b) { … }
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class V>
HWY_API V InterleaveEven(V a, V b) { … }
#endif
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) { … }
template <class V, HWY_IF_ADDSUB_V(V)>
HWY_API V AddSub(V a, V b) { … }
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif
template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) { … }
template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { … }
#endif
#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#else
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#endif
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  // Lanes where mask is negative are negated; other lanes pass through.
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // Predicated subtract (0 - v) so only the selected lanes are negated,
  // instead of materializing Neg(v) for every lane.
  const auto k0 = Zero(DFromV<V>());
  return MaskedSubOr(v, Lt(mask, k0), k0, v);
#else
  return IfNegativeThenElse(mask, Neg(v), v);
#endif
}
#endif
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { … }
#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedNeg(V v) { … }
template <class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) { … }
#endif
#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif
template <class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) { … }
#endif
#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedAbs(V v) {
  // Saturating |v|: Max(v, SaturatedNeg(v)) picks the non-negative of the
  // pair; SaturatedNeg handles the LimitsMin lane (which plain Neg would
  // wrap) — presumably clamping to LimitsMax, per its name.
  const V negated = SaturatedNeg(v);
  return Max(v, negated);
}
#endif
#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif
namespace detail {
struct AddFunc { … };
struct MinFunc { … };
struct MaxFunc { … };
template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) { … }
template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D , Func f, VFromD<D> v) { … }
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) { … }
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) { … }
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) { … }
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) { … }
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) { … }
}
template <class D, HWY_IF_SUM_OF_LANES_D(D)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { … }
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) { … }
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) { … }
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) { … }
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) { … }
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) { … }
#endif
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceSum(D , VFromD<D> v) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMin(D , VFromD<D> v) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMax(D , VFromD<D> v) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D , VFromD<D> v) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D , VFromD<D> v) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D , VFromD<D> v) { … }
#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) { … }
#endif
#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#else
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
#endif
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) { … }
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) { … }
#endif
#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_EITHER_NAN
#undef HWY_NATIVE_IS_EITHER_NAN
#else
#define HWY_NATIVE_IS_EITHER_NAN
#endif
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
  // Per lane: true when a or b (or both) is NaN in that lane.
  const auto a_is_nan = IsNaN(a);
  const auto b_is_nan = IsNaN(b);
  return Or(a_is_nan, b_is_nan);
}
#endif
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) { … }
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) { … }
#endif
#if HWY_IDE || \
(defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1) { … }
namespace detail {
#if HWY_IDE
template <class V>
HWY_INLINE V ShuffleTwo1230(V v, V ) {
  // IDE-only placeholder so code navigation/completion works; real targets
  // supply the actual two-vector shuffle.
  return v;
}
template <class V>
HWY_INLINE V ShuffleTwo2301(V v, V ) {
  // IDE-only placeholder so code navigation/completion works; real targets
  // supply the actual two-vector shuffle.
  return v;
}
template <class V>
HWY_INLINE V ShuffleTwo3012(V v, V ) {
  // IDE-only placeholder so code navigation/completion works; real targets
  // supply the actual two-vector shuffle.
  return v;
}
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& A, VFromD<D>& B,
VFromD<D>& C) { … }
}
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { … }
namespace detail {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& vA, VFromD<D>& vB,
VFromD<D>& vC, VFromD<D>& vD) { … }
}
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { … }
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { … }
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
VFromD<D>& v3) { … }
namespace detail {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
}
template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void StoreInterleaved2(V part0, V part1, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
namespace detail {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
D d, TFromD<D>* HWY_RESTRICT unaligned) { … }
}
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, D dh,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
namespace detail {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
VFromD<D> vD, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
}
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
VFromD<D> v3, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
VFromD<D> v3, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, VFromD<D> part3, D ,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, VFromD<D> part3, D ,
TFromD<D>* HWY_RESTRICT unaligned) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, VFromD<D> part3, D d,
TFromD<D>* HWY_RESTRICT unaligned) { … }
#endif
#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif
#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {
template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
VFromD<DFrom> v) { … }
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  // Full vector requested: a plain unaligned load is safe.
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  // Recurse on half-width vectors.
  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (num_lanes <= half_N) {
    // Everything fits in the lower half; the upper half becomes zero.
    return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
  }
  // Lower half is fully populated; load the remainder into the upper half.
  const VFromD<decltype(dh)> lo = LoadU(dh, p);
  const VFromD<decltype(dh)> hi = LoadN(dh, p + half_N, num_lanes - half_N);
  return Combine(d, hi, lo);
}
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  // Full vector requested: a plain unaligned load is safe.
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  // Recurse on half-width vectors, padding unloaded lanes from `no`.
  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  const VFromD<decltype(dh)> no_h = LowerHalf(no);
  if (num_lanes <= half_N) {
    // Loaded lanes live in the lower half; keep the upper half of `no`.
    const VFromD<decltype(dh)> lo = LoadNOr(no_h, dh, p, num_lanes);
    return ConcatUpperLower(d, no, ResizeBitCast(d, lo));
  }
  // Lower half is fully populated; remainder (padded) in the upper half.
  const VFromD<decltype(dh)> lo = LoadU(dh, p);
  const VFromD<decltype(dh)> hi =
      LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
  return Combine(d, hi, lo);
}
#endif
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
size_t num_lanes) { … }
#else
template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  // `num_lanes` is unsigned, so `<= 0` was a tautological comparison
  // (flagged by -Wtype-limits-style warnings); `== 0` states the intent.
  // Skip the masked load entirely so we never touch memory for zero lanes.
  if (num_lanes == 0) return Zero(d);
#endif
  // Load the first num_lanes lanes; masked-off lanes are zero.
  return MaskedLoad(FirstN(d, num_lanes), d, p);
}
template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  // `num_lanes` is unsigned, so `<= 0` was a tautological comparison
  // (flagged by -Wtype-limits-style warnings); `== 0` states the intent.
  // Skip the masked load entirely so we never touch memory for zero lanes.
  if (num_lanes == 0) return no;
#endif
  // Load the first num_lanes lanes; masked-off lanes come from `no`.
  return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
}
#endif
#endif
#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) { … }
template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) { … }
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) { … }
#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  // Whole vector requested: a plain unaligned store suffices.
  const size_t N = Lanes(d);
  if (max_lanes_to_store >= N) {
    StoreU(v, d, p);
    return;
  }

  // Recurse on half-width vectors.
  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (max_lanes_to_store <= half_N) {
    // Only (part of) the lower half is stored.
    StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
    return;
  }
  // Store the full lower half, then the remainder from the upper half.
  StoreU(LowerHalf(dh, v), dh, p);
  StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
}
#endif
#else
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  // Store at most max_lanes_to_store lanes, never more than the vector has.
  const size_t num_to_store = HWY_MIN(max_lanes_to_store, Lanes(d));
#if HWY_MEM_OPS_MIGHT_FAULT
  // Do not touch memory at all when zero lanes are requested.
  if (num_to_store == 0) return;
#endif
  BlendedStore(v, FirstN(d, num_to_store), d, p);
  // Inform sanitizers that the bytes just written are initialized.
  detail::MaybeUnpoison(p, num_to_store);
}
#endif
#endif
#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif
template <class D, typename T = TFromD<D>>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> offset) { … }
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index) { … }
template <class D, typename T = TFromD<D>>
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index) { … }
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index,
const size_t max_lanes_to_store) { … }
#else
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  // Scatter only the first max_lanes_to_store lanes via a masked scatter.
  const MFromD<D> store_mask = FirstN(d, max_lanes_to_store);
  MaskedScatterIndex(v, store_mask, d, base, index);
}
#endif
#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> offset) { … }
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index) { … }
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index) { … }
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index) { … }
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index,
const size_t max_lanes_to_load) { … }
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> index,
const size_t max_lanes_to_load) { … }
#else
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  // Gather only the first max_lanes_to_load lanes via a masked gather.
  const MFromD<D> load_mask = FirstN(d, max_lanes_to_load);
  return MaskedGatherIndex(load_mask, d, base, index);
}
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
                                 VFromD<RebindToSigned<D>> index,
                                 const size_t max_lanes_to_load) {
  // Gather the first max_lanes_to_load lanes; remaining lanes come from `no`.
  const MFromD<D> load_mask = FirstN(d, max_lanes_to_load);
  return MaskedGatherIndexOr(no, load_mask, d, base, index);
}
#endif
#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V AbsDiff(V a, V b) { … }
#endif
#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#endif
template <class V, HWY_IF_UI8_D(DFromV<V>),
          HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
  // |a - b| per 8-bit lane, then sum each group of 8 lanes into one lane
  // widened three times (8 -> 16 -> 32 -> 64 bits).
  const DFromV<decltype(a)> d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWideX3<decltype(d8)> dw;
  const auto abs_diff = AbsDiff(a, b);
  return BitCast(dw, SumsOf8(BitCast(du8, abs_diff)));
}
#endif
#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif
template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) { … }
template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) { … }
#endif
#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) { … }
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) { … }
#endif
#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif
template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) { … }
template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) { … }
#endif
#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif
template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) { … }
template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) { … }
#endif
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
class V2 = VFromD<Rebind<TFromV<V>, DN>>,
hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> DemoteTo(DN dn, V v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
class V2 = VFromD<Repartition<TFromV<V>, DN>>,
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { … }
#endif
template <class D, class V>
HWY_API VFromD<D> PromoteLowerTo(D d, V v) { … }
#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) { … }
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { … }
#endif
#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif
namespace detail {
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) { … }
}
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) { … }
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
VFromD<Repartition<float, D>> b) { … }
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
VFromD<Repartition<float, D>> b) { … }
#endif
#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif
#if HWY_HAVE_INTEGER64
// Generic fallback: on targets where PromoteTo(f32 -> {u,i}64) is available,
// the "InRange" variant simply forwards to it.
// NOTE(review): per the op's naming, behavior for inputs not representable in
// TFromD<D64> is presumably unspecified — confirm against Highway's op list.
template <class D64, HWY_IF_UI64_D(D64)>
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
  return PromoteTo(d64, v);
}
#endif
#endif
#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif
// Generic fallback: forwards to ConvertTo. Enabled only for lane sizes that
// have a matching float type on this target: 2 bytes iff f16 is available,
// 4 bytes always, 8 bytes iff f64 is available.
// NOTE(review): per the op's naming, results for out-of-range inputs are
// presumably unspecified — confirm against Highway's op list.
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
                                         (1 << 4) |
                                         (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
  return ConvertTo(di, v);
}
#endif
#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif
#if HWY_HAVE_FLOAT64
// Generic fallback: on targets where DemoteTo(f64 -> {u,i}32) is available,
// the "InRange" variant simply forwards to it.
// NOTE(review): out-of-range behavior is presumably unspecified for the
// InRange op — confirm against Highway's op list.
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
  return DemoteTo(d32, v);
}
#endif
#endif
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) { … }
#endif
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) { … }
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
TypeTag , hwy::SizeTag<kLaneSize> , V v) { … }
}
template <class V>
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) { … }
#endif
namespace detail {
template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
TypeTag , hwy::SizeTag<kLaneSize> , V v) { … }
}
template <class V>
HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) { … }
#if HWY_IDE || \
(defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) { … }
#endif
#endif
#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif
namespace detail {
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { … }
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
#endif
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
#endif
#if HWY_TARGET == HWY_SCALAR
template <class D>
using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
F32ExpLzcntMinMaxRepartition;
#else
template <class D>
using F32ExpLzcntMinMaxRepartition =
Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
#endif
F32ExpLzcntMinMaxCmpV;
template <class V>
HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) { … }
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { … }
template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_INLINE V UIntToF32BiasedExp(V v) { … }
template <class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { … }
template <class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { … }
}
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) { … }
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) { … }
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) { … }
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
template <class V>
HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
V affine_tblU) { … }
template <class V>
HWY_INLINE V SubBytes(V state) { … }
template <class V>
HWY_INLINE V InvSubBytes(V state) { … }
}
#endif
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
template <class V>
HWY_INLINE V ShiftRows(const V state) { … }
template <class V>
HWY_INLINE V InvShiftRows(const V state) { … }
template <class V>
HWY_INLINE V GF2P8Mod11BMulBy2(V v) { … }
template <class V>
HWY_INLINE V MixColumns(const V state) { … }
template <class V>
HWY_INLINE V InvMixColumns(const V state) { … }
}
template <class V>
HWY_API V AESRound(V state, const V round_key) { … }
template <class V>
HWY_API V AESLastRound(V state, const V round_key) { … }
template <class V>
HWY_API V AESInvMixColumns(V state) { … }
template <class V>
HWY_API V AESRoundInv(V state, const V round_key) { … }
template <class V>
HWY_API V AESLastRoundInv(V state, const V round_key) { … }
template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V AESKeyGenAssist(V v) { … }
template <class V>
HWY_API V CLMulLower(V a, V b) { … }
template <class V>
HWY_API V CLMulUpper(V a, V b) { … }
#endif
#endif
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
#undef HWY_IF_POPCNT
#if HWY_TARGET == HWY_RVV
#define HWY_IF_POPCNT …
#else
#define HWY_IF_POPCNT(D) …
#endif
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
HWY_API V PopulationCount(V v) { … }
#if HWY_TARGET != HWY_RVV
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API V PopulationCount(V v) { … }
#endif
template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
HWY_API V PopulationCount(V v) { … }
template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
HWY_API V PopulationCount(V v) { … }
#if HWY_HAVE_INTEGER64
template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
HWY_API V PopulationCount(V v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
template <class V, HWY_IF_T_SIZE_V(V, 1),
HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_API V operator*(const V a, const V b) { … }
template <class V, HWY_IF_T_SIZE_V(V, 1),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_API V operator*(const V a, const V b) { … }
#endif
#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif
template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
HWY_IF_NOT_FLOAT_V(V)>
HWY_API V operator*(V x, V y) { … }
template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
HWY_IF_V_SIZE_GT_D(D64, 8)>
HWY_API V operator*(V x, V y) { … }
template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
HWY_IF_V_SIZE_GT_D(DI64, 8)>
HWY_API V operator*(V x, V y) { … }
#endif
#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif
#ifdef HWY_NATIVE_INT_FMSUB
#undef HWY_NATIVE_INT_FMSUB
#else
#define HWY_NATIVE_INT_FMSUB
#endif
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulAdd(V mul, V x, V add) { … }
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V NegMulAdd(V mul, V x, V add) { … }
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulSub(V mul, V x, V sub) { … }
#endif
#if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_FMSUB
#undef HWY_NATIVE_INT_FMSUB
#else
#define HWY_NATIVE_INT_FMSUB
#endif
// Integer fused multiply-subtract fallback: mul * x - sub.
// Implemented as mul * x + (-sub); the negation is carried out on signed
// lanes (via BitCast to the signed type) so the two's-complement result is
// also correct for unsigned vectors.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulSub(V mul, V x, V sub) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const V neg_sub = BitCast(d, Neg(BitCast(di, sub)));
  return MulAdd(mul, x, neg_sub);
}
#endif
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V NegMulSub(V mul, V x, V sub) { … }
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) { … }
template <class V, HWY_IF_MULADDSUB_V(V)>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) { … }
#if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif
namespace detail {
template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) { … }
template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) { … }
#if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
// f32 -> {u,i}64 conversion for the integer-division path on targets that
// have 64-bit integers but no f64: there is no same-width ConvertTo here, so
// widen via PromoteInRangeTo instead.
// NOTE(review): the division algorithm presumably keeps values within the
// destination range — confirm at the call sites.
template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvFloatToInt(D d64, V vf) {
  return PromoteInRangeTo(d64, vf);
}
// i64 -> f32 conversion for targets with 64-bit integers but no f64:
// converts the two 32-bit halves of each lane separately and recombines them
// as hi * 2^32 + lo.
template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
  const Twice<decltype(df32)> dt_f32;
  // Reinterpret each i64 lane as two i32 lanes and convert both to f32.
  // Note that the low half is thereby converted as a *signed* value.
  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
#if HWY_IS_LITTLE_ENDIAN
  // Little-endian: even 32-bit lanes are the low halves, odd lanes the high.
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif
  const RebindToSigned<decltype(df32)> di32;
  // If lo_f32 is negative (sign bit set), the signed conversion of the low
  // half produced low_bits - 2^32; add 1.0 to hi so hi * 2^32 + lo
  // reconstructs the original value.
  hi_f32 =
      Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
                      Set(df32, 1.0f)));
  // 4294967296.0f == 2^32.
  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}
// u64 -> f32 conversion for targets with 64-bit integers but no f64:
// converts the two unsigned 32-bit halves of each lane separately and
// recombines them as hi * 2^32 + lo. No sign compensation is needed since
// both halves are converted as unsigned.
template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
  const Twice<decltype(df32)> dt_f32;
  // Reinterpret each u64 lane as two u32 lanes and convert both to f32.
  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
#if HWY_IS_LITTLE_ENDIAN
  // Little-endian: even 32-bit lanes are the low halves, odd lanes the high.
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif
  // 4294967296.0f == 2^32.
  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}
#endif
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { … }
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { … }
template <size_t kOrigLaneSize, class V,
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
HWY_IF_V_SIZE_LE_V(
V, HWY_MAX_BYTES /
((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
HWY_INLINE V IntDiv(V a, V b) { … }
template <size_t kOrigLaneSize, class V,
HWY_IF_T_SIZE_ONE_OF_V(V,
(HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { … }
#if !HWY_HAVE_FLOAT16
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { … }
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { … }
#endif
template <size_t kOrigLaneSize, class V,
HWY_IF_T_SIZE_ONE_OF_V(V,
(HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
HWY_INLINE V IntDiv(V a, V b) { … }
#if HWY_HAVE_FLOAT64
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { … }
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) { … }
#endif
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
HWY_TARGET == HWY_WASM ||
HWY_TARGET == HWY_WASM_EMU256)
? 0
: (1 << 1)) |
(1 << 2) | (1 << 4) | (1 << 8))>
HWY_INLINE V IntMod(V a, V b) { … }
#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
HWY_TARGET == HWY_WASM_EMU256
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) { … }
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) { … }
#endif
}
#if HWY_TARGET == HWY_SCALAR
// Scalar integer division: dispatches to the generic helper (implemented via
// float division, per detail::IntDivUsingFloatDiv), tagged with the
// original lane size.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
// Scalar integer remainder: dispatches to detail::IntMod, tagged with the
// original lane size.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
#else
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) { … }
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) { … }
#if HWY_CAP_GE256
// 256-bit integer division (targets with HWY_CAP_GE256): same dispatch to
// the generic helper as the 128-bit overload.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
// 256-bit integer remainder (targets with HWY_CAP_GE256).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
#endif
#if HWY_CAP_GE512
// 512-bit integer division (targets with HWY_CAP_GE512).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
// 512-bit integer remainder (targets with HWY_CAP_GE512).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
#endif
#endif
#endif
#if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MUL_EVEN_BF16
#undef HWY_NATIVE_MUL_EVEN_BF16
#else
#define HWY_NATIVE_MUL_EVEN_BF16
#endif
template <class DF, HWY_IF_F32_D(DF),
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) { … }
template <class DF, HWY_IF_F32_D(DF),
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) { … }
#endif
#if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#else
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#endif
template <class DF, HWY_IF_F32_D(DF),
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
VFromD<DF> sum0,
VFromD<DF>& sum1) { … }
#endif
#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#else
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#endif
template<class D, HWY_IF_INTEGER(TFromD<D>),
class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
VFromD<D> low, VFromD<D>& high) { … }
#endif
#if 0
#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#else
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#endif
#if HWY_HAVE_FLOAT16
// NOTE: currently disabled (enclosed in "#if 0" above) — f16 -> f32
// specialization of WidenMulAccumulate.
template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
                                     VFromD<D> low, VFromD<D>& high) {
  // Accumulate the widened products of the upper halves into `high`; return
  // the updated lower-half accumulator.
  high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
}
#endif
#endif
#endif
#if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#else
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#endif
// Generic emulation of the u8 x i8 pairwise widening multiply-add: widens
// the even and odd lanes of both inputs to i16, multiplies pairwise, and
// combines each pair of products with a saturating add.
template <class DI16, class VU8, class VI8,
          class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
          HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
  const RebindToUnsigned<decltype(di16)> du16;
  // a is unsigned: zero-extend via the u16 domain, then reinterpret as i16.
  const auto a_even = BitCast(di16, PromoteEvenTo(du16, a));
  const auto a_odd = BitCast(di16, PromoteOddTo(du16, a));
  // b is signed: sign-extend directly to i16.
  const auto b_even = PromoteEvenTo(di16, b);
  const auto b_odd = PromoteOddTo(di16, b);
  return SaturatedAdd(Mul(a_even, b_even), Mul(a_odd, b_odd));
}
#endif
#if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) { … }
#endif
#if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#else
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#endif
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
VFromD<Rebind<int16_t, DI32>> a,
VFromD<Rebind<int16_t, DI32>> b,
VFromD<DI32> sum) { … }
#endif
#if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
VFromD<Repartition<int8_t, DI32>> a,
VFromD<Repartition<int8_t, DI32>> b,
VFromD<DI32> sum) { … }
#endif
#if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif
template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { … }
#endif
#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { … }
#endif
#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#endif
#if HWY_HAVE_INTEGER64
template <class DI64, HWY_IF_I64_D(DI64)>
HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) { … }
#endif
#endif
#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#endif
#if HWY_HAVE_INTEGER64
template <class DU64, HWY_IF_U64_D(DU64)>
HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) { … }
#endif
#endif
#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif
#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocal(V v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif
#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocalSqrt(V v) { … }
#endif
#endif
#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif
template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
T* unaligned) { … }
template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { … }
template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBlendedStore(V v, M mask, D d,
T* HWY_RESTRICT unaligned) { … }
template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V Compress(V v, const M mask) { … }
template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { … }
template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressNot(V v, M mask) { … }
#endif
#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif
namespace detail {
#if HWY_IDE
// IDE-only stub (guarded by HWY_IDE above) so code completion can resolve
// BitsFromMask; never compiled into real targets.
template <class M>
HWY_INLINE uint64_t BitsFromMask(M ) {
  return 0;
}
#endif
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) { … }
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { … }
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { … }
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { … }
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { … }
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { … }
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
const TFromD<D>* HWY_RESTRICT unaligned) { … }
#endif
IndicesFromD;
#if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
template <class D>
HWY_API VFromD<D> TwoTablesLookupLanes(D , VFromD<D> a, VFromD<D> b,
IndicesFromD<D> idx) { … }
#endif
#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif
#undef HWY_PREFER_ROTATE
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
#define HWY_PREFER_ROTATE …
#else
#define HWY_PREFER_ROTATE …
#endif
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { … }
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { … }
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { … }
#endif
#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif
template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_API V ReverseLaneBytes(V v) { … }
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V ReverseLaneBytes(V v) { … }
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_API V ReverseLaneBytes(V v) { … }
#endif
#undef HWY_REVERSE_BITS_MIN_BYTES
#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
#define HWY_REVERSE_BITS_MIN_BYTES …
#else
#define HWY_REVERSE_BITS_MIN_BYTES …
#endif
#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif
namespace detail {
template <int kShiftAmt, int kShrResultMask, class V,
HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) { … }
#if HWY_REVERSE_BITS_MIN_BYTES == 2
template <int kShiftAmt, int kShrResultMask, class V,
HWY_IF_V_SIZE_D(DFromV<V>, 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) { … }
#endif
}
template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ReverseBits(V v) { … }
#endif
#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#else
#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
#endif
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V ReverseBits(V v) { … }
#endif
#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
// Returns a vector of type D whose bytes hold the u32 pattern
// {x0, x1, x2, x3} duplicated into each 128-bit block (via
// Dup128VecFromValues), resized/bit-cast to D.
template <class D>
HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                             const uint32_t x2,
                                             const uint32_t x1,
                                             const uint32_t x0) {
#if HWY_TARGET == HWY_RVV
  // RVV: load with d's LMUL, clamped to at least -1 (mf2) so the four-lane
  // pattern fits.
  constexpr int kPow2 = d.Pow2();
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
  const ScalableTag<uint32_t, kLoadPow2> d_load;
#else
  constexpr size_t kMaxBytes = d.MaxBytes();
#if HWY_TARGET_IS_NEON
  // NEON also has 64-bit (2 x u32) vectors, so two lanes may suffice.
  constexpr size_t kMinLanesToLoad = 2;
#else
  constexpr size_t kMinLanesToLoad = 4;
#endif
  // Enough u32 lanes to cover D, but no fewer than the target minimum.
  constexpr size_t kNumToLoad =
      HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
  const CappedTag<uint32_t, kNumToLoad> d_load;
#endif
  return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
}
}
#endif
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
// Per2LaneBlockShuffle: one overload per SizeTag selector value 0..3, which
// encodes the shuffle applied to each pair of adjacent lanes.
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> , V v) { … }
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> , V v) { … }
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> , V v) { … }
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> , V v) { … }
// Packs the four per-4-lane-block lane indices idx3..idx0 into a single
// uint32_t (presumably one byte per index — see the U8x4 name).
HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
                                           const uint32_t idx2,
                                           const uint32_t idx1,
                                           const uint32_t idx0) { … }
// Builds a u8 vector of table-lookup indices for the idx3..idx0
// per-4-lane-block shuffle.
template <class D>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) { … }
// On scalable-vector, SVE and EMU128 targets, the lane-index
// (IndicesFromD-based) table-lookup path below is enabled instead of the
// byte-index path.
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128
#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE …
#else
#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE …
// Performs the table lookup of v using the index vector idx; enabled for
// 1/2/4-byte lanes.
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) { … }
// TblLookupPer4LaneBlkShufIdx: builds the shuffle-index vector; one overload
// per lane size (dispatched via HWY_IF_T_SIZE_D).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) { … }
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) { … }
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) { … }
#endif
// Returns a vector of within-block lane indices for the idx3..idx0 shuffle;
// this overload handles 1-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) { … }
#if HWY_TARGET == HWY_RVV
// RVV path for lanes wider than one byte: build the per-4-lane-block indices
// as u8, then widen them to the lane type of D via PromoteTo.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  // Same lane count as D, but with uint8_t lanes.
  const Rebind<uint8_t, D> du8;
  const auto u8_idx_in_blk =
      TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0);
  return PromoteTo(d, u8_idx_in_blk);
}
#else
// Non-RVV TblLookupPer4LaneBlkIdxInBlk overloads for 2/4/8-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) { … }
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) { … }
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) { … }
#endif
// Lane-index (IndicesFromD) flavor of the shuffle-index builder and lookup,
// enabled only on targets selected by HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE.
template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                                       const uint32_t idx2,
                                                       const uint32_t idx1,
                                                       const uint32_t idx0) { … }
template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) { … }
#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE
// Applies the per-4-lane-block shuffle selected by the runtime value idx3210
// using a table lookup.
template <class V>
HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) { … }
// Generic Per4LaneBlockShuffle fallback: kIdx3210 encodes all four lane
// indices; kLaneSize/kVectSize enable size-specific dispatch.
template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> ,
                                  hwy::SizeTag<kLaneSize> ,
                                  hwy::SizeTag<kVectSize> ,
                                  V v) { … }
#if HWY_HAVE_FLOAT64
// Per4LaneBlockShufCastToWide: reinterprets v as a vector with double-width
// lanes. The float/4-byte overload requires f64 support.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::FloatTag , hwy::SizeTag<4> , V v) { … }
#endif
// Float lanes without a wider float type: widen via the unsigned rebind.
template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
Per4LaneBlockShufCastToWide(hwy::FloatTag ,
                            hwy::SizeTag<kLaneSize> , V v) { … }
template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::NonFloatTag ,
    hwy::SizeTag<kLaneSize> , V v) { … }
// Specializations for specific index patterns. 0x1B encodes
// idx3..idx0 = {0,1,2,3}, i.e. reversing each 4-lane block.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> , V v) { … }
template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> , V v) { … }
template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> , V v) { … }
#if HWY_MAX_BYTES >= 32
// 0x4E encodes idx3..idx0 = {1,0,3,2}. With 8-byte lanes, a 4-lane block is
// two 128-bit blocks, so this is exactly a swap of adjacent blocks.
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  const V swapped = SwapAdjacentBlocks(v);
  return swapped;
}
#endif
// Further Per4LaneBlockShuffle specializations, dispatched on the index
// pattern (SizeTag value), lane size and lane count.
template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> , V v) { … }
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> , V v) { … }
template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> , V v) { … }
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> , V v) { … }
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> , V v) { … }
template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> , V v) { … }
// 0xE4 encodes idx3..idx0 = {3,2,1,0}: the identity permutation.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> , V v) { … }
template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> , V v) { … }
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> , V v) { … }
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> , V v) { … }
// Catch-all for every other kIdx3210 pattern.
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) { … }
}
#endif
// Public Per4LaneBlockShuffle: permutes each 4-lane block of v using the
// compile-time lane indices kIdx3..kIdx0. Overloads dispatch on the lane
// count of V (1, 2, or more than 2).
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V Per4LaneBlockShuffle(V v) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) { … }
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_GT_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) { … }
#endif
// Returns the number of 16-byte blocks in a vector of type D.
template <class D>
HWY_API size_t Blocks(D d) { … }
#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
#undef HWY_NATIVE_BLK_INSERT_EXTRACT
#else
#define HWY_NATIVE_BLK_INSERT_EXTRACT
#endif
// Block insert/extract/broadcast for vectors of at most 16 bytes (which have
// exactly one 16-byte block, so kBlockIdx can only select block 0).
template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V InsertBlock(V , V blk_to_insert) { … }
template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V ExtractBlock(V v) { … }
template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastBlock(V v) { … }
#endif
#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BROADCASTLANE
#undef HWY_NATIVE_BROADCASTLANE
#else
#define HWY_NATIVE_BROADCASTLANE
#endif
// Broadcasts lane kLane of v to all lanes; overload for vectors <= 16 bytes.
template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastLane(V v) { … }
#endif
#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
#undef HWY_NATIVE_SLIDE1_UP_DOWN
#else
#define HWY_NATIVE_SLIDE1_UP_DOWN
#endif
// Slide1Up/Slide1Down: special cases for single-lane vectors (the input
// value is unused, hence the unnamed parameter).
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> ) { … }
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> ) { … }
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// General overloads for multi-lane vectors of at most 16 bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) { … }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) { … }
#endif
#endif
// SlideUpBlocks for vectors of at most 16 bytes: only one block exists, so
// the input descriptor is unused (unnamed parameter).
template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D , VFromD<D> v) { … }
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
// Slides v up by kBlocks whole 16-byte blocks via SlideUpLanes; overload for
// vectors larger than 16 bytes (scalable or SVE-256 targets).
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  // Number of lanes of TFromD<D> in one 16-byte block.
  constexpr size_t kBlockLanes = 16 / sizeof(TFromD<D>);
  const size_t slide_amt = static_cast<size_t>(kBlocks) * kBlockLanes;
  return SlideUpLanes(d, v, slide_amt);
}
#endif
// SlideDownBlocks for vectors of at most 16 bytes: only one block exists, so
// the input descriptor is unused (unnamed parameter).
template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D , VFromD<D> v) { … }
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
// Slides v down by kBlocks whole 16-byte blocks via SlideDownLanes; overload
// for vectors larger than 16 bytes (scalable or SVE-256 targets).
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  // Number of lanes of TFromD<D> in one 16-byte block.
  constexpr size_t kBlockLanes = 16 / sizeof(TFromD<D>);
  const size_t slide_amt = static_cast<size_t>(kBlocks) * kBlockLanes;
  return SlideDownLanes(d, v, slide_amt);
}
#endif
#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE_MASK
#undef HWY_NATIVE_SLIDE_MASK
#else
#define HWY_NATIVE_SLIDE_MASK
#endif
// Mask analogues of the lane-slide ops: shift a mask by one lane or by a
// runtime lane count `amt`.
template <class D>
HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) { … }
template <class D>
HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) { … }
template <class D>
HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) { … }
template <class D>
HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) { … }
#endif
#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Sums of absolute differences over adjacent quads of u8/i8 lanes, with
// compile-time offsets into a and b; returns widened (16-bit-lane) sums.
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) { … }
#endif
#endif
#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Like SumsOfAdjQuadAbsDiff, but a is first shuffled per the kIdx3..kIdx0
// selectors; returns widened (16-bit-lane) sums.
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) { … }
#endif
#endif
#if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif
#if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
// BitShuffle: gathers bits of each 64-bit lane of v according to the u8/i8
// index vector idx.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
HWY_API V BitShuffle(V v, VI idx) { … }
#endif
#endif
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif
// Named function-call replacements for the overloaded arithmetic, shift and
// comparison operators. The comparisons return the target's mask type, as
// deduced from decltype(a == b).
template <class V>
HWY_API V Add(V a, V b) { … }
template <class V>
HWY_API V Sub(V a, V b) { … }
template <class V>
HWY_API V Mul(V a, V b) { … }
template <class V>
HWY_API V Div(V a, V b) { … }
template <class V>
HWY_API V Mod(V a, V b) { … }
template <class V>
V Shl(V a, V b) { … }
template <class V>
V Shr(V a, V b) { … }
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) { … }
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) { … }
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) { … }
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) { … }
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) { … }
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) { … }
#endif
}
}
HWY_AFTER_NAMESPACE(…);