llvm/lib/Support/xxhash.cpp

/*
 *  xxHash - Extremely Fast Hash algorithm
 *  Copyright (C) 2012-2023, Yann Collet
 *
 *  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are
 *  met:
 *
 *  * Redistributions of source code must retain the above copyright
 *  notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *  copyright notice, this list of conditions and the following disclaimer
 *  in the documentation and/or other materials provided with the
 *  distribution.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *  You can contact the author at :
 *  - xxHash homepage: http://www.xxhash.com
 *  - xxHash source repository : https://github.com/Cyan4973/xxHash
 */

// xxhash64 is based on commit d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed
// everything but a simple interface for computing xxh64.

// xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July
// 2023).

// xxh3_128bits is based on commit b0adcc54188c3130b1793e7b19c62eb1e669f7df
// (June 2024).

#include "llvm/Support/xxhash.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"

#include <stdlib.h>

#if !defined(LLVM_XXH_USE_NEON)
#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&      \
    !defined(__ARM_BIG_ENDIAN)
#define LLVM_XXH_USE_NEON 1
#else
#define LLVM_XXH_USE_NEON 0
#endif
#endif

#if LLVM_XXH_USE_NEON
#include <arm_neon.h>
#endif

using namespace llvm;
using namespace support;

static uint64_t rotl64(uint64_t X, size_t R) {
  return (X << R) | (X >> (64 - R));
}

constexpr uint32_t PRIME32_1 = 0x9E3779B1;
constexpr uint32_t PRIME32_2 = 0x85EBCA77;
constexpr uint32_t PRIME32_3 = 0xC2B2AE3D;

static const uint64_t PRIME64_1 = 11400714785074694791ULL;
static const uint64_t PRIME64_2 = 14029467366897019727ULL;
static const uint64_t PRIME64_3 = 1609587929392839161ULL;
static const uint64_t PRIME64_4 = 9650029242287828579ULL;
static const uint64_t PRIME64_5 = 2870177450012600261ULL;

static uint64_t round(uint64_t Acc, uint64_t Input) {}

static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {}

static uint64_t XXH64_avalanche(uint64_t hash) {}

uint64_t llvm::xxHash64(StringRef Data) {}

uint64_t llvm::xxHash64(ArrayRef<uint8_t> Data) {}

constexpr size_t XXH3_SECRETSIZE_MIN = 136;
constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192;

/* Pseudorandom data taken directly from FARSH */
// clang-format off
constexpr uint8_t kSecret[XXH_SECRET_DEFAULT_SIZE] =;
// clang-format on

constexpr uint64_t PRIME_MX1 = 0x165667919E3779F9;
constexpr uint64_t PRIME_MX2 = 0x9FB21C651E98DF25;

// Calculates a 64-bit to 128-bit multiply, then XOR folds it.
static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) {}
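
// Illustrative sketch (not part of the original file): the multiply-and-fold
// described above is equivalent to taking the full 128-bit product and XORing
// its two 64-bit halves. The helper name is hypothetical and this version
// assumes the __uint128_t compiler extension (GCC/Clang); the real function
// also carries fallbacks for compilers without it.
#if defined(__SIZEOF_INT128__)
[[maybe_unused]] static uint64_t mul128_fold64_sketch(uint64_t lhs,
                                                      uint64_t rhs) {
  __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
  return (uint64_t)product ^ (uint64_t)(product >> 64);
}
#endif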

constexpr size_t XXH_STRIPE_LEN = 64;
constexpr size_t XXH_SECRET_CONSUME_RATE = 8;
constexpr size_t XXH_ACC_NB = XXH_STRIPE_LEN / sizeof(uint64_t);

static uint64_t XXH3_avalanche(uint64_t hash) {}

static uint64_t XXH3_len_1to3_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {}

static uint64_t XXH3_len_4to8_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {}

static uint64_t XXH3_len_9to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_0to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {}

static uint64_t XXH3_mix16B(const uint8_t *input, uint8_t const *secret,
                            uint64_t seed) {}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_17to128_64b(const uint8_t *input, size_t len,
                                     const uint8_t *secret,
                                     uint64_t const seed) {}
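
// Illustrative sketch (not part of the original file) of the mum-hash style
// mixing step referenced above: two 64-bit input words are keyed with secret
// material and the seed, multiplied out to 128 bits, and folded back to
// 64 bits. The helper name is hypothetical and the keying follows the shape
// of the upstream XXH3_mix16B; it is a sketch, not the elided body.
[[maybe_unused]] static uint64_t mix16B_sketch(const uint8_t *input,
                                               const uint8_t *secret,
                                               uint64_t seed) {
  uint64_t lhs = endian::read64le(input) ^ (endian::read64le(secret) + seed);
  uint64_t rhs =
      endian::read64le(input + 8) ^ (endian::read64le(secret + 8) - seed);
  return XXH3_mul128_fold64(lhs, rhs);
}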

constexpr size_t XXH3_MIDSIZE_MAX = 240;
constexpr size_t XXH3_MIDSIZE_STARTOFFSET = 3;
constexpr size_t XXH3_MIDSIZE_LASTOFFSET = 17;

LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
                                      const uint8_t *secret, uint64_t seed) {}

#if LLVM_XXH_USE_NEON

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_scrambleAcc XXH3_scrambleAcc_neon

// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
// (July 2024), with the following removed:
// - workaround for suboptimal codegen on older GCC
// - compiler barriers against instruction reordering
// - WebAssembly SIMD support
// - configurable split between NEON and scalar lanes (benchmarking shows no
//   penalty when fully doing SIMD on the Apple M1)

#if defined(__GNUC__) || defined(__clang__)
#define XXH_ALIASING __attribute__((__may_alias__))
#else
#define XXH_ALIASING
#endif

typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;

LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
                                     const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

#ifdef __clang__
#pragma clang loop unroll(full)
#endif
  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
    /* data_vec = input[i]; */
    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));

    /* key_vec  = secret[i];  */
    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));

    /* data_swap = swap(data_vec) */
    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);

    /* data_key = data_vec ^ key_vec; */
    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

    /*
     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
     * get one vector with the low 32 bits of each lane, and one vector
     * with the high 32 bits of each lane.
     *
     * The intrinsic returns a double vector because the original ARMv7-a
     * instruction modified both arguments in place. AArch64 and SIMD128 emit
     * two instructions from this intrinsic.
     *
     *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
     *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
     */
    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
                                      vreinterpretq_u32_u64(data_key_2));

    /* data_key_lo = data_key & 0xFFFFFFFF */
    uint32x4_t data_key_lo = unzipped.val[0];
    /* data_key_hi = data_key >> 32 */
    uint32x4_t data_key_hi = unzipped.val[1];

    /*
     * Then we can split the vectors horizontally and multiply. Like most
     * widening intrinsics, the multiply has a variant that works on the high
     * half vectors for free on AArch64. A similar instruction is available on
     * SIMD128.
     *
     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
     */
    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
                                 vget_low_u32(data_key_hi));
    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
                                 vget_high_u32(data_key_hi));

    /* xacc[i] = acc_vec + sum; */
    xacc[i] = vaddq_u64(xacc[i], sum_1);
    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
  }
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

  /* { prime32_1, prime32_1 } */
  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
  /* { 0, prime32_1, 0, prime32_1 } */
  uint32x4_t const kPrimeHi =
      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));

  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
    /* xacc[i] ^= (xacc[i] >> 47); */
    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

    /* xacc[i] ^= secret[i]; */
    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t data_key = veorq_u64(data_vec, key_vec);

    /*
     * xacc[i] *= XXH_PRIME32_1
     *
     * Expanded version with portable NEON intrinsics
     *
     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
     *
     * prod_hi = hi(data_key) * lo(prime) << 32
     *
     * Since we only need 32 bits of this multiply a trick can be used,
     * reinterpreting the vector as a uint32x4_t and multiplying by
     * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
     * shift.
     */
    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);

    /* Extract low bits for vmlal_u32  */
    uint32x2_t data_key_lo = vmovn_u64(data_key);

    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
  }
}
#else

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                       const uint8_t *secret) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {}
#endif

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
                            const uint8_t *secret, size_t nbStripes) {}

static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {}

static uint64_t XXH3_mergeAccs(const uint64_t *acc, const uint8_t *key,
                               uint64_t start) {}

LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, size_t secretSize) {}

uint64_t llvm::xxh3_64bits(ArrayRef<uint8_t> data) {}

/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit
 * variant, even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that longer hashes are about as fast as the 64-bit version
 * due to it using only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

/*!
 * @internal
 * @def XXH_rotl32(x,r)
 * @brief 32-bit rotate left.
 *
 * @param x The 32-bit integer to be rotated.
 * @param r The number of bits to rotate.
 * @pre
 *   @p r > 0 && @p r < 32
 * @note
 *   @p x and @p r may be evaluated multiple times.
 * @return The rotated result.
 */
#if __has_builtin(__builtin_rotateleft32) &&                                   \
    __has_builtin(__builtin_rotateleft64)
#define XXH_rotl32 __builtin_rotateleft32
#define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for minGW (GCC under windows), performance seems
 * poor */
#elif defined(_MSC_VER)
#define XXH_rotl32(x, r) _rotl(x, r)
#define XXH_rotl64(x, r) _rotl64(x, r)
#else
#define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

#define XXH_mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))

/*!
 * @brief Calculates a 64->128-bit long multiply.
 *
 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
 * version.
 *
 * @param lhs , rhs The 64-bit integers to be multiplied
 * @return The 128-bit result represented in an @ref XXH128_hash_t.
 */
static XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) {}
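
// Illustrative sketch (not part of the original file): the portable scalar
// fallback mentioned in the comment above, built from four 32x32->64 partial
// products recombined with carries. The helper name is hypothetical; the real
// XXH_mult64to128 prefers __uint128_t or _umul128 when the compiler has them.
[[maybe_unused]] static XXH128_hash_t mult64to128_scalar_sketch(uint64_t lhs,
                                                                uint64_t rhs) {
  /* lhs = a*2^32 + b, rhs = c*2^32 + d; product = ac*2^64 + (ad+bc)*2^32 + bd */
  uint64_t lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
  uint64_t hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
  uint64_t lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
  uint64_t hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);

  /* The middle terms cannot overflow 64 bits. */
  uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  XXH128_hash_t r128;
  r128.low64 = lower;
  r128.high64 = upper;
  return r128;
}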

/*! Seems to produce slightly better code on GCC for some reason. */
LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr uint64_t XXH_xorshift64(uint64_t v64,
                                                               int shift) {
  return v64 ^ (v64 >> shift);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_1to3_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_4to8_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_9to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {}

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_0to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const uint8_t *input_1, const uint8_t *input_2,
              const uint8_t *secret, uint64_t seed) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_17to128_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                      size_t secretSize, uint64_t seed) {}

LLVM_ATTRIBUTE_NOINLINE static XXH128_hash_t
XXH3_len_129to240_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                       size_t secretSize, uint64_t seed) {}

LLVM_ATTRIBUTE_ALWAYS_INLINE XXH128_hash_t
XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   size_t secretSize) {}

llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef<uint8_t> data) {}
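
// Illustrative usage sketch (not part of the original file): the public entry
// points implemented in this file, as declared in llvm/Support/xxhash.h, can be
// called directly on a byte buffer.
[[maybe_unused]] static void xxhashUsageSketch() {
  const uint8_t Buf[] = {'x', 'x', 'h', 'a', 's', 'h'};
  uint64_t H64 = xxHash64(ArrayRef<uint8_t>(Buf));
  uint64_t H3 = xxh3_64bits(ArrayRef<uint8_t>(Buf));
  XXH128_hash_t H128 = xxh3_128bits(ArrayRef<uint8_t>(Buf));
  (void)H64;
  (void)H3;
  (void)H128;
}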