Hashing.h | Explore in Territory

//===-- llvm/ADT/Hashing.h - Utilities for hashing --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the newly proposed standard C++ interfaces for hashing
// arbitrary data and building hash functions for user-defined types. This
// interface was originally proposed in N3333[1] and is currently under review
// for inclusion in a future TR and/or standard.
//
// The primary interfaces provide are comprised of one type and three functions:
//
//  -- 'hash_code' class is an opaque type representing the hash code for some
//     data. It is the intended product of hashing, and can be used to implement
//     hash tables, checksumming, and other common uses of hashes. It is not an
//     integer type (although it can be converted to one) because it is risky
//     to assume much about the internals of a hash_code. In particular, each
//     execution of the program has a high probability of producing a different
//     hash_code for a given input. Thus their values are not stable to save or
//     persist, and should only be used during the execution for the
//     construction of hashing datastructures.
//
//  -- 'hash_value' is a function designed to be overloaded for each
//     user-defined type which wishes to be used within a hashing context. It
//     should be overloaded within the user-defined type's namespace and found
//     via ADL. Overloads for primitive types are provided by this library.
//
//  -- 'hash_combine' and 'hash_combine_range' are functions designed to aid
//      programmers in easily and intuitively combining a set of data into
//      a single hash_code for their object. They should only logically be used
//      within the implementation of a 'hash_value' routine or similar context.
//
// Note that 'hash_combine_range' contains very special logic for hashing
// a contiguous array of integers or pointers. This logic is *extremely* fast,
// on a modern Intel "Gainestown" Xeon (Nehalem uarch) @2.2 GHz, these were
// benchmarked at over 6.5 GiB/s for large keys, and <20 cycles/hash for keys
// under 32-bytes.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ADT_HASHING_H
#define LLVM_ADT_HASHING_H

#include "llvm/Config/abi-breaking.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/type_traits.h"
#include <algorithm>
#include <cassert>
#include <cstring>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

namespace llvm {
template <typename T, typename Enable> struct DenseMapInfo;

/// An opaque object representing a hash code.
///
/// This object represents the result of hashing some entity. It is intended to
/// be used to implement hashtables or other hashing-based data structures.
/// While it wraps and exposes a numeric value, this value should not be
/// trusted to be stable or predictable across processes or executions.
///
/// In order to obtain the hash_code for an object 'x':
/// \code
///   using llvm::hash_value;
///   llvm::hash_code code = hash_value(x);
/// \endcode
class hash_code { … };

/// Compute a hash_code for any integer value.
///
/// Note that this function is intended to compute the same hash_code for
/// a particular value without regard to the pre-promotion type. This is in
/// contrast to hash_combine which may produce different hash_codes for
/// differing argument types even if they would implicit promote to a common
/// type without changing the value.
template <typename T>
std::enable_if_t<is_integral_or_enum<T>::value, hash_code> hash_value(T value);

/// Compute a hash_code for a pointer's address.
///
/// N.B.: This hashes the *address*. Not the value and not the type.
template <typename T> hash_code hash_value(const T *ptr);

/// Compute a hash_code for a pair of objects.
template <typename T, typename U>
hash_code hash_value(const std::pair<T, U> &arg);

/// Compute a hash_code for a tuple.
template <typename... Ts>
hash_code hash_value(const std::tuple<Ts...> &arg);

/// Compute a hash_code for a standard string.
template <typename T>
hash_code hash_value(const std::basic_string<T> &arg);

/// Compute a hash_code for a standard string.
template <typename T> hash_code hash_value(const std::optional<T> &arg);

// All of the implementation details of actually computing the various hash
// code values are held within this namespace. These routines are included in
// the header file mainly to allow inlining and constant propagation.
namespace hashing {
namespace detail {

inline uint64_t fetch64(const char *p) { … }

inline uint32_t fetch32(const char *p) { … }

/// Some primes between 2^63 and 2^64 for various uses.
static constexpr uint64_t k0 = …;
static constexpr uint64_t k1 = …;
static constexpr uint64_t k2 = …;
static constexpr uint64_t k3 = …;

/// Bitwise right rotate.
/// Normally this will compile to a single instruction, especially if the
/// shift is a manifest constant.
inline uint64_t rotate(uint64_t val, size_t shift) { … }

inline uint64_t shift_mix(uint64_t val) { … }

inline uint64_t hash_16_bytes(uint64_t low, uint64_t high) { … }

inline uint64_t hash_1to3_bytes(const char *s, size_t len, uint64_t seed) { … }

inline uint64_t hash_4to8_bytes(const char *s, size_t len, uint64_t seed) { … }

inline uint64_t hash_9to16_bytes(const char *s, size_t len, uint64_t seed) { … }

inline uint64_t hash_17to32_bytes(const char *s, size_t len, uint64_t seed) { … }

inline uint64_t hash_33to64_bytes(const char *s, size_t len, uint64_t seed) { … }

inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) { … }

/// The intermediate state used during hashing.
/// Currently, the algorithm for computing hash codes is based on CityHash and
/// keeps 56 bytes of arbitrary state.
struct hash_state { … };

/// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic
/// per process (address of a function in LLVMSupport) to prevent having users
/// depend on the particular hash values. On platforms without ASLR, this is
/// still likely non-deterministic per build.
inline uint64_t get_execution_seed() { … }


/// Trait to indicate whether a type's bits can be hashed directly.
///
/// A type trait which is true if we want to combine values for hashing by
/// reading the underlying data. It is false if values of this type must
/// first be passed to hash_value, and the resulting hash_codes combined.
//
// FIXME: We want to replace is_integral_or_enum and is_pointer here with
// a predicate which asserts that comparing the underlying storage of two
// values of the type for equality is equivalent to comparing the two values
// for equality. For all the platforms we care about, this holds for integers
// and pointers, but there are platforms where it doesn't and we would like to
// support user-defined types which happen to satisfy this property.
template <typename T> struct is_hashable_data
  : std::integral_constant<bool, ((is_integral_or_enum<T>::value ||
                                   std::is_pointer<T>::value) &&
                                  64 % sizeof(T) == 0)> { … };

// Special case std::pair to detect when both types are viable and when there
// is no alignment-derived padding in the pair. This is a bit of a lie because
// std::pair isn't truly POD, but it's close enough in all reasonable
// implementations for our use case of hashing the underlying data.
is_hashable_data<std::pair<T, U>>;

/// Helper to get the hashable data representation for a type.
/// This variant is enabled when the type itself can be used.
template <typename T>
std::enable_if_t<is_hashable_data<T>::value, T>
get_hashable_data(const T &value) { … }
/// Helper to get the hashable data representation for a type.
/// This variant is enabled when we must first call hash_value and use the
/// result as our data.
template <typename T>
std::enable_if_t<!is_hashable_data<T>::value, size_t>
get_hashable_data(const T &value) { … }

/// Helper to store data from a value into a buffer and advance the
/// pointer into that buffer.
///
/// This routine first checks whether there is enough space in the provided
/// buffer, and if not immediately returns false. If there is space, it
/// copies the underlying bytes of value into the buffer, advances the
/// buffer_ptr past the copied bytes, and returns true.
template <typename T>
bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value,
                       size_t offset = 0) { … }

/// Implement the combining of integral values into a hash_code.
///
/// This overload is selected when the value type of the iterator is
/// integral. Rather than computing a hash_code for each object and then
/// combining them, this (as an optimization) directly combines the integers.
template <typename InputIteratorT>
hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { … }

/// Implement the combining of integral values into a hash_code.
///
/// This overload is selected when the value type of the iterator is integral
/// and when the input iterator is actually a pointer. Rather than computing
/// a hash_code for each object and then combining them, this (as an
/// optimization) directly combines the integers. Also, because the integers
/// are stored in contiguous memory, this routine avoids copying each value
/// and directly reads from the underlying memory.
template <typename ValueT>
std::enable_if_t<is_hashable_data<ValueT>::value, hash_code>
hash_combine_range_impl(ValueT *first, ValueT *last) { … }

} // namespace detail
} // namespace hashing


/// Compute a hash_code for a sequence of values.
///
/// This hashes a sequence of values. It produces the same hash_code as
/// 'hash_combine(a, b, c, ...)', but can run over arbitrary sized sequences
/// and is significantly faster given pointers and types which can be hashed as
/// a sequence of bytes.
template <typename InputIteratorT>
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last) { … }


// Implementation details for hash_combine.
namespace hashing {
namespace detail {

/// Helper class to manage the recursive combining of hash_combine
/// arguments.
///
/// This class exists to manage the state and various calls involved in the
/// recursive combining of arguments used in hash_combine. It is particularly
/// useful at minimizing the code in the recursive calls to ease the pain
/// caused by a lack of variadic functions.
struct hash_combine_recursive_helper { … };

} // namespace detail
} // namespace hashing

/// Combine values into a single hash_code.
///
/// This routine accepts a varying number of arguments of any type. It will
/// attempt to combine them into a single hash_code. For user-defined types it
/// attempts to call a \see hash_value overload (via ADL) for the type. For
/// integer and pointer types it directly combines their data into the
/// resulting hash_code.
///
/// The result is suitable for returning from a user's hash_value
/// *implementation* for their user-defined type. Consumers of a type should
/// *not* call this routine, they should instead call 'hash_value'.
template <typename ...Ts> hash_code hash_combine(const Ts &...args) { … }

// Implementation details for implementations of hash_value overloads provided
// here.
namespace hashing {
namespace detail {

/// Helper to hash the value of a single integer.
///
/// Overloads for smaller integer types are not provided to ensure consistent
/// behavior in the presence of integral promotions. Essentially,
/// "hash_value('4')" and "hash_value('0' + 4)" should be the same.
inline hash_code hash_integer_value(uint64_t value) { … }

} // namespace detail
} // namespace hashing

// Declared and documented above, but defined here so that any of the hashing
// infrastructure is available.
template <typename T>
std::enable_if_t<is_integral_or_enum<T>::value, hash_code> hash_value(T value) { … }

// Declared and documented above, but defined here so that any of the hashing
// infrastructure is available.
template <typename T> hash_code hash_value(const T *ptr) { … }

// Declared and documented above, but defined here so that any of the hashing
// infrastructure is available.
template <typename T, typename U>
hash_code hash_value(const std::pair<T, U> &arg) { … }

template <typename... Ts> hash_code hash_value(const std::tuple<Ts...> &arg) { … }

// Declared and documented above, but defined here so that any of the hashing
// infrastructure is available.
template <typename T>
hash_code hash_value(const std::basic_string<T> &arg) { … }

template <typename T> hash_code hash_value(const std::optional<T> &arg) { … }

template <> struct DenseMapInfo<hash_code, void> { … };

} // namespace llvm

/// Implement std::hash so that hash_code can be used in STL containers.
namespace std {

template<>
struct hash<llvm::hash_code> { … };

} // namespace std;

#endif
llvm/llvm/include/llvm/ADT/Hashing.h