//===-- llvm/ADT/Hashing.h - Utilities for hashing --------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the newly proposed standard C++ interfaces for hashing // arbitrary data and building hash functions for user-defined types. This // interface was originally proposed in N3333[1] and is currently under review // for inclusion in a future TR and/or standard. // // The primary interfaces provide are comprised of one type and three functions: // // -- 'hash_code' class is an opaque type representing the hash code for some // data. It is the intended product of hashing, and can be used to implement // hash tables, checksumming, and other common uses of hashes. It is not an // integer type (although it can be converted to one) because it is risky // to assume much about the internals of a hash_code. In particular, each // execution of the program has a high probability of producing a different // hash_code for a given input. Thus their values are not stable to save or // persist, and should only be used during the execution for the // construction of hashing datastructures. // // -- 'hash_value' is a function designed to be overloaded for each // user-defined type which wishes to be used within a hashing context. It // should be overloaded within the user-defined type's namespace and found // via ADL. Overloads for primitive types are provided by this library. // // -- 'hash_combine' and 'hash_combine_range' are functions designed to aid // programmers in easily and intuitively combining a set of data into // a single hash_code for their object. They should only logically be used // within the implementation of a 'hash_value' routine or similar context. // // Note that 'hash_combine_range' contains very special logic for hashing // a contiguous array of integers or pointers. This logic is *extremely* fast, // on a modern Intel "Gainestown" Xeon (Nehalem uarch) @2.2 GHz, these were // benchmarked at over 6.5 GiB/s for large keys, and <20 cycles/hash for keys // under 32-bytes. // //===----------------------------------------------------------------------===// #ifndef LLVM_ADT_HASHING_H #define LLVM_ADT_HASHING_H #include "llvm/Config/abi-breaking.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/type_traits.h" #include <algorithm> #include <cassert> #include <cstring> #include <optional> #include <string> #include <tuple> #include <utility> namespace llvm { template <typename T, typename Enable> struct DenseMapInfo; /// An opaque object representing a hash code. /// /// This object represents the result of hashing some entity. It is intended to /// be used to implement hashtables or other hashing-based data structures. /// While it wraps and exposes a numeric value, this value should not be /// trusted to be stable or predictable across processes or executions. /// /// In order to obtain the hash_code for an object 'x': /// \code /// using llvm::hash_value; /// llvm::hash_code code = hash_value(x); /// \endcode class hash_code { … }; /// Compute a hash_code for any integer value. /// /// Note that this function is intended to compute the same hash_code for /// a particular value without regard to the pre-promotion type. This is in /// contrast to hash_combine which may produce different hash_codes for /// differing argument types even if they would implicit promote to a common /// type without changing the value. template <typename T> std::enable_if_t<is_integral_or_enum<T>::value, hash_code> hash_value(T value); /// Compute a hash_code for a pointer's address. /// /// N.B.: This hashes the *address*. Not the value and not the type. template <typename T> hash_code hash_value(const T *ptr); /// Compute a hash_code for a pair of objects. template <typename T, typename U> hash_code hash_value(const std::pair<T, U> &arg); /// Compute a hash_code for a tuple. template <typename... Ts> hash_code hash_value(const std::tuple<Ts...> &arg); /// Compute a hash_code for a standard string. template <typename T> hash_code hash_value(const std::basic_string<T> &arg); /// Compute a hash_code for a standard string. template <typename T> hash_code hash_value(const std::optional<T> &arg); // All of the implementation details of actually computing the various hash // code values are held within this namespace. These routines are included in // the header file mainly to allow inlining and constant propagation. namespace hashing { namespace detail { inline uint64_t fetch64(const char *p) { … } inline uint32_t fetch32(const char *p) { … } /// Some primes between 2^63 and 2^64 for various uses. static constexpr uint64_t k0 = …; static constexpr uint64_t k1 = …; static constexpr uint64_t k2 = …; static constexpr uint64_t k3 = …; /// Bitwise right rotate. /// Normally this will compile to a single instruction, especially if the /// shift is a manifest constant. inline uint64_t rotate(uint64_t val, size_t shift) { … } inline uint64_t shift_mix(uint64_t val) { … } inline uint64_t hash_16_bytes(uint64_t low, uint64_t high) { … } inline uint64_t hash_1to3_bytes(const char *s, size_t len, uint64_t seed) { … } inline uint64_t hash_4to8_bytes(const char *s, size_t len, uint64_t seed) { … } inline uint64_t hash_9to16_bytes(const char *s, size_t len, uint64_t seed) { … } inline uint64_t hash_17to32_bytes(const char *s, size_t len, uint64_t seed) { … } inline uint64_t hash_33to64_bytes(const char *s, size_t len, uint64_t seed) { … } inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) { … } /// The intermediate state used during hashing. /// Currently, the algorithm for computing hash codes is based on CityHash and /// keeps 56 bytes of arbitrary state. struct hash_state { … }; /// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic /// per process (address of a function in LLVMSupport) to prevent having users /// depend on the particular hash values. On platforms without ASLR, this is /// still likely non-deterministic per build. inline uint64_t get_execution_seed() { … } /// Trait to indicate whether a type's bits can be hashed directly. /// /// A type trait which is true if we want to combine values for hashing by /// reading the underlying data. It is false if values of this type must /// first be passed to hash_value, and the resulting hash_codes combined. // // FIXME: We want to replace is_integral_or_enum and is_pointer here with // a predicate which asserts that comparing the underlying storage of two // values of the type for equality is equivalent to comparing the two values // for equality. For all the platforms we care about, this holds for integers // and pointers, but there are platforms where it doesn't and we would like to // support user-defined types which happen to satisfy this property. template <typename T> struct is_hashable_data : std::integral_constant<bool, ((is_integral_or_enum<T>::value || std::is_pointer<T>::value) && 64 % sizeof(T) == 0)> { … }; // Special case std::pair to detect when both types are viable and when there // is no alignment-derived padding in the pair. This is a bit of a lie because // std::pair isn't truly POD, but it's close enough in all reasonable // implementations for our use case of hashing the underlying data. is_hashable_data<std::pair<T, U>>; /// Helper to get the hashable data representation for a type. /// This variant is enabled when the type itself can be used. template <typename T> std::enable_if_t<is_hashable_data<T>::value, T> get_hashable_data(const T &value) { … } /// Helper to get the hashable data representation for a type. /// This variant is enabled when we must first call hash_value and use the /// result as our data. template <typename T> std::enable_if_t<!is_hashable_data<T>::value, size_t> get_hashable_data(const T &value) { … } /// Helper to store data from a value into a buffer and advance the /// pointer into that buffer. /// /// This routine first checks whether there is enough space in the provided /// buffer, and if not immediately returns false. If there is space, it /// copies the underlying bytes of value into the buffer, advances the /// buffer_ptr past the copied bytes, and returns true. template <typename T> bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value, size_t offset = 0) { … } /// Implement the combining of integral values into a hash_code. /// /// This overload is selected when the value type of the iterator is /// integral. Rather than computing a hash_code for each object and then /// combining them, this (as an optimization) directly combines the integers. template <typename InputIteratorT> hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { … } /// Implement the combining of integral values into a hash_code. /// /// This overload is selected when the value type of the iterator is integral /// and when the input iterator is actually a pointer. Rather than computing /// a hash_code for each object and then combining them, this (as an /// optimization) directly combines the integers. Also, because the integers /// are stored in contiguous memory, this routine avoids copying each value /// and directly reads from the underlying memory. template <typename ValueT> std::enable_if_t<is_hashable_data<ValueT>::value, hash_code> hash_combine_range_impl(ValueT *first, ValueT *last) { … } } // namespace detail } // namespace hashing /// Compute a hash_code for a sequence of values. /// /// This hashes a sequence of values. It produces the same hash_code as /// 'hash_combine(a, b, c, ...)', but can run over arbitrary sized sequences /// and is significantly faster given pointers and types which can be hashed as /// a sequence of bytes. template <typename InputIteratorT> hash_code hash_combine_range(InputIteratorT first, InputIteratorT last) { … } // Implementation details for hash_combine. namespace hashing { namespace detail { /// Helper class to manage the recursive combining of hash_combine /// arguments. /// /// This class exists to manage the state and various calls involved in the /// recursive combining of arguments used in hash_combine. It is particularly /// useful at minimizing the code in the recursive calls to ease the pain /// caused by a lack of variadic functions. struct hash_combine_recursive_helper { … }; } // namespace detail } // namespace hashing /// Combine values into a single hash_code. /// /// This routine accepts a varying number of arguments of any type. It will /// attempt to combine them into a single hash_code. For user-defined types it /// attempts to call a \see hash_value overload (via ADL) for the type. For /// integer and pointer types it directly combines their data into the /// resulting hash_code. /// /// The result is suitable for returning from a user's hash_value /// *implementation* for their user-defined type. Consumers of a type should /// *not* call this routine, they should instead call 'hash_value'. template <typename ...Ts> hash_code hash_combine(const Ts &...args) { … } // Implementation details for implementations of hash_value overloads provided // here. namespace hashing { namespace detail { /// Helper to hash the value of a single integer. /// /// Overloads for smaller integer types are not provided to ensure consistent /// behavior in the presence of integral promotions. Essentially, /// "hash_value('4')" and "hash_value('0' + 4)" should be the same. inline hash_code hash_integer_value(uint64_t value) { … } } // namespace detail } // namespace hashing // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. template <typename T> std::enable_if_t<is_integral_or_enum<T>::value, hash_code> hash_value(T value) { … } // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. template <typename T> hash_code hash_value(const T *ptr) { … } // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. template <typename T, typename U> hash_code hash_value(const std::pair<T, U> &arg) { … } template <typename... Ts> hash_code hash_value(const std::tuple<Ts...> &arg) { … } // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. template <typename T> hash_code hash_value(const std::basic_string<T> &arg) { … } template <typename T> hash_code hash_value(const std::optional<T> &arg) { … } template <> struct DenseMapInfo<hash_code, void> { … }; } // namespace llvm /// Implement std::hash so that hash_code can be used in STL containers. namespace std { template<> struct hash<llvm::hash_code> { … }; } // namespace std; #endif