rapidhash.h | Explore in Territory

/*
 * rapidhash - Very fast, high quality, platform-independent hashing algorithm.
 * Copyright (C) 2024 Nicolas De Carli
 *
 * Based on 'wyhash', by Wang Yi <[email protected]>
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - rapidhash source repository: https://github.com/Nicoshev/rapidhash
 */

#ifndef _THIRD_PARTY_RAPIDHASH_H
#define _THIRD_PARTY_RAPIDHASH_H …

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <tuple>
#include <utility>

#include "base/check_op.h"
#include "base/compiler_specific.h"

// Chromium has some local modifications to upstream rapidhash,
// mostly around the concept of HashReaders (including slightly
// more comments for ease of understanding). Generally, rapidhash
// hashes bytes without really caring what these bytes are,
// but often in Chromium, we want to hash _strings_, and strings
// can have multiple representations. In particular, the WTF
// StringImpl class (and by extension, String and AtomicString)
// can have three different states:
//
//   1. Latin1 (or ASCII) code points, in 8 bits per character (LChar).
//   2. The same code points, in 16 bits per character (UChar).
//   3. Strings actually including non-Latin1 code points, in 16 bits
//      per character (UChar, UTF-16 encoding).
//
// The problem is that we'd like to hash case #1 and #2 to actually
// return the same hash. There are several ways we could deal with this:
//
//   a) Just ignore the problem and hash everything as raw bytes;
//      case #2 is fairly rare, and some algorithms (e.g. opportunistic
//      deduplication) could live with some false negatives.
//   b) Expand all strings to UTF-16 before hashing. This was the
//      strategy for the old StringHasher (using SuperFastHash),
//      as it naturally worked in 16-bit increments and it is probably
//      the simplest. However, this both halves the throughput of the
//      hasher and incurs conversion costs.
//   c) Detect #2 and convert those cases to #1 (UTF-16 to Latin1
//      conversion), and apart from that, juts hash bytes.
//
// b) is used in e.g. CaseFoldingHash, but c) is the one we use the most
// often. Most strings that we want to hash are 8-bit (e.g. HTML tags), so
// that makes the common case fast. And for UChar strings, it is fairly fast
// to make a pre-pass over the string to check for the existence of any
// non-Latin1 characters. Of course, #1 and #3 can just be hashed as raw
// bytes, as strings from those groups can never be equal anyway.
//
// To support these 8-to-16 and 16-to-8 conversions, and also things like
// lowercasing on the fly, we have modified rapidhash to be templatized
// on a HashReader, supplying bytes to the hash function. For the default
// case of just hashing raw bytes, this is simply reading. But for other
// conversions, the reader is doing slightly more work, such as packing
// or unpacking. See e.g. ConvertTo8BitHashReader in string_hasher.h.
//
// Note that this reader could be made constexpr if we needed to evaluate
// hashes at compile-time.
struct PlainHashReader { … };

/*
 *  Likely and unlikely macros.
 */
#define _likely_ …
#define _unlikely_ …

/*
 *  Default seed.
 */
static constexpr uint64_t RAPID_SEED = …;

// Default secret parameters. If we wanted to, we could generate our own
// versions of these at renderer startup in order to perturb the hash
// and make it more DoS-resistant (similar to what base/hash.h does),
// but generating new ones takes a little bit of time (~200 µs on a desktop
// machine as of 2024), and good-quality random numbers may not be copious
// from within the sandbox. The secret concept is inherited from wyhash,
// described by the wyhash author here:
//
//   https://github.com/wangyi-fudan/wyhash/issues/139
//
// The rules are:
//
//   1. Each byte must be “balanced”, i.e., have exactly 4 bits set.
//      (This is trivially done by just precompting a LUT with the
//      possible bytes and picking from those.)
//
//   2. Each 64-bit group should have a Hamming distance of 32 to
//      all the others (i.e., popcount(secret[i] ^ secret[j]) == 32).
//      This is just done by rejection sampling.
//
//   3. Each 64-bit group should be prime. It's not obvious that this
//      is really needed for the hash, as opposed to wyrand which also
//      uses the same secret, but according to the author, it is
//      “a feeling to be perfect”. This naturally means the last byte
//      cannot be divisible by 2, but apart from that, is easiest
//      checked by testing a few small factors and then the Miller-Rabin
//      test, which rejects nearly all bad candidates in the first iteration
//      and is fast as long as we have 64x64 -> 128 bit muls and modulos.
//
// For now, we just use the rapidhash-supplied standard.
static constexpr uint64_t rapid_secret[3] = …;

/*
 *  64*64 -> 128bit multiply function.
 *
 *  @param A  Address of 64-bit number.
 *  @param B  Address of 64-bit number.
 *
 *  Calculates 128-bit C = *A * *B.
 */
inline std::pair<uint64_t, uint64_t> rapid_mul128(uint64_t A, uint64_t B) { … }

/*
 *  Multiply and xor mix function.
 *
 *  @param A  64-bit number.
 *  @param B  64-bit number.
 *
 *  Calculates 128-bit C = A * B.
 *  Returns 64-bit xor between high and low 64 bits of C.
 */
inline uint64_t rapid_mix(uint64_t A, uint64_t B) { … }

/*
 *  rapidhash main function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     Number of input bytes coming from the reader.
 *  @param seed    64-bit seed used to alter the hash result predictably.
 *  @param secret  Triplet of 64-bit secrets used to alter hash result
 *                 predictably.
 *
 *  Returns a 64-bit hash.
 *
 *  The data flow is separated so that we never mix input data with pointers;
 *
 *  a, b, seed, secret[0], secret[1], secret[2], see1 and see2 are affected
 *  by the input data.
 *
 *  p is only ever indexed by len, delta (comes from len only), i (comes from
 *  len only) or integral constants. len is const, so no data can flow into it.
 *
 *  No other reads from memory take place. No writes to memory take place.
 */
template <class Reader>
ALWAYS_INLINE uint64_t rapidhash_internal(const uint8_t* p,
                                          const size_t len,
                                          uint64_t seed,
                                          const uint64_t secret[3]) { … }

/*
 *  rapidhash default seeded hash function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     Number of input bytes coming from the reader.
 *  @param seed    64-bit seed used to alter the hash result predictably.
 *
 *  Calls rapidhash_internal using provided parameters and default secrets.
 *
 *  Returns a 64-bit hash.
 */
template <class Reader = PlainHashReader>
ALWAYS_INLINE static uint64_t rapidhash(const uint8_t* key,
                                        size_t len,
                                        uint64_t seed = RAPID_SEED) { … }

#undef _likely_
#undef _unlikely_

#endif  // _THIRD_PARTY_RAPIDHASH_H
chromium/third_party/rapidhash/rapidhash.h