0003-templatize.diff | Explore in Territory

diff --git a/third_party/rapidhash/rapidhash.h b/third_party/rapidhash/rapidhash.h
index d71dc5161f128..1a577c443a66e 100644
--- a/third_party/rapidhash/rapidhash.h
+++ b/third_party/rapidhash/rapidhash.h
@@ -43,8 +43,99 @@
 #include <tuple>
 #include <utility>
 
+#include "base/check_op.h"
 #include "base/compiler_specific.h"
 
+// Chromium has some local modifications to upstream rapidhash,
+// mostly around the concept of HashReaders (including slightly
+// more comments for ease of understanding). Generally, rapidhash
+// hashes bytes without really caring what these bytes are,
+// but often in Chromium, we want to hash _strings_, and strings
+// can have multiple representations. In particular, the WTF
+// StringImpl class (and by extension, String and AtomicString)
+// can have three different states:
+//
+//   1. Latin1 (or ASCII) code points, in 8 bits per character (LChar).
+//   2. The same code points, in 16 bits per character (UChar).
+//   3. Strings actually including non-Latin1 code points, in 16 bits
+//      per character (UChar, UTF-16 encoding).
+//
+// The problem is that we'd like to hash case #1 and #2 to actually
+// return the same hash. There are several ways we could deal with this:
+//
+//   a) Just ignore the problem and hash everything as raw bytes;
+//      case #2 is fairly rare, and some algorithms (e.g. opportunistic
+//      deduplication) could live with some false negatives.
+//   b) Expand all strings to UTF-16 before hashing. This was the
+//      strategy for the old StringHasher (using SuperFastHash),
+//      as it naturally worked in 16-bit increments and it is probably
+//      the simplest. However, this both halves the throughput of the
+//      hasher and incurs conversion costs.
+//   c) Detect #2 and convert those cases to #1 (UTF-16 to Latin1
+//      conversion), and apart from that, juts hash bytes.
+//
+// b) is used in e.g. CaseFoldingHash, but c) is the one we use the most
+// often. Most strings that we want to hash are 8-bit (e.g. HTML tags), so
+// that makes the common case fast. And for UChar strings, it is fairly fast
+// to make a pre-pass over the string to check for the existence of any
+// non-Latin1 characters. Of course, #1 and #3 can just be hashed as raw
+// bytes, as strings from those groups can never be equal anyway.
+//
+// To support these 8-to-16 and 16-to-8 conversions, and also things like
+// lowercasing on the fly, we have modified rapidhash to be templatized
+// on a HashReader, supplying bytes to the hash function. For the default
+// case of just hashing raw bytes, this is simply reading. But for other
+// conversions, the reader is doing slightly more work, such as packing
+// or unpacking. See e.g. ConvertTo8BitHashReader in string_hasher.h.
+//
+// Note that this reader could be made constexpr if we needed to evaluate
+// hashes at compile-time.
+struct PlainHashReader {
+  // If this is different from 1 (only 1, 2 and 4 are really reasonable
+  // options), it means the reader consumes its input at a faster rate than
+  // would be normally expected. For instance, if it is 2, it means that when
+  // the reader produces 64 bits, it needs to move its input 128 bits
+  // ahead. This is used when e.g. a reader converts from UTF-16 to ASCII,
+  // by removing zeros. The input length must divide the compression factor.
+  static constexpr unsigned kCompressionFactor = 1;
+
+  // This is the opposite of kExpansionFactor. It does not make sense to have
+  // both kCompressionFactor and kExpansionFactor different from 1.
+  // The output length must divide the expansion factor.
+  static constexpr unsigned kExpansionFactor = 1;
+
+  // Read 64 little-endian bits from the input, taking into account
+  // the expansion/compression if relevant. Unaligned reads must be
+  // supported.
+  static inline uint64_t Read64(const uint8_t* p) {
+    uint64_t v;
+    memcpy(&v, p, 8);
+    return v;
+  }
+
+  // Similarly, read 32 little-endian (unaligned) bits. Note that
+  // this must return uint64_t, _not_ uint32_t, as the hasher assumes
+  // it can freely shift such numbers up and down.
+  static inline uint64_t Read32(const uint8_t* p) {
+    uint32_t v;
+    memcpy(&v, p, 4);
+    return v;
+  }
+
+  // Read 1, 2 or 3 bytes from the input, and distribute them somewhat
+  // reasonably across the resulting 64-bit number. Note that this is
+  // done in a branch-free fashion, so that it always reads three bytes
+  // but never reads past the end.
+  //
+  // This is only used in the case where we hash a string of exactly
+  // 1, 2 or 3 bytes; if the hash is e.g. 7 bytes, two overlapping 32-bit
+  // reads will be done instead. Note that if you have kCompressionFactor == 2,
+  // this means that this will only ever be called with k = 2.
+  static inline uint64_t ReadSmall(const uint8_t* p, size_t k) {
+    return (uint64_t{p[0]} << 56) | (uint64_t{p[k >> 1]} << 32) | p[k - 1];
+  }
+};
+
 /*
  *  Likely and unlikely macros.
  */
@@ -171,8 +262,8 @@ inline uint64_t rapid_readSmall(const uint8_t* p, size_t k) {
 /*
  *  rapidhash main function.
  *
- *  @param p       Buffer to be hashed.
- *  @param len     @key length, in bytes.
+ *  @param key     Buffer to be hashed.
+ *  @param len     Number of input bytes coming from the reader.
  *  @param seed    64-bit seed used to alter the hash result predictably.
  *  @param secret  Triplet of 64-bit secrets used to alter hash result
  *                 predictably.
@@ -189,24 +280,30 @@ inline uint64_t rapid_readSmall(const uint8_t* p, size_t k) {
  *
  *  No other reads from memory take place. No writes to memory take place.
  */
-inline uint64_t rapidhash_internal(const uint8_t* p,
-                                   const size_t len,
-                                   uint64_t seed,
-                                   const uint64_t secret[3]) {
+template <class Reader>
+ALWAYS_INLINE uint64_t rapidhash_internal(const uint8_t* p,
+                                          const size_t len,
+                                          uint64_t seed,
+                                          const uint64_t secret[3]) {
+  // For brevity.
+  constexpr unsigned x = Reader::kCompressionFactor;
+  constexpr unsigned y = Reader::kExpansionFactor;
+  DCHECK_EQ(len % y, 0u);
+
   seed ^= rapid_mix(seed ^ secret[0], secret[1]) ^ len;
   uint64_t a, b;
   if (_likely_(len <= 16)) {
     if (_likely_(len >= 4)) {
       // Read the first and last 32 bits (they may overlap).
-      const uint8_t* plast = p + len - 4;
-      a = (rapid_read32(p) << 32) | rapid_read32(plast);
+      const uint8_t* plast = p + (len - 4) * x / y;
+      a = (Reader::Read32(p) << 32) | Reader::Read32(plast);
 
       // This is equivalent to: delta = (len >= 8) ? 4 : 0;
-      const uint64_t delta = ((len & 24) >> (len >> 3));
-      b = ((rapid_read32(p + delta) << 32) | rapid_read32(plast - delta));
+      const uint64_t delta = ((len & 24) >> (len >> 3)) * x / y;
+      b = ((Reader::Read32(p + delta) << 32) | Reader::Read32(plast - delta));
     } else if (_likely_(len > 0)) {
       // 1, 2 or 3 bytes.
-      a = rapid_readSmall(p, len);
+      a = Reader::ReadSmall(p, len);
       b = 0;
     } else {
       a = b = 0;
@@ -216,27 +313,32 @@ inline uint64_t rapidhash_internal(const uint8_t* p,
     if (_unlikely_(i > 48)) {
       uint64_t see1 = seed, see2 = seed;
       do {
-        seed =
-            rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
-        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1],
-                         rapid_read64(p + 24) ^ see1);
-        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2],
-                         rapid_read64(p + 40) ^ see2);
-        p += 48;
+        seed = rapid_mix(Reader::Read64(p) ^ secret[0],
+                         Reader::Read64(p + 8 * x / y) ^ seed);
+        see1 = rapid_mix(Reader::Read64(p + 16 * x / y) ^ secret[1],
+                         Reader::Read64(p + 24 * x / y) ^ see1);
+        see2 = rapid_mix(Reader::Read64(p + 32 * x / y) ^ secret[2],
+                         Reader::Read64(p + 40 * x / y) ^ see2);
+        p += 48 * x / y;
         i -= 48;
       } while (_likely_(i >= 48));
       seed ^= see1 ^ see2;
     }
     if (i > 16) {
-      seed = rapid_mix(rapid_read64(p) ^ secret[2],
-                       rapid_read64(p + 8) ^ seed ^ secret[1]);
+      seed = rapid_mix(Reader::Read64(p) ^ secret[2],
+                       Reader::Read64(p + 8 * x / y) ^ seed ^ secret[1]);
       if (i > 32) {
-        seed = rapid_mix(rapid_read64(p + 16) ^ secret[2],
-                         rapid_read64(p + 24) ^ seed);
+        seed = rapid_mix(Reader::Read64(p + 16 * x / y) ^ secret[2],
+                         Reader::Read64(p + 24 * x / y) ^ seed);
       }
     }
-    a = rapid_read64(p + i - 16);
-    b = rapid_read64(p + i - 8);
+
+    // Read the last 2x64 bits. Note that due to the division by y,
+    // this must be a signed quantity (or the division would become
+    // unsigned, possibly moving the sign bit down into the address).
+    // Similarly for x.
+    a = Reader::Read64(p + (static_cast<ptrdiff_t>(i) - 16) * x / y);
+    b = Reader::Read64(p + (static_cast<ptrdiff_t>(i) - 8) * x / y);
   }
   a ^= secret[1];
   b ^= seed;
@@ -248,17 +350,18 @@ inline uint64_t rapidhash_internal(const uint8_t* p,
  *  rapidhash default seeded hash function.
  *
  *  @param key     Buffer to be hashed.
- *  @param len     @key length, in bytes.
+ *  @param len     Number of input bytes coming from the reader.
  *  @param seed    64-bit seed used to alter the hash result predictably.
  *
  *  Calls rapidhash_internal using provided parameters and default secrets.
  *
  *  Returns a 64-bit hash.
  */
-inline uint64_t rapidhash(const uint8_t* key,
-                          size_t len,
-                          uint64_t seed = RAPID_SEED) {
-  return rapidhash_internal(key, len, seed, rapid_secret);
+template <class Reader = PlainHashReader>
+ALWAYS_INLINE static uint64_t rapidhash(const uint8_t* key,
+                                        size_t len,
+                                        uint64_t seed = RAPID_SEED) {
+  return rapidhash_internal<Reader>(key, len, seed, rapid_secret);
 }
 
 #endif  // _THIRD_PARTY_RAPIDHASH_H
chromium/third_party/rapidhash/patches/0003-templatize.diff