chromium/third_party/blink/renderer/core/css/parser/find_length_of_declaration_list-inl.h

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#ifndef THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_
#define THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_

// This file contains SIMD code to try to heuristically detect
// the length of a CSS declaration block. We use this during parsing
// in order to skip over them quickly (we don't need to parse
// all the properties until the first time something actually matches
// the selector). This is akin to setting a BlockGuard and then
// immediately calling SkipToEndOfBlock(), except that
//
//  a) It is much, much faster (something like 10x), since we don't
//     need to run the full tokenizer.
//  b) It is allowed to error out if there are some cases that are
//     too complicated for it to understand (e.g. cases that would
//     require simulating the entire block stack).
//  c) It detects nested rules, and similarly errors out on them.
//     All of them have to involve { in some shape or form, so that
//     is a fairly easy check (except that we ignore it within strings).
//
// We _don't_ support these cases (i.e., we just error out), which
// we've empirically found to be rare within declaration blocks:
//
//   - Escaping using \ (possible, but requires counting whether
//     we have an even or odd number of them).
//   - [ and ] (would require the block stack).
//   - Extraneous ) (possible, but adds complications and would be rare).
//   - CSS comments (would require interactions with string parsing).
//   - ' within " or " within ' (complex, see below).
//
// The entry point is FindLengthOfDeclarationList(), which returns
// the length (in characters, i.e. code units rather than bytes in the
// UTF-16 case) up to, but not including, the block's closing }.
// It returns 0 if some kind of error occurred, which means the caller
// will need to parse the block the normal (slow) way.
//
// On x86, we are a bit hampered by our instruction set support;
// it would be nice to have e.g. AVX2, or possibly a smaller subset
// that includes PSHUFB and/or PCLMULQDQ. However, we do fairly well
// already with base SSE2. On Arm (at least on M1, which is very wide),
// it would help slightly to unroll 2x manually to extract more ILP;
// there are many paths where there are long dependency chains.
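//
// As a small illustration of the contract: given an input that starts just
// past a block's opening {, e.g.
//
//   color: red; background: url(a.png) }.foo { ...
//
// a successful call returns the offset of the }, i.e. the length of
// "color: red; background: url(a.png) ".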

#ifdef __SSE2__
#include <immintrin.h>
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

#include "third_party/blink/renderer/platform/wtf/text/string_view.h"

#ifdef __SSE2__

// Loads the next 16 UTF-16 code units, but any code unit that is >= 256
// will be converted into 0xFF, so that it has no chance of matching
// anything that we care about below. This maps elegantly to a
// saturating pack, since our values are in little-endian already.
static inline __m128i LoadAndCollapseHighBytes(const UChar* ptr) {}

// For LChar, this is trivial; just load the bytes as-is.
static inline __m128i LoadAndCollapseHighBytes(const LChar* ptr) {}
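
// A minimal illustrative sketch of the loaders described above (hypothetical
// *Sketch names, not the shipped implementation), assuming unaligned loads
// plus _mm_packus_epi16 for the saturating pack. Note that _mm_packus_epi16
// saturates signed input, so code units >= 0x8000 become 0x00 rather than
// 0xFF; either value is equally unable to match anything we look for.
static inline __m128i LoadAndCollapseHighBytesSketch(const UChar* ptr) {
  // Two 128-bit loads covering 16 UTF-16 code units in total.
  __m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
  __m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr + 8));
  // Saturating pack down to 16 bytes.
  return _mm_packus_epi16(x1, x2);
}
static inline __m128i LoadAndCollapseHighBytesSketch(const LChar* ptr) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
}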

template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {}
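
// A sketch of the two in-register prefix cascades the SSE2 loop is built
// around (hypothetical helper names, illustrative only). The NEON version
// further down performs the same cascades, split into 64-bit halves.
//
// Prefix XOR: afterwards, byte i holds the XOR of bytes 0..i. Applied to a
// vector that is nonzero exactly at quote characters, it becomes nonzero
// exactly inside string literals.
static inline __m128i PrefixXor16Sketch(__m128i x) {
  x = _mm_xor_si128(x, _mm_slli_si128(x, 1));
  x = _mm_xor_si128(x, _mm_slli_si128(x, 2));
  x = _mm_xor_si128(x, _mm_slli_si128(x, 4));
  x = _mm_xor_si128(x, _mm_slli_si128(x, 8));
  return x;
}
// Byte-wise prefix sum: afterwards, byte i holds the wrapping sum of bytes
// 0..i. Applied to a vector holding +1 at '(' and -1 at ')', it becomes the
// parenthesis nesting depth at every position.
static inline __m128i PrefixSum16Sketch(__m128i x) {
  x = _mm_add_epi8(x, _mm_slli_si128(x, 1));
  x = _mm_add_epi8(x, _mm_slli_si128(x, 2));
  x = _mm_add_epi8(x, _mm_slli_si128(x, 4));
  x = _mm_add_epi8(x, _mm_slli_si128(x, 8));
  return x;
}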

// The AVX2 version is 50–65% faster than SSE2, depending on the CPU;
// partly from the wider vectors, and partly from increased
// instruction availability. The caller must check for AVX2 support and
// call FindLengthOfDeclarationListAVX2() themselves where relevant.
//
// Like with NEON below, we'll only really document the differences
// from the SSE2 version. AVX2 is generally a 2x128-bit instruction set,
// much like NEON is a 2x64-bit instruction set, which will necessitate
// some of the same strategies.

__attribute__((target("avx2"))) static inline __m256i
LoadAndCollapseHighBytesAVX2(const UChar* ptr) {}

__attribute__((target("avx2"))) static inline __m256i
LoadAndCollapseHighBytesAVX2(const LChar* ptr) {}

// Similar to NEON, our parenthesis cascade doesn't cross the 128-bit lanes
// (shifts are not 256-bit, but rather two separate 128-bit shifts), so we'll
// need a final operation to propagate the highest element of the low lane
// into all the elements in the high lane. The compiler converts this to
// a nice combination of shuffles and permutations.
__attribute__((target("avx2"))) static inline __m256i BroadcastToHigh(
    __m256i x) {}
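
// Illustrative sketch of the operation described above (hypothetical name,
// not necessarily how the shipped code spells it): broadcast byte 15, the
// top byte of the low lane, into every byte of the high lane, with the low
// lane left zero so that applying the result only affects the high lane.
__attribute__((target("avx2"))) static inline __m256i BroadcastToHighSketch(
    __m256i x) {
  const __m128i top_of_low =
      _mm_shuffle_epi8(_mm256_castsi256_si128(x), _mm_set1_epi8(15));
  return _mm256_inserti128_si256(_mm256_setzero_si256(), top_of_low, 1);
}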

// For the prefix-xor cascade, we can abuse the carryless multiplication
// function found in modern CPUs (there are really none that support AVX2 but
// not this). Essentially, it does a binary multiplication, except that adds are
// replaced with XORs, which means we can multiply with 1111111... to get
// exactly what we want. Note that the upper bits will contain junk that we may
// need to remove later.
//
// Also note that doing this operation over both halves of a 256-bit register
// is a much newer extension, but we only really need it over a 32-bit value.
// We go through an integer register to convert the 256-bit value to 32
// single bits (if we had eight bits per byte, they would be masking each other
// out anyway), and then immediately bump upwards again to a 128-bit register
// for the multiplication. Note that we return that 128-bit register; since we
// want the value _both_ in an integer register (it lets us do more work
// in parallel with the parenthesis cascade) _and_ in a vector register
// (since we need to use it to mask out bytes before said cascade), we let
// the caller do the conversion.
__attribute__((target("avx2,pclmul"))) ALWAYS_INLINE static __m128i
PrefixXORAVX2(__m256i x, uint64_t prev) {}
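
// Illustrative sketch of the above (hypothetical name; it also assumes that
// `prev` is 0 or 1, i.e. the quote parity carried out of the previous
// block):
__attribute__((target("avx2,pclmul"))) ALWAYS_INLINE static __m128i
PrefixXORAVX2Sketch(__m256i x, uint64_t prev) {
  // One bit per byte. XOR-ing the carried-in parity into bit 0 flips every
  // prefix bit, which is exactly what a carry-in should do.
  const uint32_t bits = static_cast<uint32_t>(_mm256_movemask_epi8(x)) ^
                        static_cast<uint32_t>(prev);
  // Carryless multiplication by all-ones is a prefix XOR over the 32 bits;
  // the upper bits of the result are the junk mentioned above.
  return _mm_clmulepi64_si128(_mm_cvtsi32_si128(static_cast<int>(bits)),
                              _mm_set1_epi64x(-1), 0);
}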

// Once PrefixXORAVX2() has created a bit mask, we need to convert that back
// to a byte mask. This is an adapted version of
//
//   https://stackoverflow.com/questions/21622212/how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb
//
// except that we take in the input value in the bottom 32 bits of a vector
// register, which gives less transfer back and forth through the integer
// registers. Clang figures out a fairly fast way of computing vmask using
// shuffles.
__attribute__((target("avx2"))) ALWAYS_INLINE static __m256i MaskToAVX2(
    __m128i mask) {}
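
// Illustrative sketch of the inverse movemask, adapted from the answer
// linked above (hypothetical name, not necessarily the shipped code):
// byte i of the result becomes 0xFF iff bit i of the incoming mask is set.
__attribute__((target("avx2"))) ALWAYS_INLINE static __m256i MaskToAVX2Sketch(
    __m128i mask) {
  // Broadcast the 32-bit mask everywhere, then spread mask byte j over
  // result bytes 8j..8j+7.
  __m256i vmask = _mm256_broadcastd_epi32(mask);
  const __m256i shuffle =
      _mm256_setr_epi64x(0x0000000000000000, 0x0101010101010101,
                         0x0202020202020202, 0x0303030303030303);
  vmask = _mm256_shuffle_epi8(vmask, shuffle);
  // OR each byte with ~(1 << (i % 8)), so that a byte is all-ones exactly
  // when its bit was set, then compare for equality with all-ones.
  const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
  vmask = _mm256_or_si256(vmask, bit_mask);
  return _mm256_cmpeq_epi8(vmask, _mm256_set1_epi8(-1));
}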

template <class CharType>
__attribute__((target("avx2,pclmul"))) ALWAYS_INLINE static size_t
FindLengthOfDeclarationListAVX2(const CharType* begin, const CharType* end) {}

__attribute__((target("avx2,pclmul"))) inline size_t
FindLengthOfDeclarationListAVX2(StringView str) {}

#elif defined(__ARM_NEON__)

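// Same contract as the SSE2 loaders above: for UChar, narrow 16 code units
// with unsigned saturation so that anything >= 256 becomes 0xFF; for LChar,
// just load the 16 bytes as-is.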
static inline uint8x16_t LoadAndCollapseHighBytes(const UChar* ptr) {
  uint8x16_t x1;
  uint8x16_t x2;
  memcpy(&x1, ptr, sizeof(x1));
  memcpy(&x2, ptr + 8, sizeof(x2));
  return vreinterpretq_u8_u64(
      vcombine_u64(vreinterpret_u64_u8(vqmovn_u16(vreinterpretq_u16_u8(x1))),
                   vreinterpret_u64_u8(vqmovn_u16(vreinterpretq_u16_u8(x2)))));
}
static inline uint8x16_t LoadAndCollapseHighBytes(const LChar* ptr) {
  uint8x16_t ret;
  memcpy(&ret, ptr, sizeof(ret));
  return ret;
}

// The NEON implementation follows basically the same pattern as the
// SSE2 implementation; comments will be added only where they differ
// substantially.
//
// For A64, we _do_ have access to the PMULL instruction (the NEON
// equivalent of PCLMULQDQ), but it's supposedly slow, so we use
// the same XOR-shift cascade.
template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {
  // Since NEON doesn't have a natural way of moving the last element
  // to the first slot (shift right by 15 _bytes_), but _does_ have
  // fairly cheap broadcasting (unlike SSE2 without SSSE3), we use
  // a slightly different convention: The prev_* values hold the
  // last element in _all_ lanes, and are then applied _after_
  // the prefix sum/prefix XOR. This would create havoc with
  // saturating operations, but works well when they commute.
  uint8x16_t prev_quoted = vdupq_n_u8(0);
  uint8x16_t prev_parens = vdupq_n_u8(0);

  const CharType* ptr = begin;
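  // Each iteration inspects 16 characters plus one character of lookahead
  // (for the /* check below), hence the + 17.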
  while (ptr + 17 <= end) {
    uint8x16_t x = LoadAndCollapseHighBytes(ptr);
    const uint8x16_t next_x = LoadAndCollapseHighBytes(ptr + 1);
    const uint8x16_t eq_backslash = x == '\\';
    const uint8x16_t eq_double_quote = x == '"';
    const uint8x16_t eq_single_quote = x == '\'';
    uint8x16_t quoted = x & (eq_double_quote | eq_single_quote);

    // NEON doesn't have 128-bit bytewise shifts like SSE2 has.
    // We thus need to do the algorithm separately in 64-bit halves,
    // then do a separate duplication step to transfer the result
    // from the highest element of the bottom half to all elements
    // of the top half. (The alternative would be to use TBL
    // instructions to simulate the shifts, but they can be slow
    // on mobile CPUs.)
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 8));
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 16));
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 32));
    quoted ^= vreinterpretq_u8_u64(vcombine_u64(
        vdup_n_u64(0),
        vreinterpret_u64_u8(vdup_lane_u8(
            vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u8(quoted))),
            7))));
    quoted ^= prev_quoted;
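    // After the prefix XOR (and carry-in), bytes inside "..." hold 0x22,
    // bytes inside '...' hold 0x27, and a quote of one kind inside a string
    // of the other kind yields 0x22 ^ 0x27, the mixed-quote case we do not
    // support (see the file comment), so it forces an error below.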
    const uint8x16_t mixed_quote = quoted == static_cast<char>('\'' ^ '"');

    x &= ~(quoted > vdupq_n_u8(0));

    const uint8x16_t comment_start = (x == '/') & (next_x == '*');
    const uint8x16_t opening_paren = x == '(';
    const uint8x16_t closing_paren = x == ')';
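    // The comparison masks are 0 or 0xFF (i.e. -1), so this subtraction
    // yields +1 at '(' and -1 at ')'; the prefix sum below thus tracks the
    // nesting depth, and goes negative on an extraneous ')'.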
    uint8x16_t parens = closing_paren - opening_paren;
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 8));
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 16));
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 32));
    parens += vreinterpretq_u8_u64(vcombine_u64(
        vdup_n_u64(0),
        vreinterpret_u64_u8(vdup_lane_u8(
            vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u8(parens))),
            7))));
    parens += prev_parens;

    // The VSHRN trick below doesn't guarantee the use of the top bit
    // the same way PMOVMSKB does, so we can't just use the parens value
    // directly for the overflow check. We could compare directly against 255
    // here, but it's nice to have exactly the same behavior as on Intel,
    // so we do a signed shift to just replicate the top bit into the entire
    // byte. (Supposedly, this also has one cycle better throughput on
    // some CPUs.)
    const uint8x16_t parens_overflow =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(parens), 7));

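    // ORing in 0x20 maps '[' (0x5B) onto '{' (0x7B), so this single compare
    // catches both a nested block and the unsupported [.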
    const uint8x16_t opening_block = (x | vdupq_n_u8(0x20)) == '{';
    const uint8x16_t eq_rightbrace = x == '}';
    uint8x16_t must_end = eq_backslash | mixed_quote | opening_block |
                          comment_start | eq_rightbrace | parens_overflow;

    // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    uint64_t must_end_narrowed = vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(must_end), 4)), 0);
    if (must_end_narrowed != 0) {
      unsigned idx = __builtin_ctzll(must_end_narrowed) >> 2;
      ptr += idx;
      if (*ptr == '}') {
        // Since we don't have cheap PMOVMSKB, and this is not on
        // the most critical path, we just chicken out here and let
        // the compiler spill the value to the stack, where we can
        // do normal indexing.
        if (parens[idx] != 0) {
          return 0;
        } else {
          return ptr - begin;
        }
      } else {
        return 0;
      }
    }

    // As mentioned above, broadcast instead of shifting.
    ptr += 16;
    prev_quoted = vdupq_lane_u8(
        vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u8(quoted))), 7);
    prev_parens = vdupq_lane_u8(
        vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u8(parens))), 7);
  }
  return 0;
}

#else

// If we have neither SSE2 nor NEON, we simply return 0 immediately.
// We will then never use lazy parsing.
template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {
  return 0;
}

#endif

ALWAYS_INLINE static size_t FindLengthOfDeclarationList(StringView str) {}
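
// A minimal sketch of what the StringView entry point has to do
// (illustrative; hypothetical *Sketch name, and the real dispatch may
// differ), assuming the usual WTF::StringView accessors:
ALWAYS_INLINE static size_t FindLengthOfDeclarationListSketch(StringView str) {
  if (str.Is8Bit()) {
    return FindLengthOfDeclarationList(str.Characters8(),
                                       str.Characters8() + str.length());
  } else {
    return FindLengthOfDeclarationList(str.Characters16(),
                                       str.Characters16() + str.length());
  }
}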

#endif  // THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_