// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#ifndef THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_
#define THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_

// This file contains SIMD code to try to heuristically detect
// the length of a CSS declaration block. We use this during parsing
// in order to skip over such blocks quickly (we don't need to parse
// all the properties until the first time something actually matches
// the selector). This is akin to setting a BlockGuard and then
// immediately calling SkipToEndOfBlock(), except that
//
//  a) It is much, much faster (something like 10x), since we don't
//     need to run the full tokenizer.
//  b) It is allowed to error out if there are cases that are
//     too complicated for it to understand (e.g. cases that would
//     require simulating the entire block stack).
//  c) It knows to detect nested rules, and similarly errors out on them.
//     All of them have to involve { in some shape or form, so that
//     is a fairly easy check (except that we ignore { within strings).
//
// We _don't_ support these cases (i.e., we just error out), which
// we've empirically found to be rare within declaration blocks:
//
//  - Escaping using \ (possible, but requires counting whether
//    we have an even or odd number of them).
//  - [ and ] (would require the block stack).
//  - Extraneous ) (possible, but adds complications and would be rare).
//  - CSS comments (would require interactions with string parsing).
//  - ' within " or " within ' (complex, see below).
//
// The entry point is FindLengthOfDeclarationList(), which returns
// the number of characters until the block's ending }, exclusive.
// It returns 0 if some kind of error occurred, which means the caller
// will need to parse the block the normal (slow) way.
//
// On x86, we are a bit hampered by our instruction set support;
// it would be nice to have e.g. AVX2, or possibly a smaller subset
// that includes PSHUFB and/or PCLMULQDQ. However, we do fairly well
// already with base SSE2. On Arm (at least on M1, which is very wide),
// it would help slightly to unroll 2x manually to extract more ILP;
// there are many paths where there are long dependency chains.

#ifdef __SSE2__
#include <immintrin.h>
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

#include "third_party/blink/renderer/platform/wtf/text/string_view.h"

#ifdef __SSE2__

// Loads the next 16 UTF-16 runes, but any character that is >= 256
// will be converted into 0xFF, so that it has no chance of matching
// anything that we care about below. This maps elegantly to a
// saturating pack, since our values are in little-endian already.
static inline __m128i LoadAndCollapseHighBytes(const UChar* ptr) {
  …
}

// For LChar, this is trivial; just load the bytes as-is.
static inline __m128i LoadAndCollapseHighBytes(const LChar* ptr) {
  …
}

template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {
  …
}
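
// Purely illustrative sketch (hypothetical helper; nothing in this file calls
// it): a scalar model of roughly the contract that the SIMD implementations
// in this file provide 16 or 32 characters at a time. The real versions
// differ in details near block boundaries (they need lookahead slack and also
// bail out on very deep parenthesis nesting).
template <class CharType>
static size_t FindLengthOfDeclarationListScalarSketch(const CharType* begin,
                                                      const CharType* end) {
  bool in_string = false;
  CharType quote_char = 0;
  int paren_depth = 0;
  for (const CharType* ptr = begin; ptr != end; ++ptr) {
    const CharType c = *ptr;
    if (in_string) {
      if (c == '\\') {
        return 0;  // Escapes are unsupported.
      }
      if (c == quote_char) {
        in_string = false;
      } else if (c == '"' || c == '\'') {
        return 0;  // ' within " (or vice versa) is unsupported.
      }
      continue;
    }
    switch (c) {
      case '"':
      case '\'':
        in_string = true;
        quote_char = c;
        break;
      case '\\':
      case '[':
      case ']':
        return 0;  // See the unsupported-cases list above.
      case '/':
        if (ptr + 1 != end && ptr[1] == '*') {
          return 0;  // CSS comments are unsupported.
        }
        break;
      case '(':
        ++paren_depth;
        break;
      case ')':
        if (--paren_depth < 0) {
          return 0;  // Extraneous ) is unsupported.
        }
        break;
      case '{':
        return 0;  // Nested rule (or similar); error out.
      case '}':
        // Only accept the } if every ( has been closed.
        return paren_depth == 0 ? static_cast<size_t>(ptr - begin) : 0;
      default:
        break;
    }
  }
  return 0;  // Never found the ending }.
}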

// The AVX2 version is 50-65% faster than SSE2, depending on CPU;
// partially from wider vectors, and partially from increased
// instruction availability. The caller must call
// FindLengthOfDeclarationListAVX2() themselves if relevant.
//
// Like with NEON below, we'll only really document the differences
// with the SSE2 version. AVX2 is generally a 2x128-bit instruction set,
// much like NEON is a 2x64-bit instruction set, which will necessitate
// some of the same strategies.
__attribute__((target("avx2"))) static inline __m256i
LoadAndCollapseHighBytesAVX2(const UChar* ptr) {
  …
}

__attribute__((target("avx2"))) static inline __m256i
LoadAndCollapseHighBytesAVX2(const LChar* ptr) {
  …
}

// Similar to NEON, our parenthesis cascade doesn't cross the 128-bit lanes
// (shifts are not 256-bit, but rather two separate 128-bit shifts), so we'll
// need a final operation to propagate the highest element of the low lane
// into all the elements in the high lane. The compiler converts this to
// a nice combination of shuffles and permutations.
__attribute__((target("avx2"))) static inline __m256i BroadcastToHigh(
    __m256i x) {
  …
}

// For the prefix-XOR cascade, we can abuse the carryless multiplication
// function found in modern CPUs (there are really none that support AVX2 but
// not this). Essentially, it does a binary multiplication, except that adds
// are replaced with XORs, which means we can multiply with 1111111... to get
// exactly what we want. Note that the upper bits will contain junk that we
// may need to remove later.
//
// Also note that doing this operation over both halves of a 256-bit register
// is a much newer extension, but we only really need it over a 32-bit value.
// We go through an integer register to convert the 256-bit values to 32
// single bits (if we had eight bits per byte, they would be masking each
// other out anyway), and then immediately bump upwards again to a 128-bit
// register for the multiplication. Note that we return that 128-bit register;
// since we want the value _both_ in an integer register (it lets us do more
// work in parallel with the parenthesis cascade) _and_ in a vector register
// (since we need to use it to mask out bytes before said cascade), we let
// the caller do the conversion.
__attribute__((target("avx2,pclmul"))) ALWAYS_INLINE static __m128i
PrefixXORAVX2(__m256i x, uint64_t prev) {
  …
}
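
// Purely illustrative sketch (hypothetical helper; nothing in this file calls
// it): the core of the carryless-multiplication trick described above,
// without the folding of state from the previous block or the conversion back
// into a vector register that the real PrefixXORAVX2() does. Multiplying by
// an all-ones constant under XOR makes bit i of the low half of the product
// equal to the XOR of input bits 0..i, i.e. a running "inside a string?"
// parity.
__attribute__((target("pclmul"))) static inline uint32_t PrefixXOR32Sketch(
    uint32_t bits) {
  const __m128i product = _mm_clmulepi64_si128(
      _mm_cvtsi32_si128(static_cast<int>(bits)), _mm_set1_epi32(-1), 0);
  // The upper bits of the product are junk (as noted above); keep only the
  // low 32 bits, which hold the prefix XOR.
  return static_cast<uint32_t>(_mm_cvtsi128_si32(product));
}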
__attribute__((target("avx2"))) ALWAYS_INLINE static __m256i MaskToAVX2( __m128i mask) { … } template <class CharType> __attribute__((target("avx2,pclmul"))) ALWAYS_INLINE static size_t FindLengthOfDeclarationListAVX2(const CharType* begin, const CharType* end) { … } __attribute__((target("avx2,pclmul"))) inline size_t FindLengthOfDeclarationListAVX2(StringView str) { … } #elif defined(__ARM_NEON__) static inline uint8x16_t LoadAndCollapseHighBytes(const UChar* ptr) { uint8x16_t x1; uint8x16_t x2; memcpy(&x1, ptr, sizeof(x1)); memcpy(&x2, ptr + 8, sizeof(x2)); return vreinterpretq_u8_u64( vcombine_u64(vreinterpret_u64_u8(vqmovn_u16(vreinterpretq_u16_u8(x1))), vreinterpret_u64_u8(vqmovn_u16(vreinterpretq_u16_u8(x2))))); } static inline uint8x16_t LoadAndCollapseHighBytes(const LChar* ptr) { uint8x16_t ret; memcpy(&ret, ptr, sizeof(ret)); return ret; } // The NEON implementation follows basically the same pattern as the // SSE2 implementation; comments will be added only where they differ // substantially. // // For A64, we _do_ have access to the PMULL instruction (the NEON // equivalent of PCLMULQDQ), but it's supposedly slow, so we use // the same XOR-shift cascade. template <class CharType> ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin, const CharType* end) { // Since NEON doesn't have a natural way of moving the last element // to the first slot (shift right by 15 _bytes_), but _does_ have // fairly cheap broadcasting (unlike SSE2 without SSSE3), we use // a slightly different convention: The prev_* elements hold the // last element in _all_ lanes, and is then applied _after_ // the prefix sum/prefix XOR. This would create havoc with // saturating operations, but works well when they commute. uint8x16_t prev_quoted = vdupq_n_u8(0); uint8x16_t prev_parens = vdupq_n_u8(0); const CharType* ptr = begin; while (ptr + 17 <= end) { uint8x16_t x = LoadAndCollapseHighBytes(ptr); const uint8x16_t next_x = LoadAndCollapseHighBytes(ptr + 1); const uint8x16_t eq_backslash = x == '\\'; const uint8x16_t eq_double_quote = x == '"'; const uint8x16_t eq_single_quote = x == '\''; uint8x16_t quoted = x & (eq_double_quote | eq_single_quote); // NEON doesn't have 128-bit bytewise shifts like SSE2 have. // We thus need to do the algorithm separately in 64-bit halves, // then to a separate duplication step to transfer the result // from the highest element of the bottom half to all elements // of the top half. (The alternative would be to use TBL // instructions to simulate the shifts, but they can be slow // on mobile CPUs.) 

// The NEON implementation follows basically the same pattern as the
// SSE2 implementation; comments will be added only where they differ
// substantially.
//
// For A64, we _do_ have access to the PMULL instruction (the NEON
// equivalent of PCLMULQDQ), but it's supposedly slow, so we use
// the same XOR-shift cascade.
template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {
  // Since NEON doesn't have a natural way of moving the last element
  // to the first slot (shift right by 15 _bytes_), but _does_ have
  // fairly cheap broadcasting (unlike SSE2 without SSSE3), we use
  // a slightly different convention: The prev_* vectors hold the
  // last element of the previous block in _all_ lanes, and are then
  // applied _after_ the prefix sum/prefix XOR. This would create havoc
  // with saturating operations, but works well when they commute.
  uint8x16_t prev_quoted = vdupq_n_u8(0);
  uint8x16_t prev_parens = vdupq_n_u8(0);
  const CharType* ptr = begin;
  while (ptr + 17 <= end) {
    uint8x16_t x = LoadAndCollapseHighBytes(ptr);
    const uint8x16_t next_x = LoadAndCollapseHighBytes(ptr + 1);
    const uint8x16_t eq_backslash = x == '\\';
    const uint8x16_t eq_double_quote = x == '"';
    const uint8x16_t eq_single_quote = x == '\'';
    uint8x16_t quoted = x & (eq_double_quote | eq_single_quote);

    // NEON doesn't have 128-bit bytewise shifts like SSE2 has.
    // We thus need to do the algorithm separately in 64-bit halves,
    // then do a separate duplication step to transfer the result
    // from the highest element of the bottom half to all elements
    // of the top half. (The alternative would be to use TBL
    // instructions to simulate the shifts, but they can be slow
    // on mobile CPUs.)
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 8));
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 16));
    quoted ^=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(quoted), 32));
    quoted ^= vreinterpretq_u8_u64(vcombine_u64(
        vdup_n_u64(0),
        vreinterpret_u64_u8(vdup_lane_u8(
            vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u8(quoted))),
            7))));
    quoted ^= prev_quoted;
    const uint8x16_t mixed_quote = quoted == static_cast<char>('\'' ^ '"');
    x &= ~(quoted > vdupq_n_u8(0));

    const uint8x16_t comment_start = (x == '/') & (next_x == '*');
    const uint8x16_t opening_paren = x == '(';
    const uint8x16_t closing_paren = x == ')';
    uint8x16_t parens = closing_paren - opening_paren;
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 8));
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 16));
    parens +=
        vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(parens), 32));
    parens += vreinterpretq_u8_u64(vcombine_u64(
        vdup_n_u64(0),
        vreinterpret_u64_u8(vdup_lane_u8(
            vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u8(parens))),
            7))));
    parens += prev_parens;

    // The VSHRN trick below doesn't guarantee the use of the top bit
    // the same way PMOVMSKB does, so we can't just use the parens value
    // directly for the overflow check. We could compare directly against
    // 255 here, but it's nice to have exactly the same behavior as on
    // Intel, so we do a signed shift to just replicate the top bit into
    // the entire byte. (Supposedly, this also has one cycle better
    // throughput on some CPUs.)
    const uint8x16_t parens_overflow =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(parens), 7));

    const uint8x16_t opening_block = (x | vdupq_n_u8(0x20)) == '{';
    const uint8x16_t eq_rightbrace = x == '}';
    uint8x16_t must_end = eq_backslash | mixed_quote | opening_block |
                          comment_start | eq_rightbrace | parens_overflow;

    // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    uint64_t must_end_narrowed = vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(must_end), 4)),
        0);
    if (must_end_narrowed != 0) {
      unsigned idx = __builtin_ctzll(must_end_narrowed) >> 2;
      ptr += idx;
      if (*ptr == '}') {
        // Since we don't have cheap PMOVMSKB, and this is not on
        // the most critical path, we just chicken out here and let
        // the compiler spill the value to the stack, where we can
        // do normal indexing.
        if (parens[idx] != 0) {
          return 0;
        } else {
          return ptr - begin;
        }
      } else {
        return 0;
      }
    }

    // As mentioned above, broadcast instead of shifting.
    ptr += 16;
    prev_quoted = vdupq_lane_u8(
        vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u8(quoted))), 7);
    prev_parens = vdupq_lane_u8(
        vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u8(parens))), 7);
  }
  return 0;
}

#else

// If we have neither SSE2 nor NEON, we simply return 0 immediately.
// We will then never use lazy parsing.
template <class CharType>
ALWAYS_INLINE static size_t FindLengthOfDeclarationList(const CharType* begin,
                                                        const CharType* end) {
  return 0;
}

#endif

ALWAYS_INLINE static size_t FindLengthOfDeclarationList(StringView str) {
  …
}

#endif  // THIRD_PARTY_BLINK_RENDERER_CORE_CSS_PARSER_FIND_LENGTH_OF_DECLARATION_LIST_INL_H_