html_document_parser_fastpath.cc | Explore in Territory

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "third_party/blink/renderer/core/html/parser/html_document_parser_fastpath.h"

#include <algorithm>
#include <iostream>
#include <type_traits>

#include "base/metrics/histogram_functions.h"
#include "base/timer/elapsed_timer.h"
#include "base/trace_event/trace_event.h"
#include "third_party/blink/renderer/core/dom/attribute.h"
#include "third_party/blink/renderer/core/dom/document_fragment.h"
#include "third_party/blink/renderer/core/dom/element.h"
#include "third_party/blink/renderer/core/dom/qualified_name.h"
#include "third_party/blink/renderer/core/dom/text.h"
#include "third_party/blink/renderer/core/html/forms/html_button_element.h"
#include "third_party/blink/renderer/core/html/forms/html_form_element.h"
#include "third_party/blink/renderer/core/html/forms/html_input_element.h"
#include "third_party/blink/renderer/core/html/forms/html_label_element.h"
#include "third_party/blink/renderer/core/html/forms/html_option_element.h"
#include "third_party/blink/renderer/core/html/forms/html_select_element.h"
#include "third_party/blink/renderer/core/html/html_anchor_element.h"
#include "third_party/blink/renderer/core/html/html_br_element.h"
#include "third_party/blink/renderer/core/html/html_div_element.h"
#include "third_party/blink/renderer/core/html/html_element.h"
#include "third_party/blink/renderer/core/html/html_image_element.h"
#include "third_party/blink/renderer/core/html/html_li_element.h"
#include "third_party/blink/renderer/core/html/html_olist_element.h"
#include "third_party/blink/renderer/core/html/html_paragraph_element.h"
#include "third_party/blink/renderer/core/html/html_span_element.h"
#include "third_party/blink/renderer/core/html/html_template_element.h"
#include "third_party/blink/renderer/core/html/html_ulist_element.h"
#include "third_party/blink/renderer/core/html/parser/atomic_html_token.h"
#include "third_party/blink/renderer/core/html/parser/html_construction_site.h"
#include "third_party/blink/renderer/core/html/parser/html_entity_parser.h"
#include "third_party/blink/renderer/core/html_names.h"
#include "third_party/blink/renderer/core/svg/svg_element.h"
#include "third_party/blink/renderer/core/svg_names.h"
#include "third_party/blink/renderer/platform/heap/garbage_collected.h"
#include "third_party/blink/renderer/platform/runtime_enabled_features.h"
#include "third_party/blink/renderer/platform/text/segmented_string.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string_encoding.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_uchar.h"

#if defined(BLINK_ENABLE_VECTORIZED_HTML_SCANNING)
#include "third_party/highway/src/hwy/highway.h"
#define VECTORIZE_SCANNING …
#else
#define VECTORIZE_SCANNING …
#endif

namespace blink {

namespace {

#if VECTORIZE_SCANNING
// We use the vectorized classification trick to scan and classify characters.
// Instead of checking the string byte-by-byte (or vector-by-vector), the
// algorithm takes the lower nibbles (4-bits) of the passed string and uses them
// as offsets into the table, represented as a vector. The values corresponding
// to those offsets are actual interesting symbols. The algorithm then simply
// compares the looked up values with the input vector. The true lanes in the
// resulting vector correspond to the interesting symbols.
//
// A big shout out to Daniel Lemire for suggesting the idea. See more on
// vectorized classification in the Daniel's paper:
// https://arxiv.org/pdf/1902.08318.
//
// For relatively short incoming strings (less than 64 characters) it's assumed
// that byte-by-byte comparison is faster. TODO(340582182): According to
// microbenchmarks on M1, string larger than 16 bytes are already scanned faster
// with SIMD.
constexpr size_t kVectorizationThreshold = 64;
// The byte that shall never match any symbol. Using 0xff for it is okay since
// we only want to match ASCII chars (<=128).
constexpr uint8_t kNeverMatchedChar = 0xff;

// The result of the TryMatch function (see below). Contains the index inside
// the vector (the lane) and the found character.
struct MatchedCharacter {
  bool Matched() const { return found_character != kNeverMatchedChar; }

  size_t index_in_vector = 0;
  uint8_t found_character = kNeverMatchedChar;
};

// Tries to match the characters for the single vector. If matched, returns the
// first matched character in the vector.
template <typename D, typename VectorT>
  requires(sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 1)
HWY_ATTR ALWAYS_INLINE MatchedCharacter TryMatch(D tag,
                                                 VectorT input,
                                                 VectorT low_nibble_table,
                                                 VectorT low_nib_and_mask) {
  namespace hw = hwy::HWY_NAMESPACE;

  // Get the low nibbles.
  const auto nib_lo = input & low_nib_and_mask;
  // Lookup the values in the table using the nibbles as offsets into the table.
  const auto shuf_lo = hw::TableLookupBytes(low_nibble_table, nib_lo);
  // The values in the tables correspond to the interesting symbols. Just
  // compare them with the input vector.
  const auto result = shuf_lo == input;
  // Find the interesting symbol.
  if (const intptr_t index = hw::FindFirstTrue(tag, result); index != -1) {
    return {static_cast<size_t>(index), hw::ExtractLane(input, index)};
  }

  return {};
}

// Scans the 1-byte string and returns the first matched character (1-byte) or
// kNeverMatchedChar otherwise.
template <typename T, typename VectorT>
  requires(sizeof(T) == 1)
HWY_ATTR ALWAYS_INLINE uint8_t SimdAdvanceAndLookup(const T*& start,
                                                    const T* end,
                                                    VectorT low_nibble_table) {
  namespace hw = hwy::HWY_NAMESPACE;
  DCHECK_GE(static_cast<size_t>(end - start), kVectorizationThreshold);

  hw::FixedTag<uint8_t, 16> tag;
  static constexpr auto stride = hw::MaxLanes(tag);

  const auto low_nib_and_mask = hw::Set(tag, 0xf);

  // The main scanning loop.
  for (; start + (stride - 1) < end; start += stride) {
    const auto input = hw::LoadU(tag, reinterpret_cast<const uint8_t*>(start));
    if (const auto result =
            TryMatch(tag, input, low_nibble_table, low_nib_and_mask);
        result.Matched()) {
      start = reinterpret_cast<const T*>(start + result.index_in_vector);
      return result.found_character;
    };
  }

  // Scan the last stride.
  if (start < end) {
    const auto input =
        hw::LoadU(tag, reinterpret_cast<const uint8_t*>(end - stride));
    if (const auto result =
            TryMatch(tag, input, low_nibble_table, low_nib_and_mask);
        result.Matched()) {
      start = end - stride + result.index_in_vector;
      return result.found_character;
    }
    start = end;
  }

  return kNeverMatchedChar;
}

// This overload for 2-bytes strings uses the interleaved load to check the
// lower bytes of the string. We don't use the gather instruction, since it's
// not available on NEON (as opposed to SVE) and is emulated in Highway.
template <typename T, typename VectorT>
  requires(sizeof(T) == 2)
HWY_ATTR ALWAYS_INLINE uint8_t SimdAdvanceAndLookup(const T*& start,
                                                    const T* end,
                                                    VectorT low_nibble_table) {
  namespace hw = hwy::HWY_NAMESPACE;
  DCHECK_GE(static_cast<size_t>(end - start), kVectorizationThreshold);

  hw::FixedTag<uint8_t, 16> tag;
  static constexpr auto stride = hw::MaxLanes(tag);

  const auto low_nib_and_mask = hw::Set(tag, 0xf);

  // The main scanning loop.
  while (start + (stride - 1) < end) {
    VectorT dummy_upper;
    VectorT input;
    hw::LoadInterleaved2(tag, reinterpret_cast<const uint8_t*>(start), input,
                         dummy_upper);
    if (const auto result =
            TryMatch(tag, input, low_nibble_table, low_nib_and_mask);
        result.Matched()) {
      const auto index = result.index_in_vector;
      // Check if the upper byte is zero.
      if (*(start + index) >> 8 == 0) {
        start = reinterpret_cast<const T*>(start + index);
        return result.found_character;
      }

      start += index + 1;
      continue;
    }

    // Otherwise, continue scanning.
    start += stride;
  }

  // Scan the last stride.
  if (start < end) {
    VectorT dummy_upper;
    VectorT input;
    hw::LoadInterleaved2(tag, reinterpret_cast<const uint8_t*>(end - stride),
                         input, dummy_upper);
    for (auto result = TryMatch(tag, input, low_nibble_table, low_nib_and_mask);
         result.Matched();
         result = TryMatch(tag, input, low_nibble_table, low_nib_and_mask)) {
      const auto index = result.index_in_vector;
      // Check if the upper byte is zero.
      if (*(end - stride + index) >> 8 == 0) {
        start = reinterpret_cast<const T*>(end - stride + index);
        return result.found_character;
      }

      // Otherwise, set the corresponding lane to kNeverMatchedChar to never
      // match it again and continue.
      input = hw::InsertLane(input, index, kNeverMatchedChar);
    }
    start = end;
  }

  return kNeverMatchedChar;
}
#endif  // VECTORIZE_SCANNING

template <class Char, size_t n>
bool operator==(base::span<const Char> span, const char (&s)[n]) { … }

template <int n>
constexpr bool OnlyContainsLowercaseASCIILetters(const char (&s)[n]) { … }

template <class Char, size_t n>
bool SpanMatchesLowercase(base::span<const Char> span, const char (&s)[n]) { … }

// A hash function that is just good enough to distinguish the supported
// tagnames. It needs to be adapted as soon as we have colliding tagnames.
// The implementation was chosen to map to a dense integer range to allow for
// compact switch jump-tables. If adding support for a new tag results in a
// collision, then pick a new function that minimizes the number of operations
// and results in a dense integer range. This will require some finesse, feel
// free to reach out to owners of bug 1407201 for help.
template <uint32_t n>
constexpr uint32_t TagnameHash(const char (&s)[n]) { … }
template <class Char>
uint32_t TagnameHash(base::span<const Char> s) { … }
uint32_t TagnameHash(const String& s) { … }

#define SUPPORTED_TAGS …

UCharLiteralBufferType;

template <class Char>
struct ScanTextResult { … };

template <>
String ScanTextResult<LChar>::TextToString() const { … }

template <>
String ScanTextResult<UChar>::TextToString() const { … }

// This HTML parser is used as a fast-path for setting innerHTML.
// It is faster than the general parser by only supporting a subset of valid
// HTML. This way, it can be spec-compliant without following the algorithm
// described in the spec. Unsupported features or parse errors lead to bailout,
// falling back to the general HTML parser.
// It differs from the general HTML parser in the following ways.
//
// Implementation:
// - It uses recursive descent for better CPU branch prediction.
// - It merges tokenization with parsing.
// - Whenever possible, tokens are represented as subsequences of the original
//   input, avoiding allocating memory for them.
//
// Restrictions (these may evolve based on uma data, https://crbug.com/1407201):
// - No auto-closing of tags.
// - Wrong nesting of HTML elements (for example nested <p>) leads to bailout
//   instead of fix-up.
// - No custom elements, no "is"-attribute.
// - No duplicate attributes. This restriction could be lifted easily.
// - Unquoted attribute names are very restricted.
// - Many tags are unsupported, but we could support more. For example, <table>
//   because of the complex re-parenting rules
// - Only a few named "&" character references are supported.
// - No '\0'. The handling of '\0' varies depending upon where it is found
//   and in general the correct handling complicates things.
// - Fails if an attribute name starts with 'on'. Such attributes are generally
//   events that may be fired. Allowing this could be problematic if the fast
//   path fails. For example, the 'onload' event of an <img> would be called
//   multiple times if parsing fails.
// - Fails if a text is encountered larger than Text::kDefaultLengthLimit. This
//   requires special processing.
// - Fails if a deep hierarchy is encountered. This is both to avoid a crash,
//   but also at a certain depth elements get added as siblings vs children (see
//   use of HTMLConstructionSite::kMaximumHTMLParserDOMTreeDepth).
// - Fails if an <img> is encountered. Image elements request the image early
//   on, resulting in network connections. Additionally, loading the image
//   may consume preloaded resources.
template <class Char>
class HTMLFastPathParser { … };

void LogFastPathResult(HtmlFastPathResult result) { … }

bool CanUseFastPath(Document& document,
                    Element& context_element,
                    ParserContentPolicy policy,
                    HTMLFragmentParsingBehaviorSet behavior) { … }

// A hand picked enumeration of the most frequently used tags on web pages with
// some amount of grouping. Ranking comes from
// (https://discuss.httparchive.org/t/use-of-html-elements/1438).
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused (unless the histogram name is
// updated).
enum class UnsupportedTagType : uint32_t { … };

constexpr uint32_t kAllUnsupportedTags = …;
// If UnsupportedTagType is > 24, then need to add a fourth chunk to the
// overall histogram.
static_assert …;

#define CHECK_TAG_TYPE(t) …

#define NODE_HAS_TAG_NAME(t) …

// Returns the UnsupportedTagType for node. Returns 0 if `node` is one of the
// supported tags.
UnsupportedTagType UnsupportedTagTypeValueForNode(const Node& node) { … }

// Histogram names used when logging unsupported tag type.
const char* kUnsupportedTagTypeCompositeName = …;
const char* kUnsupportedTagTypeMaskNames[] = …;

// Histogram names used when logging unsupported context tag type.
const char* kUnsupportedContextTagTypeCompositeName = …;
const char* kUnsupportedContextTagTypeMaskNames[] = …;

// Logs histograms for either an unsupported tag or unsupported context tag.
// `type_mask` is a bitmask of the unsupported tags that were encountered. As
// the uma frontend doesn't handle large bitmasks well, there are 4 separate
// histograms logged:
// . histogram for bits 1-8, 9-16, 17-24. The names used for these histograms
//   is specified in `mask_histogram_names`.
// . A histogram identifying which bit ranges of `type_mask` have at least one
//   bit set. More specifically:
//   . bit 1 set if `type_mask` has at least one bit set in bits 1-8.
//   . bit 2 set if `type_mask` has at least one bit set in bits 9-16.
//   . bit 3 set if `type_mask` has at least one bit set in bits 17-24.
void LogFastPathUnsupportedTagTypeDetails(uint32_t type_mask,
                                          const char* composite_histogram_name,
                                          const char* mask_histogram_names[]) { … }

template <class Char>
bool TryParsingHTMLFragmentImpl(const base::span<const Char>& source,
                                Document& document,
                                ContainerNode& root_node,
                                Element& context_element,
                                HTMLFragmentParsingBehaviorSet behavior,
                                bool* failed_because_unsupported_tag) { … }

}  // namespace

bool TryParsingHTMLFragment(const String& source,
                            Document& document,
                            ContainerNode& parent,
                            Element& context_element,
                            ParserContentPolicy policy,
                            HTMLFragmentParsingBehaviorSet behavior,
                            bool* failed_because_unsupported_tag) { … }

void LogTagsForUnsupportedTagTypeFailure(DocumentFragment& fragment) { … }

#undef SUPPORTED_TAGS

}  // namespace blink
chromium/third_party/blink/renderer/core/html/parser/html_document_parser_fastpath.cc