css_tokenizer.cc | Explore in Territory

// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "third_party/blink/renderer/core/css/parser/css_tokenizer.h"

#include <utility>

#include "third_party/blink/renderer/core/css/parser/css_parser_idioms.h"
#include "third_party/blink/renderer/core/css/parser/css_parser_token_range.h"
#include "third_party/blink/renderer/core/html/parser/html_parser_idioms.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"

#ifdef __SSE2__
#include <immintrin.h>
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

namespace blink {
namespace {

// To avoid resizing we err on the side of reserving too much space.
// Most strings we tokenize have about 3.5 to 5 characters per token.
constexpr wtf_size_t kEstimatedCharactersPerToken = …;

}  // namespace

CSSTokenizer::CSSTokenizer(const String& string, wtf_size_t offset)
    : … { … }

CSSTokenizer::CSSTokenizer(StringView string, wtf_size_t offset)
    : … { … }

Vector<CSSParserToken, 32> CSSTokenizer::TokenizeToEOF() { … }

// Tokenizes the remaining input until EOF and returns both the tokens and the
// input offsets recorded while doing so.
//
// Returns a pair {tokens, offsets}:
//  * `tokens` holds every token produced before the EOF token; the EOF token
//    itself is not stored.
//  * `offsets` holds the value of input_.Offset() sampled immediately before
//    each call to NextToken(), including the final call that yielded EOF.
//    Hence offsets.size() == tokens.size() + 1, and offsets[i] is the input
//    position at which the request for tokens[i] started.
//
// Comments are skipped (SkipComments=true), and NextToken() is asked not to
// store offsets itself (StoreOffset=false) since they are collected here.
std::pair<Vector<CSSParserToken, 32>, Vector<wtf_size_t, 32>>
CSSTokenizer::TokenizeToEOFWithOffsets() {
  // Reserve up front based on the amount of input left, to avoid regrowing
  // the vectors while tokenizing.
  const wtf_size_t estimated_tokens =
      (input_.length() - Offset()) / kEstimatedCharactersPerToken;
  Vector<CSSParserToken, 32> tokens;
  tokens.ReserveInitialCapacity(estimated_tokens);
  Vector<wtf_size_t, 32> offsets;
  // One extra slot: an offset is also recorded before the EOF token.
  offsets.ReserveInitialCapacity(estimated_tokens + 1);

  while (true) {
    offsets.push_back(input_.Offset());
    const CSSParserToken token =
        NextToken</*SkipComments=*/true, /*StoreOffset=*/false>();
    if (token.GetType() == kEOFToken) {
      // Move the locals into the returned pair; building the pair from
      // lvalues would copy both vectors only to destroy the originals.
      return {std::move(tokens), std::move(offsets)};
    }
    tokens.push_back(token);
  }
}

StringView CSSTokenizer::StringRangeFrom(wtf_size_t start) const { … }

StringView CSSTokenizer::StringRangeAt(wtf_size_t start,
                                       wtf_size_t length) const { … }

CSSParserToken CSSTokenizer::TokenizeSingle() { … }

CSSParserToken CSSTokenizer::TokenizeSingleWithComments() { … }

wtf_size_t CSSTokenizer::TokenCount() { … }

void CSSTokenizer::Reconsume(UChar c) { … }

UChar CSSTokenizer::Consume() { … }

CSSParserToken CSSTokenizer::BlockStart(CSSParserTokenType type) { … }

CSSParserToken CSSTokenizer::BlockStart(CSSParserTokenType block_type,
                                        CSSParserTokenType type,
                                        StringView name) { … }

CSSParserToken CSSTokenizer::BlockEnd(CSSParserTokenType type,
                                      CSSParserTokenType start_type) { … }

CSSParserToken CSSTokenizer::HyphenMinus(UChar cc) { … }

CSSParserToken CSSTokenizer::Hash(UChar cc) { … }

CSSParserToken CSSTokenizer::LetterU(UChar cc) { … }

template <bool SkipComments, bool StoreOffset>
CSSParserToken CSSTokenizer::NextToken() { … }

// This method merges the following spec sections for efficiency
// http://www.w3.org/TR/css3-syntax/#consume-a-number
// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
CSSParserToken CSSTokenizer::ConsumeNumber() { … }

// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
CSSParserToken CSSTokenizer::ConsumeNumericToken() { … }

// https://drafts.csswg.org/css-syntax/#consume-ident-like-token
CSSParserToken CSSTokenizer::ConsumeIdentLikeToken() { … }

// https://drafts.csswg.org/css-syntax/#consume-a-string-token
CSSParserToken CSSTokenizer::ConsumeStringTokenUntil(UChar ending_code_point) { … }

CSSParserToken CSSTokenizer::ConsumeUnicodeRange() { … }

// https://drafts.csswg.org/css-syntax/#non-printable-code-point
static bool IsNonPrintableCodePoint(UChar cc) { … }

// https://drafts.csswg.org/css-syntax/#consume-url-token
CSSParserToken CSSTokenizer::ConsumeUrlToken() { … }

// https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url
void CSSTokenizer::ConsumeBadUrlRemnants() { … }

void CSSTokenizer::ConsumeSingleWhitespaceIfNext() { … }

void CSSTokenizer::ConsumeUntilCommentEndFound() { … }

bool CSSTokenizer::ConsumeIfNext(UChar character) { … }

// http://www.w3.org/TR/css3-syntax/#consume-name
//
// Consumes a name, which is defined as a contiguous sequence of name code
// points (see IsNameCodePoint()), possibly with escapes. We stop at the first
// thing that is _not_ a name code point (or the end of a string); if that is a
// backslash, we hand over to the more complete and slower blink::ConsumeName().
// If not, we can send back the relevant substring of the input, without any
// allocations.
//
// If SIMD is available (we support only SSE2 and NEON), we do this 16 and 16
// bytes at a time, generally giving a speed boost except for very short names.
// (We don't get short-circuiting, and we need some extra setup to load
// constants, but we also don't get a lot of branches per byte that we
// consider.)
//
// The checking for \0 is a bit odd; \0 is sometimes used as an EOF marker
// internal to this code, so we need to call into blink::ConsumeName()
// to escape it (into a Unicode replacement character) if we should see it.
StringView CSSTokenizer::ConsumeName() { … }

// https://drafts.csswg.org/css-syntax/#consume-an-escaped-code-point
UChar32 CSSTokenizer::ConsumeEscape() { … }

bool CSSTokenizer::NextTwoCharsAreValidEscape() { … }

// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
bool CSSTokenizer::NextCharsAreNumber(UChar first) { … }

bool CSSTokenizer::NextCharsAreNumber() { … }

// https://drafts.csswg.org/css-syntax/#would-start-an-identifier
bool CSSTokenizer::NextCharsAreIdentifier(UChar first) { … }

bool CSSTokenizer::NextCharsAreIdentifier() { … }

StringView CSSTokenizer::RegisterString(const String& string) { … }

}  // namespace blink
chromium/third_party/blink/renderer/core/css/parser/css_tokenizer.cc