html_tokenizer.cc | Explore in Territory

/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "third_party/blink/renderer/core/html/parser/html_tokenizer.h"

#include "third_party/blink/renderer/core/html/parser/html_entity_parser.h"
#include "third_party/blink/renderer/core/html/parser/html_parser_idioms.h"
#include "third_party/blink/renderer/core/html/parser/html_tree_builder.h"
#include "third_party/blink/renderer/core/html/parser/markup_tokenizer_inlines.h"
#include "third_party/blink/renderer/core/html_names.h"
#include "third_party/blink/renderer/core/html_tokenizer_names.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"

namespace blink {

// clang-format off
#define INT_0_TO_127_LIST(V) …
// clang-format on

// Character flags for fast paths.
// The underlying type is uint16_t, which is also the element type of the
// character_scan_flags_ table and the return type of CreateScanFlags().
enum class ScanFlags : uint16_t { … };

// Maps a single code unit |cc| to a uint16_t. Declared constexpr, so it can be
// evaluated in constant expressions.
// NOTE(review): presumably the result is a bitwise combination of ScanFlags
// values and is what populates character_scan_flags_ — confirm against the
// function body.
static constexpr uint16_t CreateScanFlags(UChar cc) { … }

// DOM Part marker strings. Eventually move these to html_tokenizer_names.
// Note that these are preprocessor macros, not typed constants: they are not
// scoped to namespace blink and remain defined for the rest of the translation
// unit unless explicitly #undef'd.
#define kChildNodePartStartMarker …
#define kChildNodePartEndMarker …
#define kNodePartMarker …
#define kAttributePartMarker …

// Table of precomputed scan flags for the first 128 ASCII characters.
// The array has exactly 128 entries, so it may only be indexed with a code
// unit value below 128.
static constexpr const uint16_t character_scan_flags_[128] = …;

static inline UChar ToLowerCase(UChar cc) { … }

static inline bool CheckScanFlag(UChar cc, ScanFlags flag) { … }

static inline UChar ToLowerCaseIfAlpha(UChar cc) { … }

// File-local helper relating an LCharLiteralBuffer<32> to a String. Both
// arguments are taken by const reference, so neither is modified.
// NOTE(review): presumably returns true only when the buffered characters
// match |string| exactly (same length, same characters) — confirm against the
// function body, including how case is treated.
static inline bool VectorEqualsString(const LCharLiteralBuffer<32>& vector,
                                      const String& string) { … }

#define HTML_BEGIN_STATE(stateName) …
#define HTML_BEGIN_STATE_NOLABEL(stateName) …
#define HTML_RECONSUME_IN(stateName) …
#define HTML_ADVANCE_TO(stateName) …
#define HTML_ADVANCE_PAST_NON_NEWLINE_TO(stateName) …
#define HTML_CONSUME(stateName) …
#define HTML_CONSUME_NON_NEWLINE(stateName) …
#define HTML_SWITCH_TO(stateName) …

HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
    : … { … }

// Defaulted out of line: there is no custom teardown beyond destroying the
// tokenizer's members.
HTMLTokenizer::~HTMLTokenizer() = default;

void HTMLTokenizer::Reset() { … }

inline bool HTMLTokenizer::ProcessEntity(SegmentedString& source) { … }

bool HTMLTokenizer::FlushBufferedEndTag(SegmentedString& source,
                                        bool current_char_may_be_newline) { … }

// FLUSH_AND_ADVANCE_TO takes a target state plus a
// current_char_may_be_newline argument, the same parameter name used by
// HTMLTokenizer::FlushBufferedEndTag(). The two variants that follow take only
// the target state.
// NOTE(review): presumably the _NO_NEWLINE / _MAY_CONTAIN_NEWLINE variants fix
// that flag to false / true respectively — confirm against the expansions.
#define FLUSH_AND_ADVANCE_TO(stateName, current_char_may_be_newline) …

#define FLUSH_AND_ADVANCE_TO_NO_NEWLINE(stateName) …

#define FLUSH_AND_ADVANCE_TO_MAY_CONTAIN_NEWLINE(stateName) …

#define ADVANCE_PAST_MULTIPLE_NO_NEWLINE(len, newState) …

bool HTMLTokenizer::FlushEmitAndResumeInDataState(SegmentedString& source) { … }

HTMLToken* HTMLTokenizer::NextToken(SegmentedString& source) { … }

bool HTMLTokenizer::NextTokenImpl(SegmentedString& source) { … }

bool HTMLTokenizer::SkipWhitespaces(SegmentedString& source, UChar& cc) { … }

bool HTMLTokenizer::SkipWhitespacesHelper(SegmentedString& source, UChar& cc) { … }

bool HTMLTokenizer::EmitData(SegmentedString& source, UChar cc) { … }

bool HTMLTokenizer::EmitPLAINTEXT(SegmentedString& source, UChar cc) { … }

String HTMLTokenizer::BufferedCharacters() const { … }

void HTMLTokenizer::UpdateStateFor(const HTMLToken& token) { … }

void HTMLTokenizer::UpdateStateFor(html_names::HTMLTag tag) { … }

// Const query keyed on |tag|; it does not modify the tokenizer. The result is
// a std::optional, so callers must handle the empty case.
// NOTE(review): presumably an empty result means |tag| does not imply any
// particular tokenizer state — confirm against the function body.
std::optional<HTMLTokenizer::State> HTMLTokenizer::SpeculativeStateForTag(
    html_names::HTMLTag tag) const { … }

inline bool HTMLTokenizer::TemporaryBufferIs(const String& expected_string) { … }

inline void HTMLTokenizer::AddToPossibleEndTag(LChar cc) { … }

inline bool HTMLTokenizer::IsAppropriateEndTag() { … }

inline void HTMLTokenizer::ParseError() { … }

}  // namespace blink
chromium/third_party/blink/renderer/core/html/parser/html_tokenizer.cc