// regexp-compiler-tonode.cc

// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/common/globals.h"
#include "src/execution/isolate.h"
#include "src/objects/string.h"
#include "src/regexp/regexp-compiler.h"
#include "src/regexp/regexp.h"
#include "src/strings/unicode-inl.h"
#include "src/zone/zone-list-inl.h"

#ifdef V8_INTL_SUPPORT
#include "src/base/strings.h"
#include "src/regexp/special-case.h"
#include "unicode/locid.h"
#include "unicode/uniset.h"
#include "unicode/utypes.h"
#endif  // V8_INTL_SUPPORT

namespace v8 {
namespace internal {

usingnamespaceregexp_compiler_constants;  // NOLINT(build/namespaces)

constexpr base::uc32 kMaxCodePoint = …;
constexpr int kMaxUtf16CodeUnit = …;
constexpr uint32_t kMaxUtf16CodeUnitU = …;

// -------------------------------------------------------------------
// Tree to graph conversion

RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) { … }

RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) { … }

namespace {

bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
                          const int* special_class, int length) { … }

bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
                   int length) { … }

}  // namespace

bool RegExpClassRanges::is_standard(Zone* zone) { … }

UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) { … }

void UnicodeRangeSplitter::AddRange(CharacterRange range) { … }

namespace {

// Translates between new and old V8-isms (SmallVector, ZoneList).
ZoneList<CharacterRange>* ToCanonicalZoneList(
    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { … }

void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) { … }

using UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) { … }
constexpr base::uc16 ExtractFrom(UC16Range r) { … }
constexpr base::uc16 ExtractTo(UC16Range r) { … }

void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
                             RegExpNode* on_success,
                             UnicodeRangeSplitter* splitter) { … }

RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    ZoneList<CharacterRange>* match, RegExpNode* on_success,
    bool read_backward) { … }

RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    bool read_backward) { … }

void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                           RegExpNode* on_success,
                           UnicodeRangeSplitter* splitter) { … }

void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                            RegExpNode* on_success,
                            UnicodeRangeSplitter* splitter) { … }

RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
                              RegExpNode* on_success) { … }

}  // namespace

// static
// Only for /ui and /vi, not for /i regexps.
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                               Zone* zone) { … }

RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) { … }

RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success) { … }

RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
                                             RegExpNode* on_success) { … }

void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) { … }

void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
                                      ZoneList<CharacterRange>* temp_ranges,
                                      Zone* zone) { … }

void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
                                     ZoneList<CharacterRange>* temp_ranges,
                                     Zone* zone) { … }

// static
RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
    RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) { … }

namespace {

int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { … }

#ifdef V8_INTL_SUPPORT

int CompareCaseInsensitive(const icu::UnicodeString& a,
                           const icu::UnicodeString& b) { … }

int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
                                    RegExpTree* const* b) { … }

bool Equals(bool ignore_case, const icu::UnicodeString& a,
            const icu::UnicodeString& b) { … }

bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
                  const RegExpAtom* b) { … }

#else

// Looks |c| up in |canonicalize|. When the mapping yields exactly one
// character, that character is returned; otherwise |c| is returned unchanged.
// The mapping is expected (DCHECK) to produce at most one character.
unibrow::uchar Canonical(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar c) {
  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
  const int mapped_count = canonicalize->get(c, '\0', mapped);
  DCHECK_LE(mapped_count, 1);
  return mapped_count == 1 ? mapped[0] : c;
}

// Three-way comparison of |a| and |b| that ignores case differences.
// Returns 0 when the characters are identical, otherwise the signed
// difference of the (possibly canonicalized) characters.
int CompareCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar a, unibrow::uchar b) {
  if (a == b) return 0;
  unibrow::uchar lhs = a;
  unibrow::uchar rhs = b;
  // Canonicalization is skipped only when both characters are below 'a'.
  // NOTE(review): presumably such characters are their own canonical form,
  // which would make the lookup redundant - confirm.
  if (!(a < 'a' && b < 'a')) {
    lhs = Canonical(canonicalize, a);
    rhs = Canonical(canonicalize, b);
  }
  return static_cast<int>(lhs) - static_cast<int>(rhs);
}

// Comparator over pointers to RegExpTree pointers: compares the first
// character of the two atoms, ignoring case. Both trees are accessed through
// AsAtom(), and each atom's data is read at index 0.
int CompareFirstCharCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  const unibrow::uchar lhs_first = (*a)->AsAtom()->data().at(0);
  const unibrow::uchar rhs_first = (*b)->AsAtom()->data().at(0);
  return CompareCaseInsensitive(canonicalize, lhs_first, rhs_first);
}

// Returns true if |a| and |b| are the same character, or, when |ignore_case|
// is set, if they compare equal under CompareCaseInsensitive.
bool Equals(bool ignore_case,
            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
            unibrow::uchar a, unibrow::uchar b) {
  // Identical characters match regardless of case sensitivity.
  if (a == b) return true;
  // Differing characters can only match through case folding.
  return ignore_case && CompareCaseInsensitive(canonicalize, a, b) == 0;
}

bool CharAtEquals(bool ignore_case,
                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
                  int index, const RegExpAtom* a, const RegExpAtom* b) {
  return Equals(ignore_case, canonicalize, a->data().at(index),
                b->data().at(index));
}

#endif  // V8_INTL_SUPPORT

}  // namespace

// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
// Returns true if any consecutive atoms were found.
bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { … }

// Optimizes ab|ac|az to a(?:b|c|z).
void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { … }

// Optimizes b|c|z to [bcz].
void RegExpDisjunction::FixSingleCharacterDisjunctions(
    RegExpCompiler* compiler) { … }

RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) { … }

RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) { … }

namespace {
// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
//         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
                                          RegExpNode* on_success,
                                          RegExpAssertion::Type type) { … }
}  // anonymous namespace

RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success) { … }

RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
                                        RegExpNode* on_success) { … }

RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success) { … }

namespace {

class V8_NODISCARD ModifiersScope { … };

}  // namespace

RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success) { … }

RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
                                   int stack_pointer_register,
                                   int position_register,
                                   int capture_register_count,
                                   int capture_register_start)
    : … { … }

RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { … }

RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) { … }

RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
                                  RegExpNode* on_success) { … }

RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
                                  RegExpCompiler* compiler,
                                  RegExpNode* on_success) { … }

namespace {

class AssertionSequenceRewriter final { … };

}  // namespace

RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) { … }

namespace {

void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
              Zone* zone) { … }

void AddClassNegated(const int* elmv, int elmc,
                     ZoneList<CharacterRange>* ranges, Zone* zone) { … }

}  // namespace

void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
                                    ZoneList<CharacterRange>* ranges,
                                    bool add_unicode_case_equivalents,
                                    Zone* zone) { … }

// static
// Only for /i, not for /ui or /vi.
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                        ZoneList<CharacterRange>* ranges,
                                        bool is_one_byte) { … }

bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) { … }

ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { … }

namespace {

// Move a number of elements in a zonelist to another position
// in the same list. Handles overlapping source and target areas.
void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) { … }

int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
                               CharacterRange insert) { … }

}  // namespace

void CharacterSet::Canonicalize() { … }

// static
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { … }

// static
void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
                            ZoneList<CharacterRange>* negated_ranges,
                            Zone* zone) { … }

// static
void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
                               const ZoneList<CharacterRange>* rhs,
                               ZoneList<CharacterRange>* intersection,
                               Zone* zone) { … }

namespace {

// Advance |index| and set |from| and |to| to the new range, if not out of
// bounds of |range|, otherwise |from| is set to a code point beyond the legal
// unicode character range.
void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
                      base::uc32* from, base::uc32* to) { … }

}  // namespace

// static
void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
                              const ZoneList<CharacterRange>* to_remove,
                              ZoneList<CharacterRange>* result, Zone* zone) { … }

// static
void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) { … }

// static
bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
                            const ZoneList<CharacterRange>* rhs) { … }

namespace {

// Scoped object to keep track of how much we unroll quantifier loops in the
// regexp graph generator.
class RegExpExpansionLimiter { … };

}  // namespace

RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
                                     RegExpTree* body, RegExpCompiler* compiler,
                                     RegExpNode* on_success,
                                     bool not_at_start) { … }

}  // namespace internal
}  // namespace v8
// chromium/v8/src/regexp/regexp-compiler-tonode.cc