// regexp-unittest.cc

// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/regexp.h"

#include <cstdlib>
#include <memory>
#include <sstream>

#include "include/v8-context.h"
#include "include/v8-initialization.h"
#include "include/v8-isolate.h"
#include "include/v8-local-handle.h"
#include "src/api/api-inl.h"
#include "src/ast/ast.h"
#include "src/base/strings.h"
#include "src/codegen/assembler-arch.h"
#include "src/codegen/macro-assembler.h"
#include "src/init/v8.h"
#include "src/objects/js-regexp-inl.h"
#include "src/objects/objects-inl.h"
#include "src/regexp/regexp-bytecode-generator.h"
#include "src/regexp/regexp-bytecodes.h"
#include "src/regexp/regexp-compiler.h"
#include "src/regexp/regexp-interpreter.h"
#include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-parser.h"
#include "src/strings/char-predicates-inl.h"
#include "src/strings/string-stream.h"
#include "src/strings/unicode-inl.h"
#include "src/utils/ostreams.h"
#include "src/zone/zone-list-inl.h"
#include "test/common/flag-utils.h"
#include "test/unittests/test-utils.h"

namespace v8 {
namespace internal {

TEST_F(TestWithNativeContext, ConvertRegExpFlagsToString) { … }

TEST_F(TestWithNativeContext, ConvertRegExpFlagsToStringNoFlags) { … }

TEST_F(TestWithNativeContext, ConvertRegExpFlagsToStringAllFlags) { … }

using RegExpTest = TestWithIsolate;

static bool CheckParse(const char* input) { … }

static void CheckParseEq(const char* input, const char* expected,
                         bool unicode = false) { … }

static bool CheckSimple(const char* input) { … }

struct MinMaxPair { … };

static MinMaxPair CheckMinMaxMatch(const char* input) { … }

#define CHECK_PARSE_ERROR …
#define CHECK_SIMPLE …
#define CHECK_MIN_MAX …

TEST_F(RegExpTest, RegExpParser) { … }

TEST_F(RegExpTest, ParserRegression) { … }

static void ExpectError(const char* input, const char* expected,
                        bool unicode = false) { … }

TEST_F(RegExpTest, Errors) { … }

static bool IsDigit(base::uc32 c) { … }

static bool NotDigit(base::uc32 c) { … }

static bool NotWhiteSpaceNorLineTermiantor(base::uc32 c) { … }

static bool NotWord(base::uc32 c) { … }

static bool NotLineTerminator(base::uc32 c) { … }

static void TestCharacterClassEscapes(StandardCharacterSet c,
                                      bool(pred)(base::uc32 c)) { … }

TEST_F(RegExpTest, CharacterClassEscapes) { … }

static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
                           bool is_one_byte, Zone* zone) { … }

static void Execute(const char* input, bool multiline, bool unicode,
                    bool is_one_byte, bool dot_output = false) { … }

// Test of debug-only syntax.
#ifdef DEBUG

TEST_F(RegExpTest, ParsePossessiveRepetition) { … }

#endif

// Tests of interpreter.

// Selects the architecture-specific native regexp macro assembler and exposes
// it under the single name ArchRegExpMacroAssembler, so the native tests below
// can be written once for every supported target.
#if V8_TARGET_ARCH_IA32
using ArchRegExpMacroAssembler = RegExpMacroAssemblerIA32;
#elif V8_TARGET_ARCH_X64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerX64;
#elif V8_TARGET_ARCH_ARM
using ArchRegExpMacroAssembler = RegExpMacroAssemblerARM;
#elif V8_TARGET_ARCH_ARM64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerARM64;
#elif V8_TARGET_ARCH_S390
using ArchRegExpMacroAssembler = RegExpMacroAssemblerS390;
#elif V8_TARGET_ARCH_PPC64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerPPC;
#elif V8_TARGET_ARCH_MIPS64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerMIPS;
#elif V8_TARGET_ARCH_LOONG64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerLOONG64;
#elif V8_TARGET_ARCH_RISCV64
using ArchRegExpMacroAssembler = RegExpMacroAssemblerRISCV;
#elif V8_TARGET_ARCH_RISCV32
// RISC-V 32-bit uses the same macro assembler class as RISC-V 64-bit.
using ArchRegExpMacroAssembler = RegExpMacroAssemblerRISCV;
#endif

class ContextInitializer { … };

// Create a new JSRegExp object with only the fields necessary for these tests
// initialized.
static Handle<JSRegExp> CreateJSRegExp(DirectHandle<String> source,
                                       DirectHandle<Code> code,
                                       bool is_unicode = false) { … }

static ArchRegExpMacroAssembler::Result Execute(
    Tagged<JSRegExp> regexp, Tagged<String> input, int start_offset,
    Address input_start, Address input_end, int* captures) { … }

TEST_F(RegExpTest, MacroAssemblerNativeSuccess) { … }

TEST_F(RegExpTest, MacroAssemblerNativeSimple) { … }

TEST_F(RegExpTest, MacroAssemblerNativeSimpleUC16) { … }

TEST_F(RegExpTest, MacroAssemblerNativeBacktrack) { … }

TEST_F(RegExpTest, MacroAssemblerNativeBackReferenceLATIN1) { … }

TEST_F(RegExpTest, MacroAssemblerNativeBackReferenceUC16) { … }

TEST_F(RegExpTest, MacroAssemblernativeAtStart) { … }

TEST_F(RegExpTest, MacroAssemblerNativeBackRefNoCase) { … }

TEST_F(RegExpTest, MacroAssemblerNativeRegisters) { … }

TEST_F(RegExpTest, MacroAssemblerStackOverflow) { … }

TEST_F(RegExpTest, MacroAssemblerNativeLotsOfRegisters) { … }

TEST_F(RegExpTest, MacroAssembler) { … }

#ifndef V8_INTL_SUPPORT
// Returns the result of unibrow::Ecma262Canonicalize for |c|. When the
// converter produces no output, |c| is returned unchanged; otherwise exactly
// one output character is required (enforced with CHECK_EQ).
static base::uc32 canonicalize(base::uc32 c) {
  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
  const int mapped_count =
      unibrow::Ecma262Canonicalize::Convert(c, '\0', mapped, nullptr);
  if (mapped_count != 0) {
    CHECK_EQ(1, mapped_count);
    return mapped[0];
  }
  // The converter produced nothing, so the input is returned as-is.
  return c;
}

// Exercises canonicalize() and Ecma262UnCanonicalize on the ASCII letters,
// checks that no code point at or above 128 canonicalizes to a value below
// 128, and cross-checks canonicalize() against ToUppercase for every code
// unit below 1 << 16.
TEST_F(RegExpTest, LatinCanonicalize) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) {
    const unibrow::uchar upper = lower - ('a' - 'A');
    // Both cases of a letter must canonicalize to the same value.
    CHECK_EQ(canonicalize(lower), canonicalize(upper));
    // Un-canonicalizing the lowercase letter yields the pair {upper, lower},
    // in that order.
    unibrow::uchar variants[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    const int variant_count = un_canonicalize.get(lower, '\0', variants);
    CHECK_EQ(2, variant_count);
    CHECK_EQ(upper, variants[0]);
    CHECK_EQ(lower, variants[1]);
  }

  // Canonicalization must never map a code point >= 128 below 128.
  for (base::uc32 c = 128; c < (1 << 21); c++) {
    CHECK_GE(canonicalize(c), 128);
  }

  unibrow::Mapping<unibrow::ToUppercase> to_upper;
  // Canonicalization is only defined for the Basic Multilingual Plane.
  for (base::uc32 c = 0; c < (1 << 16); c++) {
    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
    const int upper_length = to_upper.get(c, '\0', upper);
    // An empty mapping means the character is its own uppercase form.
    base::uc32 expected = (upper_length == 0) ? c : upper[0];
    // The character is expected to canonicalize to itself when its uppercase
    // form is longer than one character, or when uppercasing would take a
    // character >= 128 to one below 128.
    const bool expands = upper_length > 1;
    const bool drops_below_128 = c >= 128 && expected < 128;
    if (expands || drops_below_128) expected = c;
    CHECK_EQ(expected, canonicalize(c));
  }
}

// Returns the result of unibrow::CanonicalizationRange for |c|. When the
// converter produces no output, |c| is returned unchanged; otherwise exactly
// one output character is required (enforced with CHECK_EQ).
static base::uc32 CanonRangeEnd(base::uc32 c) {
  unibrow::uchar range_end[unibrow::CanonicalizationRange::kMaxWidth];
  const int produced =
      unibrow::CanonicalizationRange::Convert(c, '\0', range_end, nullptr);
  if (produced != 0) {
    CHECK_EQ(1, produced);
    return range_end[0];
  }
  // The converter produced nothing, so the input is returned as-is.
  return c;
}

// Check that the basic range canonicalization primitives give the same result
// as immediate canonicalization: within every block delimited by
// CanonRangeEnd(), the un-canonicalized forms of the character at offset i
// must equal those of the block's first character, each shifted by i.
TEST_F(RegExpTest, RangeCanonicalization) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  for (int block_start = 0; block_start <= 0xFFFF;) {
    const base::uc32 block_end = CanonRangeEnd(block_start);
    const unsigned block_length = block_end - block_start + 1;
    // Single-character blocks have nothing to compare against.
    if (block_length > 1) {
      unibrow::uchar head[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      const int head_length = un_canonicalize.get(block_start, '\0', head);
      for (unsigned offset = 1; offset < block_length; offset++) {
        unibrow::uchar member[unibrow::Ecma262UnCanonicalize::kMaxWidth];
        const int member_length =
            un_canonicalize.get(block_start + offset, '\0', member);
        CHECK_EQ(head_length, member_length);
        for (int j = 0; j < member_length; j++) {
          const int expected = head[j] + offset;
          const int actual = member[j];
          CHECK_EQ(expected, actual);
        }
      }
    }
    // Advance to the first character after this block.
    block_start += block_length;
  }
}

// For every code unit below 1 << 16, un-canonicalizing any member of its
// un-canonicalized set must reproduce that same set, element for element.
TEST_F(RegExpTest, UncanonicalizeEquivalence) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  unibrow::uchar variants[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  for (int code_unit = 0; code_unit < (1 << 16); code_unit++) {
    const int variant_count = un_canonicalize.get(code_unit, '\0', variants);
    for (int member = 0; member < variant_count; member++) {
      unibrow::uchar round_trip[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      const int round_trip_count =
          un_canonicalize.get(variants[member], '\0', round_trip);
      CHECK_EQ(variant_count, round_trip_count);
      for (int k = 0; k < variant_count; k++) {
        CHECK_EQ(static_cast<int>(variants[k]),
                 static_cast<int>(round_trip[k]));
      }
    }
  }
}

#endif

static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
                                      base::Vector<CharacterRange> expected) { … }

static void TestSimpleRangeCaseIndependence(Isolate* isolate,
                                            CharacterRange input,
                                            CharacterRange expected) { … }

TEST_F(RegExpTest, CharacterRangeCaseIndependence) { … }

static bool InClass(base::uc32 c,
                    const UnicodeRangeSplitter::CharacterRangeVector* ranges) { … }

TEST_F(RegExpTest, UnicodeRangeSplitter) { … }

TEST_F(RegExpTest, CanonicalizeCharacterSets) { … }

TEST_F(RegExpTest, CharacterRangeMerge) { … }

TEST_F(RegExpTest, Graph) { … }

namespace {

int* global_use_counts = …;

void MockUseCounterCallback(v8::Isolate* isolate,
                            v8::Isolate::UseCounterFeature feature) { … }

}  // namespace

using RegExpTestWithContext = TestWithContext;
// Test that ES2015+ RegExp compatibility fixes are in place, that they
// are not overly broad, and the appropriate UseCounters are incremented
TEST_F(RegExpTestWithContext, UseCountRegExp) { … }

class UncachedExternalStringResource
    : public v8::String::ExternalOneByteStringResource { … };

TEST_F(RegExpTestWithContext, UncachedExternalString) { … }

// Test bytecode peephole optimization

void CreatePeepholeNoChangeBytecode(RegExpMacroAssembler* m) { … }

TEST_F(RegExpTest, PeepholeNoChange) { … }

void CreatePeepholeSkipUntilCharBytecode(RegExpMacroAssembler* m) { … }

TEST_F(RegExpTest, PeepholeSkipUntilChar) { … }

void CreatePeepholeSkipUntilBitInTableBytecode(RegExpMacroAssembler* m,
                                               Factory* factory) { … }

TEST_F(RegExpTest, PeepholeSkipUntilBitInTable) { … }

void CreatePeepholeSkipUntilCharPosCheckedBytecode(RegExpMacroAssembler* m) { … }

TEST_F(RegExpTest, PeepholeSkipUntilCharPosChecked) { … }

void CreatePeepholeSkipUntilCharAndBytecode(RegExpMacroAssembler* m) { … }

TEST_F(RegExpTest, PeepholeSkipUntilCharAnd) { … }

void CreatePeepholeSkipUntilCharOrCharBytecode(RegExpMacroAssembler* m) { … }

TEST_F(RegExpTest, PeepholeSkipUntilCharOrChar) { … }

void CreatePeepholeSkipUntilGtOrNotBitInTableBytecode(RegExpMacroAssembler* m,
                                                      Factory* factory) { … }

TEST_F(RegExpTest, PeepholeSkipUntilGtOrNotBitInTable) { … }

void CreatePeepholeLabelFixupsInsideBytecode(RegExpMacroAssembler* m,
                                             Label* dummy_before,
                                             Label* dummy_after,
                                             Label* dummy_inside) { … }

TEST_F(RegExpTest, PeepholeLabelFixupsInside) { … }

void CreatePeepholeLabelFixupsComplexBytecode(RegExpMacroAssembler* m,
                                              Label* dummy_before,
                                              Label* dummy_between,
                                              Label* dummy_after,
                                              Label* dummy_inside) { … }

TEST_F(RegExpTest, PeepholeLabelFixupsComplex) { … }

TEST_F(RegExpTestWithContext, UnicodePropertyEscapeCodeSize) { … }

namespace {

struct RegExpExecData { … };

i::Handle<i::Object> RegExpExec(const RegExpExecData* d) { … }

void ReenterRegExp(v8::Isolate* isolate, void* data) { … }

}  // namespace

// Tests reentrant irregexp calls.
TEST_F(RegExpTestWithContext, RegExpInterruptReentrantExecution) { … }

#undef CHECK_PARSE_ERROR
#undef CHECK_SIMPLE
#undef CHECK_MIN_MAX

}  // namespace internal
}  // namespace v8
// chromium/v8/test/unittests/regexp/regexp-unittest.cc