chromium/third_party/re2/src/re2/prog.cc

// Copyright 2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compiled regular expression representation.
// Tested by compile_test.cc

#include "re2/prog.h"

#include <stdint.h>
#include <string.h>

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "re2/bitmap256.h"
#include "re2/pod_array.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"

#if defined(__AVX2__)
#include <immintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#endif

namespace re2 {

// Constructors per Inst opcode

void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {}

void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {}

void Prog::Inst::InitCapture(int cap, uint32_t out) {}

void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {}

void Prog::Inst::InitMatch(int32_t id) {}

void Prog::Inst::InitNop(uint32_t out) {}

void Prog::Inst::InitFail() {}

std::string Prog::Inst::Dump() {}

Prog::Prog()
  :{}

Prog::~Prog() {}

Workq;

static inline void AddToQueue(Workq* q, int id) {}

static std::string ProgToString(Prog* prog, Workq* q) {}

static std::string FlattenedProgToString(Prog* prog, int start) {}

std::string Prog::Dump() {}

std::string Prog::DumpUnanchored() {}

std::string Prog::DumpByteMap() {}

// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {}

// Peep-hole optimizer.
void Prog::Optimize() {}

uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {}

// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
class ByteMapBuilder {};

void ByteMapBuilder::Mark(int lo, int hi) {}

void ByteMapBuilder::Merge() {}

void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {}

int ByteMapBuilder::Recolor(int oldcolor) {}

void Prog::ComputeByteMap() {}

// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
void Prog::Flatten() {}

void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {}

void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
                         SparseArray<int>* predmap,
                         std::vector<std::vector<int>>* predvec,
                         SparseSet* reachable, std::vector<int>* stk) {}

void Prog::EmitList(int root, SparseArray<int>* rootmap,
                    std::vector<Inst>* flat,
                    SparseSet* reachable, std::vector<int>* stk) {}

// For each ByteRange instruction in [begin, end), computes a hint to execution
// engines: the delta to the next instruction (in flat) worth exploring iff the
// current instruction matched.
//
// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
// colors are instructions and recoloring ranges precisely identifies conflicts
// between instructions. Iterating backwards over [begin, end) is guaranteed to
// identify the nearest conflict (if any) with only linear complexity.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {}

// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal =;

// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
static uint64_t* BuildShiftDFA(std::string prefix) {}

void Prog::ConfigurePrefixAccel(const std::string& prefix,
                                bool prefix_foldcase) {}

const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {}

#if defined(__AVX2__)
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint32_t n) {
  ABSL_DCHECK_NE(n, uint32_t{0});
#if defined(__GNUC__)
  return __builtin_ctz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  unsigned long c;
  _BitScanForward(&c, n);
  return static_cast<int>(c);
#else
  int c = 31;
  for (int shift = 1 << 4; shift != 0; shift >>= 1) {
    uint32_t word = n << shift;
    if (word != 0) {
      n = word;
      c -= shift;
    }
  }
  return c;
#endif
}
#endif

const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {}

}  // namespace re2