// Copyright 2007 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Compiled regular expression representation. // Tested by compile_test.cc #include "re2/prog.h" #include <stdint.h> #include <string.h> #include <algorithm> #include <string> #include <utility> #include <vector> #include "absl/base/attributes.h" #include "absl/log/absl_check.h" #include "absl/log/absl_log.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "re2/bitmap256.h" #include "re2/pod_array.h" #include "re2/sparse_array.h" #include "re2/sparse_set.h" #if defined(__AVX2__) #include <immintrin.h> #ifdef _MSC_VER #include <intrin.h> #endif #endif namespace re2 { // Constructors per Inst opcode void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) { … } void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { … } void Prog::Inst::InitCapture(int cap, uint32_t out) { … } void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) { … } void Prog::Inst::InitMatch(int32_t id) { … } void Prog::Inst::InitNop(uint32_t out) { … } void Prog::Inst::InitFail() { … } std::string Prog::Inst::Dump() { … } Prog::Prog() : … { … } Prog::~Prog() { … } Workq; static inline void AddToQueue(Workq* q, int id) { … } static std::string ProgToString(Prog* prog, Workq* q) { … } static std::string FlattenedProgToString(Prog* prog, int start) { … } std::string Prog::Dump() { … } std::string Prog::DumpUnanchored() { … } std::string Prog::DumpByteMap() { … } // Is ip a guaranteed match at end of text, perhaps after some capturing? static bool IsMatch(Prog* prog, Prog::Inst* ip) { … } // Peep-hole optimizer. void Prog::Optimize() { … } uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) { … } // ByteMapBuilder implements a coloring algorithm. // // The first phase is a series of "mark and merge" batches: we mark one or more // [lo-hi] ranges, then merge them into our internal state. Batching is not for // performance; rather, it means that the ranges are treated indistinguishably. // // Internally, the ranges are represented using a bitmap that stores the splits // and a vector that stores the colors; both of them are indexed by the ranges' // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at // hi (if not already split), then recolor each range in between. The color map // (i.e. from the old color to the new color) is maintained for the lifetime of // the batch and so underpins this somewhat obscure approach to set operations. // // The second phase builds the bytemap from our internal state: we recolor each // range, then store the new color (which is now the byte class) in each of the // corresponding array elements. Finally, we output the number of byte classes. class ByteMapBuilder { … }; void ByteMapBuilder::Mark(int lo, int hi) { … } void ByteMapBuilder::Merge() { … } void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) { … } int ByteMapBuilder::Recolor(int oldcolor) { … } void Prog::ComputeByteMap() { … } // Prog::Flatten() implements a graph rewriting algorithm. // // The overall process is similar to epsilon removal, but retains some epsilon // transitions: those from Capture and EmptyWidth instructions; and those from // nullable subexpressions. (The latter avoids quadratic blowup in transitions // in the worst case.) It might be best thought of as Alt instruction elision. // // In conceptual terms, it divides the Prog into "trees" of instructions, then // traverses the "trees" in order to produce "lists" of instructions. A "tree" // is one or more instructions that grow from one "root" instruction to one or // more "leaf" instructions; if a "tree" has exactly one instruction, then the // "root" is also the "leaf". In most cases, a "root" is the successor of some // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) // and is considered a "successor root". A "leaf" can be a ByteRange, Capture, // EmptyWidth or Match instruction. However, this is insufficient for handling // nested nullable subexpressions correctly, so in some cases, a "root" is the // dominator of the instructions reachable from some "successor root" (i.e. it // has an unreachable predecessor) and is considered a "dominator root". Since // only Alt instructions can be "dominator roots" (other instructions would be // "leaves"), only Alt instructions are required to be marked as predecessors. // // Dividing the Prog into "trees" comprises two passes: marking the "successor // roots" and the predecessors; and marking the "dominator roots". Sorting the // "successor roots" by their bytecode offsets enables iteration in order from // greatest to least during the second pass; by working backwards in this case // and flooding the graph no further than "leaves" and already marked "roots", // it becomes possible to mark "dominator roots" without doing excessive work. // // Traversing the "trees" is just iterating over the "roots" in order of their // marking and flooding the graph no further than "leaves" and "roots". When a // "leaf" is reached, the instruction is copied with its successor remapped to // its "root" number. When a "root" is reached, a Nop instruction is generated // with its successor remapped similarly. As each "list" is produced, its last // instruction is marked as such. After all of the "lists" have been produced, // a pass over their instructions remaps their successors to bytecode offsets. void Prog::Flatten() { … } void Prog::MarkSuccessors(SparseArray<int>* rootmap, SparseArray<int>* predmap, std::vector<std::vector<int>>* predvec, SparseSet* reachable, std::vector<int>* stk) { … } void Prog::MarkDominator(int root, SparseArray<int>* rootmap, SparseArray<int>* predmap, std::vector<std::vector<int>>* predvec, SparseSet* reachable, std::vector<int>* stk) { … } void Prog::EmitList(int root, SparseArray<int>* rootmap, std::vector<Inst>* flat, SparseSet* reachable, std::vector<int>* stk) { … } // For each ByteRange instruction in [begin, end), computes a hint to execution // engines: the delta to the next instruction (in flat) worth exploring iff the // current instruction matched. // // Implements a coloring algorithm related to ByteMapBuilder, but in this case, // colors are instructions and recoloring ranges precisely identifies conflicts // between instructions. Iterating backwards over [begin, end) is guaranteed to // identify the nearest conflict (if any) with only linear complexity. void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { … } // The final state will always be this, which frees up a register for the hot // loop and thus avoids the spilling that can occur when building with Clang. static const size_t kShiftDFAFinal = …; // This function takes the prefix as std::string (i.e. not const std::string& // as normal) because it's going to clobber it, so a temporary is convenient. static uint64_t* BuildShiftDFA(std::string prefix) { … } void Prog::ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase) { … } const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) { … } #if defined(__AVX2__) // Finds the least significant non-zero bit in n. static int FindLSBSet(uint32_t n) { ABSL_DCHECK_NE(n, uint32_t{0}); #if defined(__GNUC__) return __builtin_ctz(n); #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) unsigned long c; _BitScanForward(&c, n); return static_cast<int>(c); #else int c = 31; for (int shift = 1 << 4; shift != 0; shift >>= 1) { uint32_t word = n << shift; if (word != 0) { n = word; c -= shift; } } return c; #endif } #endif const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { … } } // namespace re2