// Copyright 2006 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #ifndef RE2_REGEXP_H_ #define RE2_REGEXP_H_ // --- SPONSORED LINK -------------------------------------------------- // If you want to use this library for regular expression matching, // you should use re2/re2.h, which provides a class RE2 that // mimics the PCRE interface provided by PCRE's C++ wrappers. // This header describes the low-level interface used to implement RE2 // and may change in backwards-incompatible ways from time to time. // In contrast, RE2's interface will not. // --------------------------------------------------------------------- // Regular expression library: parsing, execution, and manipulation // of regular expressions. // // Any operation that traverses the Regexp structures should be written // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested // regular expressions such as x++++++++++++++++++++... might cause recursive // traversals to overflow the stack. // // It is the caller's responsibility to provide appropriate mutual exclusion // around manipulation of the regexps. RE2 does this. // // PARSING // // Regexp::Parse parses regular expressions encoded in UTF-8. // The default syntax is POSIX extended regular expressions, // with the following changes: // // 1. Backreferences (optional in POSIX EREs) are not supported. // (Supporting them precludes the use of DFA-based // matching engines.) // // 2. Collating elements and collation classes are not supported. // (No one has needed or wanted them.) // // The exact syntax accepted can be modified by passing flags to // Regexp::Parse. In particular, many of the basic Perl additions // are available. The flags are documented below (search for LikePerl). 
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression.  These methods are
// implemented by calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions are guaranteed
//    to be those used by Regexp's parser, not those expected
//    by the sublibrary.  Regexp might accept a restricted or
//    expanded syntax for regular expressions as compared with
//    the sublibrary.  As long as Regexp can translate from its
//    internal form into the sublibrary's, clients need not know
//    exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed.  For whatever reason,
//    sublibrary regular expression parsers often have security
//    problems.  For example, plan9grep's regular expression parser
//    has a buffer overflow in its handling of large character
//    classes, and PCRE's parser has had buffer overflow problems
//    in the past.  The security team requires sandboxing of sublibrary
//    regular expression parsers.  Avoiding the sublibrary parsers
//    avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h.
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#include <stddef.h> #include <stdint.h> #include <map> #include <set> #include <string> #include "absl/log/absl_check.h" #include "absl/log/absl_log.h" #include "absl/strings/string_view.h" #include "util/utf.h" namespace re2 { // Keep in sync with string list kOpcodeNames[] in testing/dump.cc enum RegexpOp { … }; // Keep in sync with string list in regexp.cc enum RegexpStatusCode { … }; // Error status for certain operations. class RegexpStatus { … }; // Compiled form; see prog.h class Prog; struct RuneRange { … }; // Less-than on RuneRanges treats a == b if they overlap at all. // This lets us look in a set to find the range covering a particular Rune. struct RuneRangeLess { … }; class CharClassBuilder; class CharClass { … }; class Regexp { … }; // Character class set: contains non-overlapping, non-abutting RuneRanges. RuneRangeSet; class CharClassBuilder { … }; // Bitwise ops on ParseFlags produce ParseFlags. inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) { … } inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) { … } inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) { … } inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { … } } // namespace re2 #endif // RE2_REGEXP_H_