// Copyright 2006 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Regular expression parser. // The parser is a simple precedence-based parser with a // manual stack. The parsing work is done by the methods // of the ParseState class. The Regexp::Parse function is // essentially just a lexer that calls the ParseState method // for each token. // The parser recognizes POSIX extended regular expressions // excluding backreferences, collating elements, and collating // classes. It also allows the empty string as a regular expression // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. // See regexp.h for rationale. #include <stddef.h> #include <stdint.h> #include <string.h> #include <algorithm> #include <string> #include <vector> #include "absl/base/attributes.h" #include "absl/base/macros.h" #include "absl/log/absl_log.h" #include "absl/strings/ascii.h" #include "absl/strings/string_view.h" #include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/unicode_casefold.h" #include "re2/unicode_groups.h" #include "re2/walker-inl.h" #include "util/utf.h" #if defined(RE2_USE_ICU) #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/utypes.h" #endif namespace re2 { // Controls the maximum repeat count permitted by the parser. static int maximum_repeat_count = …; void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { … } // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of // Regexp pointers called the stack. Left parenthesis and vertical // bar markers are also placed on the stack, as Regexps with // non-standard opcodes. // Scanning a left parenthesis causes the parser to push a left parenthesis // marker on the stack. // Scanning a vertical bar causes the parser to pop the stack until it finds a // vertical bar or left parenthesis marker (not popping the marker), // concatenate all the popped results, and push them back on // the stack (DoConcatenation). // Scanning a right parenthesis causes the parser to act as though it // has seen a vertical bar, which then leaves the top of the stack in the // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. // The parser pops all this off the stack and creates an alternation of the // regexps (DoAlternation). class Regexp::ParseState { … }; // Pseudo-operators - only on parse stack. const RegexpOp kLeftParen = …; const RegexpOp kVerticalBar = …; Regexp::ParseState::ParseState(ParseFlags flags, absl::string_view whole_regexp, RegexpStatus* status) : … { … } // Cleans up by freeing all the regexps on the stack. Regexp::ParseState::~ParseState() { … } // Finishes the regexp if necessary, preparing it for use in // a more complex expression. // If it is a CharClassBuilder, converts into a CharClass. Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { … } // Pushes the given regular expression onto the stack. // Could check for too much memory used here. bool Regexp::ParseState::PushRegexp(Regexp* re) { … } // Searches the case folding tables and returns the CaseFold* that contains r. // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. // If there isn't one, returns NULL. const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) { … } // Returns the result of applying the fold f to the rune r. Rune ApplyFold(const CaseFold* f, Rune r) { … } // Returns the next Rune in r's folding cycle (see unicode_casefold.h). // Examples: // CycleFoldRune('A') = 'a' // CycleFoldRune('a') = 'A' // // CycleFoldRune('K') = 'k' // CycleFoldRune('k') = 0x212A (Kelvin) // CycleFoldRune(0x212A) = 'K' // // CycleFoldRune('?') = '?' Rune CycleFoldRune(Rune r) { … } // Add lo-hi to the class, along with their fold-equivalent characters. static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) { … } // Add lo-hi to the class, along with their fold-equivalent characters. // If lo-hi is already in the class, assume that the fold-equivalent // chars are there too, so there's no work to do. static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { … } // Pushes the literal rune r onto the stack. bool Regexp::ParseState::PushLiteral(Rune r) { … } // Pushes a ^ onto the stack. bool Regexp::ParseState::PushCaret() { … } // Pushes a \b or \B onto the stack. bool Regexp::ParseState::PushWordBoundary(bool word) { … } // Pushes a $ onto the stack. bool Regexp::ParseState::PushDollar() { … } // Pushes a . onto the stack. bool Regexp::ParseState::PushDot() { … } // Pushes a regexp with the given op (and no args) onto the stack. bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { … } // Pushes a repeat operator regexp onto the stack. // A valid argument for the operator must already be on the stack. // The char c is the name of the operator, for use in error messages. bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy) { … } // RepetitionWalker reports whether the repetition regexp is valid. // Valid means that the combination of the top-level repetition // and any inner repetitions does not exceed n copies of the // innermost thing. // This rewalks the regexp tree and is called for every repetition, // so we have to worry about inducing quadratic behavior in the parser. // We avoid this by only using RepetitionWalker when min or max >= 2. // In that case the depth of any >= 2 nesting can only get to 9 without // triggering a parse error, so each subtree can only be rewalked 9 times. class RepetitionWalker : public Regexp::Walker<int> { … }; int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { … } int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, int* child_args, int nchild_args) { … } int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { … } // Pushes a repetition regexp onto the stack. // A valid argument for the operator must already be on the stack. bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s, bool nongreedy) { … } // Checks whether a particular regexp op is a marker. bool Regexp::ParseState::IsMarker(RegexpOp op) { … } // Processes a left parenthesis in the input. // Pushes a marker onto the stack. bool Regexp::ParseState::DoLeftParen(absl::string_view name) { … } // Pushes a non-capturing marker onto the stack. bool Regexp::ParseState::DoLeftParenNoCapture() { … } // Processes a vertical bar in the input. bool Regexp::ParseState::DoVerticalBar() { … } // Processes a right parenthesis in the input. bool Regexp::ParseState::DoRightParen() { … } // Processes the end of input, returning the final regexp. Regexp* Regexp::ParseState::DoFinish() { … } // Returns the leading regexp that re starts with. // The returned Regexp* points into a piece of re, // so it must not be used after the caller calls re->Decref(). Regexp* Regexp::LeadingRegexp(Regexp* re) { … } // Removes LeadingRegexp(re) from re and returns what's left. // Consumes the reference to re and may edit it in place. // If caller wants to hold on to LeadingRegexp(re), // must have already Incref'ed it. Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { … } // Returns the leading string that re starts with. // The returned Rune* points into a piece of re, // so it must not be used after the caller calls re->Decref(). Rune* Regexp::LeadingString(Regexp* re, int* nrune, Regexp::ParseFlags* flags) { … } // Removes the first n leading runes from the beginning of re. // Edits re in place. void Regexp::RemoveLeadingString(Regexp* re, int n) { … } // In the context of factoring alternations, a Splice is: a factored prefix or // merged character class computed by one iteration of one round of factoring; // the span of subexpressions of the alternation to be "spliced" (i.e. removed // and replaced); and, for a factored prefix, the number of suffixes after any // factoring that might have subsequently been performed on them. For a merged // character class, there are no suffixes, of course, so the field is ignored. struct Splice { … }; // Named so because it is used to implement an explicit stack, a Frame is: the // span of subexpressions of the alternation to be factored; the current round // of factoring; any Splices computed; and, for a factored prefix, an iterator // to the next Splice to be factored (i.e. in another Frame) because suffixes. struct Frame { … }; // Bundled into a class for friend access to Regexp without needing to declare // (or define) Splice in regexp.h. class FactorAlternationImpl { … }; // Factors common prefixes from alternation. // For example, // ABC|ABD|AEF|BCX|BCY // simplifies to // A(B(C|D)|EF)|BC(X|Y) // and thence to // A(B[CD]|EF)|BC[XY] // // Rewrites sub to contain simplified list to alternate and returns // the new length of sub. Adjusts reference counts accordingly // (incoming sub[i] decremented, outgoing sub[i] incremented). int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { … } void FactorAlternationImpl::Round1(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { … } void FactorAlternationImpl::Round2(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { … } void FactorAlternationImpl::Round3(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { … } // Collapse the regexps on top of the stack, down to the // first marker, into a new op node (op == kRegexpAlternate // or op == kRegexpConcat). void Regexp::ParseState::DoCollapse(RegexpOp op) { … } // Finishes the current concatenation, // collapsing it into a single regexp on the stack. void Regexp::ParseState::DoConcatenation() { … } // Finishes the current alternation, // collapsing it to a single regexp on the stack. void Regexp::ParseState::DoAlternation() { … } // Incremental conversion of concatenated literals into strings. // If top two elements on stack are both literal or string, // collapse into single string. // Don't walk down the stack -- the parser calls this frequently // enough that below the bottom two is known to be collapsed. // Only called when another regexp is about to be pushed // on the stack, so that the topmost literal is not being considered. // (Otherwise ab* would turn into (ab)*.) // If r >= 0, consider pushing a literal r on the stack. // Return whether that happened. bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { … } // Lexing routines. // Parses a decimal integer, storing it in *np. // Sets *s to span the remainder of the string. static bool ParseInteger(absl::string_view* s, int* np) { … } // Parses a repetition suffix like {1,2} or {2} or {2,}. // Sets *s to span the remainder of the string on success. // Sets *lo and *hi to the given range. // In the case of {2,}, the high number is unbounded; // sets *hi to -1 to signify this. // {,2} is NOT a valid suffix. // The Maybe in the name signifies that the regexp parse // doesn't fail even if ParseRepetition does, so the string_view // s must NOT be edited unless MaybeParseRepetition returns true. static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) { … } // Removes the next Rune from the string_view and stores it in *r. // Returns number of bytes removed from sp. // Behaves as though there is a terminating NUL at the end of sp. // Argument order is backwards from usual Google style // but consistent with chartorune. static int StringViewToRune(Rune* r, absl::string_view* sp, RegexpStatus* status) { … } // Returns whether name is valid UTF-8. // If not, sets status to kRegexpBadUTF8. static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) { … } // Is c a hex digit? static int IsHex(int c) { … } // Convert hex digit to value. static int UnHex(int c) { … } // Parse an escape sequence (e.g., \n, \{). // Sets *s to span the remainder of the string. // Sets *rp to the named character. static bool ParseEscape(absl::string_view* s, Rune* rp, RegexpStatus* status, int rune_max) { … } // Add a range to the character class, but exclude newline if asked. // Also handle case folding. void CharClassBuilder::AddRangeFlags( Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { … } // Look for a group with the given name. static const UGroup* LookupGroup(absl::string_view name, const UGroup* groups, int ngroups) { … } // Look for a POSIX group with the given name (e.g., "[:^alpha:]") static const UGroup* LookupPosixGroup(absl::string_view name) { … } static const UGroup* LookupPerlGroup(absl::string_view name) { … } #if !defined(RE2_USE_ICU) // Fake UGroup containing all Runes static URange16 any16[] = …; static URange32 any32[] = …; static UGroup anygroup = …; // Look for a Unicode group with the given name (e.g., "Han") static const UGroup* LookupUnicodeGroup(absl::string_view name) { … } #endif // Add a UGroup or its negation to the character class. static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign, Regexp::ParseFlags parse_flags) { … } // Maybe parse a Perl character class escape sequence. // Only recognizes the Perl character classes (\d \s \w \D \S \W), // not the Perl empty-string classes (\b \B \A \Z \z). // On success, sets *s to span the remainder of the string // and returns the corresponding UGroup. // The string_view must *NOT* be edited unless the call succeeds. const UGroup* MaybeParsePerlCCEscape(absl::string_view* s, Regexp::ParseFlags parse_flags) { … } enum ParseStatus { … }; // Maybe parses a Unicode character group like \p{Han} or \P{Han} // (the latter is a negated group). ParseStatus ParseUnicodeGroup(absl::string_view* s, Regexp::ParseFlags parse_flags, CharClassBuilder* cc, RegexpStatus* status) { … } // Parses a character class name like [:alnum:]. // Sets *s to span the remainder of the string. // Adds the ranges corresponding to the class to ranges. static ParseStatus ParseCCName(absl::string_view* s, Regexp::ParseFlags parse_flags, CharClassBuilder* cc, RegexpStatus* status) { … } // Parses a character inside a character class. // There are fewer special characters here than in the rest of the regexp. // Sets *s to span the remainder of the string. // Sets *rp to the character. bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp, absl::string_view whole_class, RegexpStatus* status) { … } // Parses a character class character, or, if the character // is followed by a hyphen, parses a character class range. // For single characters, rr->lo == rr->hi. // Sets *s to span the remainder of the string. // Sets *rp to the character. bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr, absl::string_view whole_class, RegexpStatus* status) { … } // Parses a possibly-negated character class expression like [^abx-z[:digit:]]. // Sets *s to span the remainder of the string. // Sets *out_re to the regexp for the class. bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re, RegexpStatus* status) { … } // Returns whether name is a valid capture name. static bool IsValidCaptureName(absl::string_view name) { … } // Parses a Perl flag setting or non-capturing group or both, // like (?i) or (?: or (?i:. Removes from s, updates parse state. // The caller must check that s begins with "(?". // Returns true on success. If the Perl flag is not // well-formed or not supported, sets status_ and returns false. bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { … } // Converts latin1 (assumed to be encoded as Latin1 bytes) // into UTF8 encoding in string. // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is // deprecated and because it rejects code points 0x80-0x9F. void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) { … } // Parses the regular expression given by s, // returning the corresponding Regexp tree. // The caller must Decref the return value when done with it. // Returns NULL on error. Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags, RegexpStatus* status) { … } } // namespace re2