parse.cc | Explore in Territory

// Copyright 2006 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Regular expression parser.

// The parser is a simple precedence-based parser with a
// manual stack.  The parsing work is done by the methods
// of the ParseState class.  The Regexp::Parse function is
// essentially just a lexer that calls the ParseState method
// for each token.

// The parser recognizes POSIX extended regular expressions
// excluding backreferences, collating elements, and collating
// classes.  It also allows the empty string as a regular expression
// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
// See regexp.h for rationale.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <algorithm>
#include <string>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/macros.h"
#include "absl/log/absl_log.h"
#include "absl/strings/ascii.h"
#include "absl/strings/string_view.h"
#include "re2/pod_array.h"
#include "re2/regexp.h"
#include "re2/unicode_casefold.h"
#include "re2/unicode_groups.h"
#include "re2/walker-inl.h"
#include "util/utf.h"

#if defined(RE2_USE_ICU)
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#endif

namespace re2 {

// Controls the maximum repeat count permitted by the parser.
static int maximum_repeat_count = …;

void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { … }

// Regular expression parse state.
// The list of parsed regexps so far is maintained as a vector of
// Regexp pointers called the stack.  Left parenthesis and vertical
// bar markers are also placed on the stack, as Regexps with
// non-standard opcodes.
// Scanning a left parenthesis causes the parser to push a left parenthesis
// marker on the stack.
// Scanning a vertical bar causes the parser to pop the stack until it finds a
// vertical bar or left parenthesis marker (not popping the marker),
// concatenate all the popped results, and push them back on
// the stack (DoConcatenation).
// Scanning a right parenthesis causes the parser to act as though it
// has seen a vertical bar, which then leaves the top of the stack in the
// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
// The parser pops all this off the stack and creates an alternation of the
// regexps (DoAlternation).

class Regexp::ParseState { … };

// Pseudo-operators - only on parse stack.
const RegexpOp kLeftParen = …;
const RegexpOp kVerticalBar = …;

Regexp::ParseState::ParseState(ParseFlags flags,
                               absl::string_view whole_regexp,
                               RegexpStatus* status)
  : … { … }

// Cleans up by freeing all the regexps on the stack.
Regexp::ParseState::~ParseState() { … }

// Finishes the regexp if necessary, preparing it for use in
// a more complex expression.
// If it is a CharClassBuilder, converts into a CharClass.
Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { … }

// Pushes the given regular expression onto the stack.
// Could check for too much memory used here.
bool Regexp::ParseState::PushRegexp(Regexp* re) { … }

// Searches the case folding tables and returns the CaseFold* that contains r.
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
// If there isn't one, returns NULL.
const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) { … }

// Returns the result of applying the fold f to the rune r.
Rune ApplyFold(const CaseFold* f, Rune r) { … }

// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
// Examples:
//   CycleFoldRune('A') = 'a'
//   CycleFoldRune('a') = 'A'
//
//   CycleFoldRune('K') = 'k'
//   CycleFoldRune('k') = 0x212A (Kelvin)
//   CycleFoldRune(0x212A) = 'K'
//
//   CycleFoldRune('?') = '?'
Rune CycleFoldRune(Rune r) { … }

// Add lo-hi to the class, along with their fold-equivalent characters.
static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) { … }

// Add lo-hi to the class, along with their fold-equivalent characters.
// If lo-hi is already in the class, assume that the fold-equivalent
// chars are there too, so there's no work to do.
static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { … }

// Pushes the literal rune r onto the stack.
bool Regexp::ParseState::PushLiteral(Rune r) { … }

// Pushes a ^ onto the stack.
bool Regexp::ParseState::PushCaret() { … }

// Pushes a \b or \B onto the stack.
bool Regexp::ParseState::PushWordBoundary(bool word) { … }

// Pushes a $ onto the stack.
bool Regexp::ParseState::PushDollar() { … }

// Pushes a . onto the stack.
bool Regexp::ParseState::PushDot() { … }

// Pushes a regexp with the given op (and no args) onto the stack.
bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { … }

// Pushes a repeat operator regexp onto the stack.
// A valid argument for the operator must already be on the stack.
// The char c is the name of the operator, for use in error messages.
bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
                                      bool nongreedy) { … }

// RepetitionWalker reports whether the repetition regexp is valid.
// Valid means that the combination of the top-level repetition
// and any inner repetitions does not exceed n copies of the
// innermost thing.
// This rewalks the regexp tree and is called for every repetition,
// so we have to worry about inducing quadratic behavior in the parser.
// We avoid this by only using RepetitionWalker when min or max >= 2.
// In that case the depth of any >= 2 nesting can only get to 9 without
// triggering a parse error, so each subtree can only be rewalked 9 times.
class RepetitionWalker : public Regexp::Walker<int> { … };

int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { … }

int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
                                int* child_args, int nchild_args) { … }

int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { … }

// Pushes a repetition regexp onto the stack.
// A valid argument for the operator must already be on the stack.
bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
                                        bool nongreedy) { … }

// Checks whether a particular regexp op is a marker.
bool Regexp::ParseState::IsMarker(RegexpOp op) { … }

// Processes a left parenthesis in the input.
// Pushes a marker onto the stack.
bool Regexp::ParseState::DoLeftParen(absl::string_view name) { … }

// Pushes a non-capturing marker onto the stack.
bool Regexp::ParseState::DoLeftParenNoCapture() { … }

// Processes a vertical bar in the input.
bool Regexp::ParseState::DoVerticalBar() { … }

// Processes a right parenthesis in the input.
bool Regexp::ParseState::DoRightParen() { … }

// Processes the end of input, returning the final regexp.
Regexp* Regexp::ParseState::DoFinish() { … }

// Returns the leading regexp that re starts with.
// The returned Regexp* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
Regexp* Regexp::LeadingRegexp(Regexp* re) { … }

// Removes LeadingRegexp(re) from re and returns what's left.
// Consumes the reference to re and may edit it in place.
// If caller wants to hold on to LeadingRegexp(re),
// must have already Incref'ed it.
Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { … }

// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
Rune* Regexp::LeadingString(Regexp* re, int* nrune,
                            Regexp::ParseFlags* flags) { … }

// Removes the first n leading runes from the beginning of re.
// Edits re in place.
void Regexp::RemoveLeadingString(Regexp* re, int n) { … }

// In the context of factoring alternations, a Splice is: a factored prefix or
// merged character class computed by one iteration of one round of factoring;
// the span of subexpressions of the alternation to be "spliced" (i.e. removed
// and replaced); and, for a factored prefix, the number of suffixes after any
// factoring that might have subsequently been performed on them. For a merged
// character class, there are no suffixes, of course, so the field is ignored.
struct Splice { … };

// Named so because it is used to implement an explicit stack, a Frame is: the
// span of subexpressions of the alternation to be factored; the current round
// of factoring; any Splices computed; and, for a factored prefix, an iterator
// to the next Splice to be factored (i.e. in another Frame) because suffixes.
struct Frame { … };

// Bundled into a class for friend access to Regexp without needing to declare
// (or define) Splice in regexp.h.
class FactorAlternationImpl { … };

// Factors common prefixes from alternation.
// For example,
//     ABC|ABD|AEF|BCX|BCY
// simplifies to
//     A(B(C|D)|EF)|BC(X|Y)
// and thence to
//     A(B[CD]|EF)|BC[XY]
//
// Rewrites sub to contain simplified list to alternate and returns
// the new length of sub.  Adjusts reference counts accordingly
// (incoming sub[i] decremented, outgoing sub[i] incremented).
int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { … }

void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
                                   Regexp::ParseFlags flags,
                                   std::vector<Splice>* splices) { … }

void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
                                   Regexp::ParseFlags flags,
                                   std::vector<Splice>* splices) { … }

void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
                                   Regexp::ParseFlags flags,
                                   std::vector<Splice>* splices) { … }

// Collapse the regexps on top of the stack, down to the
// first marker, into a new op node (op == kRegexpAlternate
// or op == kRegexpConcat).
void Regexp::ParseState::DoCollapse(RegexpOp op) { … }

// Finishes the current concatenation,
// collapsing it into a single regexp on the stack.
void Regexp::ParseState::DoConcatenation() { … }

// Finishes the current alternation,
// collapsing it to a single regexp on the stack.
void Regexp::ParseState::DoAlternation() { … }

// Incremental conversion of concatenated literals into strings.
// If top two elements on stack are both literal or string,
// collapse into single string.
// Don't walk down the stack -- the parser calls this frequently
// enough that below the bottom two is known to be collapsed.
// Only called when another regexp is about to be pushed
// on the stack, so that the topmost literal is not being considered.
// (Otherwise ab* would turn into (ab)*.)
// If r >= 0, consider pushing a literal r on the stack.
// Return whether that happened.
bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { … }

// Lexing routines.

// Parses a decimal integer, storing it in *np.
// Sets *s to span the remainder of the string.
static bool ParseInteger(absl::string_view* s, int* np) { … }

// Parses a repetition suffix like {1,2} or {2} or {2,}.
// Sets *s to span the remainder of the string on success.
// Sets *lo and *hi to the given range.
// In the case of {2,}, the high number is unbounded;
// sets *hi to -1 to signify this.
// {,2} is NOT a valid suffix.
// The Maybe in the name signifies that the regexp parse
// doesn't fail even if ParseRepetition does, so the string_view
// s must NOT be edited unless MaybeParseRepetition returns true.
static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) { … }

// Removes the next Rune from the string_view and stores it in *r.
// Returns number of bytes removed from sp.
// Behaves as though there is a terminating NUL at the end of sp.
// Argument order is backwards from usual Google style
// but consistent with chartorune.
static int StringViewToRune(Rune* r, absl::string_view* sp,
                            RegexpStatus* status) { … }

// Returns whether name is valid UTF-8.
// If not, sets status to kRegexpBadUTF8.
static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) { … }

// Is c a hex digit?
static int IsHex(int c) { … }

// Convert hex digit to value.
static int UnHex(int c) { … }

// Parse an escape sequence (e.g., \n, \{).
// Sets *s to span the remainder of the string.
// Sets *rp to the named character.
static bool ParseEscape(absl::string_view* s, Rune* rp,
                        RegexpStatus* status, int rune_max) { … }

// Add a range to the character class, but exclude newline if asked.
// Also handle case folding.
void CharClassBuilder::AddRangeFlags(
    Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { … }

// Look for a group with the given name.
static const UGroup* LookupGroup(absl::string_view name,
                                 const UGroup* groups, int ngroups) { … }

// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
static const UGroup* LookupPosixGroup(absl::string_view name) { … }

static const UGroup* LookupPerlGroup(absl::string_view name) { … }

#if !defined(RE2_USE_ICU)
// Fake UGroup containing all Runes
static URange16 any16[] = …;
static URange32 any32[] = …;
static UGroup anygroup = …;

// Look for a Unicode group with the given name (e.g., "Han")
static const UGroup* LookupUnicodeGroup(absl::string_view name) { … }
#endif

// Add a UGroup or its negation to the character class.
static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
                      Regexp::ParseFlags parse_flags) { … }

// Maybe parse a Perl character class escape sequence.
// Only recognizes the Perl character classes (\d \s \w \D \S \W),
// not the Perl empty-string classes (\b \B \A \Z \z).
// On success, sets *s to span the remainder of the string
// and returns the corresponding UGroup.
// The string_view must *NOT* be edited unless the call succeeds.
const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
                                     Regexp::ParseFlags parse_flags) { … }

enum ParseStatus { … };

// Maybe parses a Unicode character group like \p{Han} or \P{Han}
// (the latter is a negated group).
ParseStatus ParseUnicodeGroup(absl::string_view* s,
                              Regexp::ParseFlags parse_flags,
                              CharClassBuilder* cc, RegexpStatus* status) { … }

// Parses a character class name like [:alnum:].
// Sets *s to span the remainder of the string.
// Adds the ranges corresponding to the class to ranges.
static ParseStatus ParseCCName(absl::string_view* s,
                               Regexp::ParseFlags parse_flags,
                               CharClassBuilder* cc, RegexpStatus* status) { … }

// Parses a character inside a character class.
// There are fewer special characters here than in the rest of the regexp.
// Sets *s to span the remainder of the string.
// Sets *rp to the character.
bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
                                          absl::string_view whole_class,
                                          RegexpStatus* status) { … }

// Parses a character class character, or, if the character
// is followed by a hyphen, parses a character class range.
// For single characters, rr->lo == rr->hi.
// Sets *s to span the remainder of the string.
// Sets *rp to the character.
bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
                                      absl::string_view whole_class,
                                      RegexpStatus* status) { … }

// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
// Sets *s to span the remainder of the string.
// Sets *out_re to the regexp for the class.
bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
                                        RegexpStatus* status) { … }

// Returns whether name is a valid capture name.
static bool IsValidCaptureName(absl::string_view name) { … }

// Parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:.  Removes from s, updates parse state.
// The caller must check that s begins with "(?".
// Returns true on success.  If the Perl flag is not
// well-formed or not supported, sets status_ and returns false.
bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { … }

// Converts latin1 (assumed to be encoded as Latin1 bytes)
// into UTF8 encoding in string.
// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
// deprecated and because it rejects code points 0x80-0x9F.
void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) { … }

// Parses the regular expression given by s,
// returning the corresponding Regexp tree.
// The caller must Decref the return value when done with it.
// Returns NULL on error.
Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
                      RegexpStatus* status) { … }

}  // namespace re2
chromium/third_party/re2/src/re2/parse.cc