// Copyright 2006 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Rewrite POSIX and other features in re // to use simple extended regular expression features. // Also sort and simplify character classes. #include <stddef.h> #include <algorithm> #include <string> #include "absl/log/absl_log.h" #include "absl/strings/string_view.h" #include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/walker-inl.h" #include "util/utf.h" namespace re2 { // Parses the regexp src and then simplifies it and sets *dst to the // string representation of the simplified form. Returns true on success. // Returns false and sets *error (if error != NULL) on error. bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags, std::string* dst, RegexpStatus* status) { … } // Assuming the simple_ flags on the children are accurate, // is this Regexp* simple? bool Regexp::ComputeSimple() { … } // Walker subclass used by Simplify. // Coalesces runs of star/plus/quest/repeat of the same literal along with any // occurrences of that literal into repeats of that literal. It also works for // char classes, any char and any byte. // PostVisit creates the coalesced result, which should then be simplified. class CoalesceWalker : public Regexp::Walker<Regexp*> { … }; // Walker subclass used by Simplify. // The simplify walk is purely post-recursive: given the simplified children, // PostVisit creates the simplified result. // The child_args are simplified Regexp*s. class SimplifyWalker : public Regexp::Walker<Regexp*> { … }; // Simplifies a regular expression, returning a new regexp. // The new regexp uses traditional Unix egrep features only, // plus the Perl (?:) non-capturing parentheses. // Otherwise, no POSIX or Perl additions. The new regexp // captures exactly the same subexpressions (with the same indices) // as the original. // Does not edit current object. // Caller must Decref() return value when done with it. Regexp* Regexp::Simplify() { … } #define Simplify … // Utility function for PostVisit implementations that compares re->sub() with // child_args to determine whether any child_args changed. In the common case, // where nothing changed, calls Decref() for all child_args and returns false, // so PostVisit must return re->Incref(). Otherwise, returns true. static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { … } Regexp* CoalesceWalker::Copy(Regexp* re) { … } Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { … } Regexp* CoalesceWalker::PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) { … } bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { … } void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { … } Regexp* SimplifyWalker::Copy(Regexp* re) { … } Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { … } Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { … } Regexp* SimplifyWalker::PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) { … } // Creates a concatenation of two Regexp, consuming refs to re1 and re2. // Returns a new Regexp, handing the ref to the caller. Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags parse_flags) { … } // Returns true if re is an empty-width op. static bool IsEmptyOp(Regexp* re) { … } // Simplifies the expression re{min,max} in terms of *, +, and ?. // Returns a new regexp. Does not edit re. Does not consume reference to re. // Caller must Decref return value when done with it. // The result will *not* necessarily have the right capturing parens // if you call ToString() and re-parse it: (x){2} becomes (x)(x), // but in the Regexp* representation, both (x) are marked as $1. Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags f) { … } // Simplifies a character class. // Caller must Decref return value when done with it. Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { … } } // namespace re2