regexp.h | Explore in Territory

// Copyright 2006 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_

// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------

// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps.  RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
//   1.  Backreferences (optional in POSIX EREs) are not supported.
//         (Supporting them precludes the use of DFA-based
//          matching engines.)
//
//   2.  Collating elements and collation classes are not supported.
//         (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse.  In particular, many of the basic Perl additions
// are available.  The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression.  These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
//   1. The syntax and meaning of regular expressions is guaranteed
//      to be that used by Regexp's parser, not the syntax expected
//      by the sublibrary.  Regexp might accept a restricted or
//      expanded syntax for regular expressions as compared with
//      the sublibrary.  As long as Regexp can translate from its
//      internal form into the sublibrary's, clients need not know
//      exactly which sublibrary they are using.
//
//   2. The sublibrary parsers are bypassed.  For whatever reason,
//      sublibrary regular expression parsers often have security
//      problems.  For example, plan9grep's regular expression parser
//      has a buffer overflow in its handling of large character
//      classes, and PCRE's parser has had buffer overflow problems
//      in the past.  Security-team requires sandboxing of sublibrary
//      regular expression parsers.  Avoiding the sublibrary parsers
//      avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.

#include <stddef.h>
#include <stdint.h>

#include <map>
#include <set>
#include <string>

#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/strings/string_view.h"
#include "util/utf.h"

namespace re2 {

// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp { … };

// Keep in sync with string list in regexp.cc
enum RegexpStatusCode { … };

// Error status for certain operations.
class RegexpStatus { … };

// Compiled form; see prog.h
class Prog;

struct RuneRange { … };

// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess { … };

class CharClassBuilder;

class CharClass { … };

class Regexp { … };

// Character class set: contains non-overlapping, non-abutting RuneRanges.
RuneRangeSet;

class CharClassBuilder { … };

// Bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) { … }

inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) { … }

inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) { … }

inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { … }

}  // namespace re2

#endif  // RE2_REGEXP_H_
chromium/third_party/re2/src/re2/regexp.h