regexcmp.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  file:  regexcmp.cpp
//
//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the ICU regular expression compiler, which is responsible
//  for processing a regular expression pattern into the compiled form that
//  is used by the match finding engine.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "patternprops.h"
#include "putilimp.h"
#include "cmemory.h"
#include "cstr.h"
#include "cstring.h"
#include "uvectr32.h"
#include "uvectr64.h"
#include "uassert.h"
#include "uinvchar.h"

#include "regeximp.h"
#include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
#include "regexcmp.h"
#include "regexst.h"
#include "regextxt.h"



U_NAMESPACE_BEGIN


//------------------------------------------------------------------------------
//
//  Constructor.
//
//------------------------------------------------------------------------------
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : … { … }

static const char16_t   chAmp       = …;      // '&'
static const char16_t   chDash      = …;      // '-'


//------------------------------------------------------------------------------
//
//  Destructor
//
//------------------------------------------------------------------------------
RegexCompile::~RegexCompile() { … }

static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { … }

//------------------------------------------------------------------------------
//
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
//                           The state tables are hand-written in the file regexcst.txt,
//                           and converted to the form used here by a perl
//                           script regexcst.pl
//
//------------------------------------------------------------------------------
void    RegexCompile::compile(
                         const UnicodeString &pat,   // Source pat to be compiled.
                         UParseError &pp,            // Error position info
                         UErrorCode &e)              // Error Code
{ … }

//
//   compile, UText mode
//     All the work is actually done here.
//
void    RegexCompile::compile(
                         UText *pat,                 // Source pat to be compiled.
                         UParseError &pp,            // Error position info
                         UErrorCode &e)              // Error Code
{ … }





//------------------------------------------------------------------------------
//
//  doParseAction        Do some action during regex pattern parsing.
//                       Called by the parse state machine.
//
//                       Generation of the match engine PCode happens here, or
//                       in functions called from the parse actions defined here.
//
//
//------------------------------------------------------------------------------
UBool RegexCompile::doParseActions(int32_t action)
{ … }



//------------------------------------------------------------------------------
//
//   literalChar           We've encountered a literal character from the pattern,
//                             or an escape sequence that reduces to a character.
//                         Add it to the string containing all literal chars/strings from
//                             the pattern.
//
//------------------------------------------------------------------------------
void RegexCompile::literalChar(UChar32 c)  { … }


//------------------------------------------------------------------------------
//
//    fixLiterals           When compiling something that can follow a literal
//                          string in a pattern, emit the code to match the
//                          accumulated literal string.
//
//                          Optionally, split the last char of the string off into
//                          a single "ONE_CHAR" operation, so that quantifiers can
//                          apply to that char alone.  Example:   abc*
//                          The * must apply to the 'c' only.
//
//------------------------------------------------------------------------------
void    RegexCompile::fixLiterals(UBool split) { … }


int32_t RegexCompile::buildOp(int32_t type, int32_t val) { … }


//------------------------------------------------------------------------------
//
//   appendOp()             Append a new instruction onto the compiled pattern
//                          Includes error checking, limiting the size of the
//                          pattern to lengths that can be represented in the
//                          24 bit operand field of an instruction.
//
//------------------------------------------------------------------------------
void RegexCompile::appendOp(int32_t op) { … }

void RegexCompile::appendOp(int32_t type, int32_t val) { … }


//------------------------------------------------------------------------------
//
//   insertOp()             Insert a slot for a new opcode into the already
//                          compiled pattern code.
//
//                          Fill the slot with a NOP.  Our caller will replace it
//                          with what they really wanted.
//
//------------------------------------------------------------------------------
void   RegexCompile::insertOp(int32_t where) { … }


//------------------------------------------------------------------------------
//
//   allocateData()        Allocate storage in the matcher's static data area.
//                         Return the index for the newly allocated data.
//                         The storage won't actually exist until we are running a match
//                         operation, but the storage indexes are inserted into various
//                         opcodes while compiling the pattern.
//
//------------------------------------------------------------------------------
int32_t RegexCompile::allocateData(int32_t size) { … }


//------------------------------------------------------------------------------
//
//   allocateStackData()   Allocate space in the back-tracking stack frame.
//                         Return the index for the newly allocated data.
//                         The frame indexes are inserted into various
//                         opcodes while compiling the pattern, meaning that frame
//                         size must be restricted to the size that will fit
//                         as an operand (24 bits).
//
//------------------------------------------------------------------------------
int32_t RegexCompile::allocateStackData(int32_t size) { … }


//------------------------------------------------------------------------------
//
//   blockTopLoc()          Find or create a location in the compiled pattern
//                          at the start of the operation or block that has
//                          just been compiled.  Needed when a quantifier (* or
//                          whatever) appears, and we need to add an operation
//                          at the start of the thing being quantified.
//
//                          (Parenthesized Blocks) have a slot with a NOP that
//                          is reserved for this purpose.  .* or similar don't
//                          and a slot needs to be added.
//
//       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
//                                         at the returned location.
//                                 false - just return the address,
//                                         do not reserve a location there.
//
//------------------------------------------------------------------------------
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) { … }



//------------------------------------------------------------------------------
//
//    handleCloseParen      When compiling a close paren, we need to go back
//                          and fix up any JMP or SAVE operations within the
//                          parenthesized block that need to target the end
//                          of the block.  The locations of these are kept on
//                          the paretheses stack.
//
//                          This function is called both when encountering a
//                          real ) and at the end of the pattern.
//
//------------------------------------------------------------------------------
void  RegexCompile::handleCloseParen() { … }



//------------------------------------------------------------------------------
//
//   compileSet       Compile the pattern operations for a reference to a
//                    UnicodeSet.
//
//------------------------------------------------------------------------------
void        RegexCompile::compileSet(UnicodeSet *theSet)
{ … }


//------------------------------------------------------------------------------
//
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
//                      Except for the specific opcodes used, the code is the same
//                      for all three types (greedy, non-greedy, possessive) of
//                      intervals.  The opcodes are supplied as parameters.
//                      (There are two sets of opcodes - greedy & possessive use the
//                      same ones, while non-greedy has it's own.)
//
//                      The code for interval loops has this form:
//                         0  CTR_INIT   counter loc (in stack frame)
//                         1             5  patt address of CTR_LOOP at bottom of block
//                         2             min count
//                         3             max count   (-1 for unbounded)
//                         4  ...        block to be iterated over
//                         5  CTR_LOOP
//
//                       In
//------------------------------------------------------------------------------
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
{ … }



UBool RegexCompile::compileInlineInterval() { … }



//------------------------------------------------------------------------------
//
//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
//                         set of characters that could potentially begin a case-insensitive 
//                         match of a string beginning with that character, using full Unicode
//                         case insensitive matching.
//
//          This is used in optimizing find().
//
//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
//          misses cases like this:
//             A string from the pattern begins with 'ss' (although all we know
//                 in this context is that it begins with 's')
//             The pattern could match a string beginning with a German sharp-s
//
//           To the ordinary case closure for a character c, we add all other
//           characters cx where the case closure of cx includes a string form that begins
//           with the original character c.
//
//           This function could be made smarter. The full pattern string is available
//           and it would be possible to verify that the extra characters being added
//           to the starting set fully match, rather than having just a first-char of the
//           folded form match.
//
//------------------------------------------------------------------------------
void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { … }


// Increment with overflow check.
// val and delta will both be positive.

static int32_t safeIncrement(int32_t val, int32_t delta) { … }


//------------------------------------------------------------------------------
//
//   matchStartType    Determine how a match can start.
//                     Used to optimize find() operations.
//
//                     Operation is very similar to minMatchLength().  Walk the compiled
//                     pattern, keeping an on-going minimum-match-length.  For any
//                     op where the min match coming in is zero, add that ops possible
//                     starting matches to the possible starts for the overall pattern.
//
//------------------------------------------------------------------------------
void   RegexCompile::matchStartType() { … }



//------------------------------------------------------------------------------
//
//   minMatchLength    Calculate the length of the shortest string that could
//                     match the specified pattern.
//                     Length is in 16 bit code units, not code points.
//
//                     The calculated length may not be exact.  The returned
//                     value may be shorter than the actual minimum; it must
//                     never be longer.
//
//                     start and end are the range of p-code operations to be
//                     examined.  The endpoints are included in the range.
//
//------------------------------------------------------------------------------
int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) { … }

//------------------------------------------------------------------------------
//
//   maxMatchLength    Calculate the length of the longest string that could
//                     match the specified pattern.
//                     Length is in 16 bit code units, not code points.
//
//                     The calculated length may not be exact.  The returned
//                     value may be longer than the actual maximum; it must
//                     never be shorter.
//
//                     start, end: the range of the pattern to check.
//                     end is inclusive.
//
//------------------------------------------------------------------------------
int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) { … }


//------------------------------------------------------------------------------
//
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
//                Extra NOPs are inserted for some constructs during the initial
//                code generation to provide locations that may be patched later.
//                Many end up unneeded, and are removed by this function.
//
//                In order to minimize the number of passes through the pattern,
//                back-reference fixup is also performed here (adjusting
//                back-reference operands to point to the correct frame offsets).
//
//------------------------------------------------------------------------------
void RegexCompile::stripNOPs() { … }




//------------------------------------------------------------------------------
//
//  Error         Report a rule parse error.
//                Only report it if no previous error has been recorded.
//
//------------------------------------------------------------------------------
void RegexCompile::error(UErrorCode e) { … }


//
//  Assorted Unicode character constants.
//     Numeric because there is no portable way to enter them as literals.
//     (Think EBCDIC).
//
static const char16_t   chCR        = …;      // New lines, for terminating comments.
static const char16_t   chLF        = …;      // Line Feed
static const char16_t   chPound     = …;      // '#', introduces a comment.
static const char16_t   chDigit0    = …;      // '0'
static const char16_t   chDigit7    = …;      // '9'
static const char16_t   chColon     = …;      // ':'
static const char16_t   chE         = …;      // 'E'
static const char16_t   chQ         = …;      // 'Q'
//static const char16_t   chN         = 0x4E;      // 'N'
static const char16_t   chP         = …;      // 'P'
static const char16_t   chBackSlash = …;      // '\'  introduces a char escape
//static const char16_t   chLBracket  = 0x5b;      // '['
static const char16_t   chRBracket  = …;      // ']'
static const char16_t   chUp        = …;      // '^'
static const char16_t   chLowerP    = …;
static const char16_t   chLBrace    = …;      // '{'
static const char16_t   chRBrace    = …;      // '}'
static const char16_t   chNEL       = …;      //    NEL newline variant
static const char16_t   chLS        = …;    //    Unicode Line Separator


//------------------------------------------------------------------------------
//
//  nextCharLL    Low Level Next Char from the regex pattern.
//                Get a char from the string, keep track of input position
//                     for error reporting.
//
//------------------------------------------------------------------------------
UChar32  RegexCompile::nextCharLL() { … }

//------------------------------------------------------------------------------
//
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
//                 character without actually getting it.
//
//------------------------------------------------------------------------------
UChar32  RegexCompile::peekCharLL() { … }


//------------------------------------------------------------------------------
//
//   nextChar     for pattern scanning.  At this level, we handle stripping
//                out comments and processing some backslash character escapes.
//                The rest of the pattern grammar is handled at the next level up.
//
//------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) { … }



//------------------------------------------------------------------------------
//
//  scanNamedChar
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
//
//             The scan position will be at the 'N'.  On return
//             the scan position should be just after the '}'
//
//             Return the UChar32
//
//------------------------------------------------------------------------------
UChar32  RegexCompile::scanNamedChar() { … }

//------------------------------------------------------------------------------
//
//  scanProp   Construct a UnicodeSet from the text at the current scan
//             position, which will be of the form \p{whaterver}
//
//             The scan position will be at the 'p' or 'P'.  On return
//             the scan position should be just after the '}'
//
//             Return a UnicodeSet, constructed from the \P pattern,
//             or nullptr if the pattern is invalid.
//
//------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanProp() { … }

//------------------------------------------------------------------------------
//
//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
//             position, which is expected be of the form [:property expression:]
//
//             The scan position will be at the opening ':'.  On return
//             the scan position must be on the closing ']'
//
//             Return a UnicodeSet constructed from the pattern,
//             or nullptr if this is not a valid POSIX-style set expression.
//             If not a property expression, restore the initial scan position
//                (to the opening ':')
//
//               Note:  the opening '[:' is not sufficient to guarantee that
//                      this is a [:property:] expression.
//                      [:'+=,] is a perfectly good ordinary set expression that
//                              happens to include ':' as one of its characters.
//
//------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanPosixProp() { … }

static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { … }

//
//  Create a Unicode Set from a Unicode Property expression.
//     This is common code underlying both \p{...} and [:...:] expressions.
//     Includes trying the Java "properties" that aren't supported as
//     normal ICU UnicodeSet properties
//
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { … }


//
//  SetEval   Part of the evaluation of [set expressions].
//            Perform any pending (stacked) operations with precedence
//            equal or greater to that of the next operator encountered
//            in the expression.
//
void RegexCompile::setEval(int32_t nextOp) { … }

void RegexCompile::setPushOp(int32_t op) { … }

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
chromium/third_party/icu/source/i18n/regexcmp.cpp