// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // file: regexcmp.cpp // // Copyright (C) 2002-2016 International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the ICU regular expression compiler, which is responsible // for processing a regular expression pattern into the compiled form that // is used by the match finding engine. // #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" #include "unicode/regex.h" #include "unicode/utf.h" #include "unicode/utf16.h" #include "patternprops.h" #include "putilimp.h" #include "cmemory.h" #include "cstr.h" #include "cstring.h" #include "uvectr32.h" #include "uvectr64.h" #include "uassert.h" #include "uinvchar.h" #include "regeximp.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. #include "regexcmp.h" #include "regexst.h" #include "regextxt.h" U_NAMESPACE_BEGIN //------------------------------------------------------------------------------ // // Constructor. // //------------------------------------------------------------------------------ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : … { … } static const char16_t chAmp = …; // '&' static const char16_t chDash = …; // '-' //------------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------------ RegexCompile::~RegexCompile() { … } static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { … } //------------------------------------------------------------------------------ // // Compile regex pattern. The state machine for rexexp pattern parsing is here. // The state tables are hand-written in the file regexcst.txt, // and converted to the form used here by a perl // script regexcst.pl // //------------------------------------------------------------------------------ void RegexCompile::compile( const UnicodeString &pat, // Source pat to be compiled. UParseError &pp, // Error position info UErrorCode &e) // Error Code { … } // // compile, UText mode // All the work is actually done here. // void RegexCompile::compile( UText *pat, // Source pat to be compiled. UParseError &pp, // Error position info UErrorCode &e) // Error Code { … } //------------------------------------------------------------------------------ // // doParseAction Do some action during regex pattern parsing. // Called by the parse state machine. // // Generation of the match engine PCode happens here, or // in functions called from the parse actions defined here. // // //------------------------------------------------------------------------------ UBool RegexCompile::doParseActions(int32_t action) { … } //------------------------------------------------------------------------------ // // literalChar We've encountered a literal character from the pattern, // or an escape sequence that reduces to a character. // Add it to the string containing all literal chars/strings from // the pattern. // //------------------------------------------------------------------------------ void RegexCompile::literalChar(UChar32 c) { … } //------------------------------------------------------------------------------ // // fixLiterals When compiling something that can follow a literal // string in a pattern, emit the code to match the // accumulated literal string. // // Optionally, split the last char of the string off into // a single "ONE_CHAR" operation, so that quantifiers can // apply to that char alone. Example: abc* // The * must apply to the 'c' only. // //------------------------------------------------------------------------------ void RegexCompile::fixLiterals(UBool split) { … } int32_t RegexCompile::buildOp(int32_t type, int32_t val) { … } //------------------------------------------------------------------------------ // // appendOp() Append a new instruction onto the compiled pattern // Includes error checking, limiting the size of the // pattern to lengths that can be represented in the // 24 bit operand field of an instruction. // //------------------------------------------------------------------------------ void RegexCompile::appendOp(int32_t op) { … } void RegexCompile::appendOp(int32_t type, int32_t val) { … } //------------------------------------------------------------------------------ // // insertOp() Insert a slot for a new opcode into the already // compiled pattern code. // // Fill the slot with a NOP. Our caller will replace it // with what they really wanted. // //------------------------------------------------------------------------------ void RegexCompile::insertOp(int32_t where) { … } //------------------------------------------------------------------------------ // // allocateData() Allocate storage in the matcher's static data area. // Return the index for the newly allocated data. // The storage won't actually exist until we are running a match // operation, but the storage indexes are inserted into various // opcodes while compiling the pattern. // //------------------------------------------------------------------------------ int32_t RegexCompile::allocateData(int32_t size) { … } //------------------------------------------------------------------------------ // // allocateStackData() Allocate space in the back-tracking stack frame. // Return the index for the newly allocated data. // The frame indexes are inserted into various // opcodes while compiling the pattern, meaning that frame // size must be restricted to the size that will fit // as an operand (24 bits). // //------------------------------------------------------------------------------ int32_t RegexCompile::allocateStackData(int32_t size) { … } //------------------------------------------------------------------------------ // // blockTopLoc() Find or create a location in the compiled pattern // at the start of the operation or block that has // just been compiled. Needed when a quantifier (* or // whatever) appears, and we need to add an operation // at the start of the thing being quantified. // // (Parenthesized Blocks) have a slot with a NOP that // is reserved for this purpose. .* or similar don't // and a slot needs to be added. // // parameter reserveLoc : true - ensure that there is space to add an opcode // at the returned location. // false - just return the address, // do not reserve a location there. // //------------------------------------------------------------------------------ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { … } //------------------------------------------------------------------------------ // // handleCloseParen When compiling a close paren, we need to go back // and fix up any JMP or SAVE operations within the // parenthesized block that need to target the end // of the block. The locations of these are kept on // the paretheses stack. // // This function is called both when encountering a // real ) and at the end of the pattern. // //------------------------------------------------------------------------------ void RegexCompile::handleCloseParen() { … } //------------------------------------------------------------------------------ // // compileSet Compile the pattern operations for a reference to a // UnicodeSet. // //------------------------------------------------------------------------------ void RegexCompile::compileSet(UnicodeSet *theSet) { … } //------------------------------------------------------------------------------ // // compileInterval Generate the code for a {min, max} style interval quantifier. // Except for the specific opcodes used, the code is the same // for all three types (greedy, non-greedy, possessive) of // intervals. The opcodes are supplied as parameters. // (There are two sets of opcodes - greedy & possessive use the // same ones, while non-greedy has it's own.) // // The code for interval loops has this form: // 0 CTR_INIT counter loc (in stack frame) // 1 5 patt address of CTR_LOOP at bottom of block // 2 min count // 3 max count (-1 for unbounded) // 4 ... block to be iterated over // 5 CTR_LOOP // // In //------------------------------------------------------------------------------ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) { … } UBool RegexCompile::compileInlineInterval() { … } //------------------------------------------------------------------------------ // // caseInsensitiveStart given a single code point from a pattern string, determine the // set of characters that could potentially begin a case-insensitive // match of a string beginning with that character, using full Unicode // case insensitive matching. // // This is used in optimizing find(). // // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but // misses cases like this: // A string from the pattern begins with 'ss' (although all we know // in this context is that it begins with 's') // The pattern could match a string beginning with a German sharp-s // // To the ordinary case closure for a character c, we add all other // characters cx where the case closure of cx includes a string form that begins // with the original character c. // // This function could be made smarter. The full pattern string is available // and it would be possible to verify that the extra characters being added // to the starting set fully match, rather than having just a first-char of the // folded form match. // //------------------------------------------------------------------------------ void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { … } // Increment with overflow check. // val and delta will both be positive. static int32_t safeIncrement(int32_t val, int32_t delta) { … } //------------------------------------------------------------------------------ // // matchStartType Determine how a match can start. // Used to optimize find() operations. // // Operation is very similar to minMatchLength(). Walk the compiled // pattern, keeping an on-going minimum-match-length. For any // op where the min match coming in is zero, add that ops possible // starting matches to the possible starts for the overall pattern. // //------------------------------------------------------------------------------ void RegexCompile::matchStartType() { … } //------------------------------------------------------------------------------ // // minMatchLength Calculate the length of the shortest string that could // match the specified pattern. // Length is in 16 bit code units, not code points. // // The calculated length may not be exact. The returned // value may be shorter than the actual minimum; it must // never be longer. // // start and end are the range of p-code operations to be // examined. The endpoints are included in the range. // //------------------------------------------------------------------------------ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { … } //------------------------------------------------------------------------------ // // maxMatchLength Calculate the length of the longest string that could // match the specified pattern. // Length is in 16 bit code units, not code points. // // The calculated length may not be exact. The returned // value may be longer than the actual maximum; it must // never be shorter. // // start, end: the range of the pattern to check. // end is inclusive. // //------------------------------------------------------------------------------ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { … } //------------------------------------------------------------------------------ // // stripNOPs Remove any NOP operations from the compiled pattern code. // Extra NOPs are inserted for some constructs during the initial // code generation to provide locations that may be patched later. // Many end up unneeded, and are removed by this function. // // In order to minimize the number of passes through the pattern, // back-reference fixup is also performed here (adjusting // back-reference operands to point to the correct frame offsets). // //------------------------------------------------------------------------------ void RegexCompile::stripNOPs() { … } //------------------------------------------------------------------------------ // // Error Report a rule parse error. // Only report it if no previous error has been recorded. // //------------------------------------------------------------------------------ void RegexCompile::error(UErrorCode e) { … } // // Assorted Unicode character constants. // Numeric because there is no portable way to enter them as literals. // (Think EBCDIC). // static const char16_t chCR = …; // New lines, for terminating comments. static const char16_t chLF = …; // Line Feed static const char16_t chPound = …; // '#', introduces a comment. static const char16_t chDigit0 = …; // '0' static const char16_t chDigit7 = …; // '9' static const char16_t chColon = …; // ':' static const char16_t chE = …; // 'E' static const char16_t chQ = …; // 'Q' //static const char16_t chN = 0x4E; // 'N' static const char16_t chP = …; // 'P' static const char16_t chBackSlash = …; // '\' introduces a char escape //static const char16_t chLBracket = 0x5b; // '[' static const char16_t chRBracket = …; // ']' static const char16_t chUp = …; // '^' static const char16_t chLowerP = …; static const char16_t chLBrace = …; // '{' static const char16_t chRBrace = …; // '}' static const char16_t chNEL = …; // NEL newline variant static const char16_t chLS = …; // Unicode Line Separator //------------------------------------------------------------------------------ // // nextCharLL Low Level Next Char from the regex pattern. // Get a char from the string, keep track of input position // for error reporting. // //------------------------------------------------------------------------------ UChar32 RegexCompile::nextCharLL() { … } //------------------------------------------------------------------------------ // // peekCharLL Low Level Character Scanning, sneak a peek at the next // character without actually getting it. // //------------------------------------------------------------------------------ UChar32 RegexCompile::peekCharLL() { … } //------------------------------------------------------------------------------ // // nextChar for pattern scanning. At this level, we handle stripping // out comments and processing some backslash character escapes. // The rest of the pattern grammar is handled at the next level up. // //------------------------------------------------------------------------------ void RegexCompile::nextChar(RegexPatternChar &c) { … } //------------------------------------------------------------------------------ // // scanNamedChar // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. // // The scan position will be at the 'N'. On return // the scan position should be just after the '}' // // Return the UChar32 // //------------------------------------------------------------------------------ UChar32 RegexCompile::scanNamedChar() { … } //------------------------------------------------------------------------------ // // scanProp Construct a UnicodeSet from the text at the current scan // position, which will be of the form \p{whaterver} // // The scan position will be at the 'p' or 'P'. On return // the scan position should be just after the '}' // // Return a UnicodeSet, constructed from the \P pattern, // or nullptr if the pattern is invalid. // //------------------------------------------------------------------------------ UnicodeSet *RegexCompile::scanProp() { … } //------------------------------------------------------------------------------ // // scanPosixProp Construct a UnicodeSet from the text at the current scan // position, which is expected be of the form [:property expression:] // // The scan position will be at the opening ':'. On return // the scan position must be on the closing ']' // // Return a UnicodeSet constructed from the pattern, // or nullptr if this is not a valid POSIX-style set expression. // If not a property expression, restore the initial scan position // (to the opening ':') // // Note: the opening '[:' is not sufficient to guarantee that // this is a [:property:] expression. // [:'+=,] is a perfectly good ordinary set expression that // happens to include ':' as one of its characters. // //------------------------------------------------------------------------------ UnicodeSet *RegexCompile::scanPosixProp() { … } static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { … } // // Create a Unicode Set from a Unicode Property expression. // This is common code underlying both \p{...} and [:...:] expressions. // Includes trying the Java "properties" that aren't supported as // normal ICU UnicodeSet properties // UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { … } // // SetEval Part of the evaluation of [set expressions]. // Perform any pending (stacked) operations with precedence // equal or greater to that of the next operator encountered // in the expression. // void RegexCompile::setEval(int32_t nextOp) { … } void RegexCompile::setPushOp(int32_t op) { … } U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS