/************************************************* * Perl-Compatible Regular Expressions * *************************************************/ /* PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge New API code Copyright (c) 2016-2023 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
----------------------------------------------------------------------------- */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #define NLBLOCK … #define PSSTART … #define PSEND … #include "pcre2_internal.h" /* In rare error cases debugging might require calling pcre2_printint(). */ #if 0 #ifdef EBCDIC #define PRINTABLE … #else #define PRINTABLE … #endif #include "pcre2_printint.c" #define DEBUG_CALL_PRINTINT #endif /* Other debugging code can be enabled by these defines. */ /* #define DEBUG_SHOW_CAPTURES */ /* #define DEBUG_SHOW_PARSED */ /* There are a few things that vary with different code unit sizes. Handle them by defining macros in order to minimize #if usage. */ #if PCRE2_CODE_UNIT_WIDTH == 8 #define STRING_UTFn_RIGHTPAR … #define XDIGIT … #else /* Either 16-bit or 32-bit */ #define XDIGIT(c) … #if PCRE2_CODE_UNIT_WIDTH == 16 #define STRING_UTFn_RIGHTPAR … #else /* 32-bit */ #define STRING_UTFn_RIGHTPAR … #endif #endif /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which consists of uint32_t elements. Assume that if uint32_t can't hold it, two of them will be able to (i.e. assume a 64-bit world). */ #if PCRE2_SIZE_MAX <= UINT32_MAX #define PUTOFFSET … #define GETOFFSET … #define GETPLUSOFFSET … #define READPLUSOFFSET … #define SKIPOFFSET … #define SIZEOFFSET … #else #define PUTOFFSET(s,p) … #define GETOFFSET(s,p) … #define GETPLUSOFFSET(s,p) … #define READPLUSOFFSET(s,p) … #define SKIPOFFSET(p) … #define SIZEOFFSET … #endif /* Macros for manipulating elements of the parsed pattern vector. 
*/ #define META_CODE(x) … #define META_DATA(x) … #define META_DIFF(x,y) … /* Function definitions to allow mutual recursion */ #ifdef SUPPORT_UNICODE static unsigned int add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t, compile_block *, const uint32_t *, unsigned int); #endif static int compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *, open_capitem *, compile_block *, PCRE2_SIZE *); static int get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *, compile_block *); static BOOL set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, compile_block *); static int check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *, compile_block *, int *); /************************************************* * Code parameters and static tables * *************************************************/ #define MAX_GROUP_NUMBER … #define MAX_REPEAT_COUNT … #define REPEAT_UNLIMITED … /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in different ways in the different pattern scans. The parsing and group- identifying pre-scan uses it to handle nesting, and needs it to be 16-bit aligned for this. Having defined the size in code units, we set up C16_WORK_SIZE as the number of elements in the 16-bit vector. During the first compiling phase, when determining how much memory is required, the regex is partly compiled into this space, but the compiled parts are discarded as soon as they can be, so that hopefully there will never be an overrun. The code does, however, check for an overrun, which can occur for pathological patterns. The size of the workspace depends on LINK_SIZE because the length of compiled items varies with this. In the real compile phase, this workspace is not currently used. 
*/ #define COMPILE_WORK_SIZE … #define C16_WORK_SIZE … /* A uint32_t vector is used for caching information about the size of capturing groups, to improve performance. A default is created on the stack of this size. */ #define GROUPINFO_DEFAULT_SIZE … /* The overrun tests check for a slightly smaller size so that they detect the overrun before it actually does run off the end of the data block. */ #define WORK_SIZE_SAFETY_MARGIN … /* This value determines the size of the initial vector that is used for remembering named groups during the pre-compile. It is allocated on the stack, but if it is too small, it is expanded, in a similar way to the workspace. The value is the number of slots in the list. */ #define NAMED_GROUP_LIST_SIZE … /* The pre-compiling pass over the pattern creates a parsed pattern in a vector of uint32_t. For short patterns this lives on the stack, with this size. Heap memory is used for longer patterns. */ #define PARSED_PATTERN_DEFAULT_SIZE … /* Maximum length value to check against when making sure that the variable that holds the compiled pattern length does not overflow. We make it a bit less than INT_MAX to allow for adding in group terminating code units, so that we don't have to check them every time. */ #define OFLOW_MAX … /* Code values for parsed patterns, which are stored in a vector of 32-bit unsigned ints. Values less than META_END are literal data values. The coding for identifying the item is in the top 16-bits, leaving 16 bits for the additional data that some of them need. The META_CODE, META_DATA, and META_DIFF macros are used to manipulate parsed pattern elements. NOTE: When these definitions are changed, the table of extra lengths for each code (meta_extra_lengths, just below) must be updated to remain in step. 
*/ #define META_END … #define META_ALT … #define META_ATOMIC … #define META_BACKREF … #define META_BACKREF_BYNAME … #define META_BIGVALUE … #define META_CALLOUT_NUMBER … #define META_CALLOUT_STRING … #define META_CAPTURE … #define META_CIRCUMFLEX … #define META_CLASS … #define META_CLASS_EMPTY … #define META_CLASS_EMPTY_NOT … #define META_CLASS_END … #define META_CLASS_NOT … #define META_COND_ASSERT … #define META_COND_DEFINE … #define META_COND_NAME … #define META_COND_NUMBER … #define META_COND_RNAME … #define META_COND_RNUMBER … #define META_COND_VERSION … #define META_DOLLAR … #define META_DOT … #define META_ESCAPE … #define META_KET … #define META_NOCAPTURE … #define META_OPTIONS … #define META_POSIX … #define META_POSIX_NEG … #define META_RANGE_ESCAPED … #define META_RANGE_LITERAL … #define META_RECURSE … #define META_RECURSE_BYNAME … #define META_SCRIPT_RUN … /* These must be kept together to make it easy to check that an assertion is present where expected in a conditional group. */ #define META_LOOKAHEAD … #define META_LOOKAHEADNOT … #define META_LOOKBEHIND … #define META_LOOKBEHINDNOT … /* These cannot be conditions */ #define META_LOOKAHEAD_NA … #define META_LOOKBEHIND_NA … /* These must be kept in this order, with consecutive values, and the _ARG versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument versions. */ #define META_MARK … #define META_ACCEPT … #define META_FAIL … #define META_COMMIT … #define META_COMMIT_ARG … #define META_PRUNE … #define META_PRUNE_ARG … #define META_SKIP … #define META_SKIP_ARG … #define META_THEN … #define META_THEN_ARG … /* These must be kept in groups of adjacent 3 values, and all together. 
*/ #define META_ASTERISK … #define META_ASTERISK_PLUS … #define META_ASTERISK_QUERY … #define META_PLUS … #define META_PLUS_PLUS … #define META_PLUS_QUERY … #define META_QUERY … #define META_QUERY_PLUS … #define META_QUERY_QUERY … #define META_MINMAX … #define META_MINMAX_PLUS … #define META_MINMAX_QUERY … #define META_FIRST_QUANTIFIER … #define META_LAST_QUANTIFIER … /* This is a special "meta code" that is used only to distinguish (*asr: from (*sr: in the table of alphabetic assertions. It is never stored in the parsed pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is therefore no need for it to have a length entry, so use a high value. */ #define META_ATOMIC_SCRIPT_RUN … /* Table of extra lengths for each of the meta codes. Must be kept in step with the definitions above. For some items these values are a basic length to which a variable amount has to be added. */ static unsigned char meta_extra_lengths[] = …; /* Types for skipping parts of a parsed pattern. */ enum { … }; /* Macro for setting individual bits in class bitmaps. It took some experimenting to figure out how to stop gcc 5.3.0 from warning with -Wconversion. This version gets a warning: #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7)) Let's hope the apparently less efficient version isn't actually so bad if the compiler is clever with identical subexpressions. */ #define SETBIT(a,b) … /* Values and flags for the unsigned xxcuflags variables that accompany xxcu variables, which are concerned with first and required code units. A value greater than or equal to REQ_NONE means "no code unit set"; otherwise the matching xxcu variable is set, and the low valued bits are relevant. */ #define REQ_UNSET … #define REQ_NONE … #define REQ_CASELESS … #define REQ_VARY … /* These flags are used in the groupinfo vector. 
*/

#define GI_SET_FIXED_LENGTH …
#define GI_NOT_FIXED_LENGTH …
#define GI_FIXED_LENGTH_MASK …

/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
comparison). */

#define IS_DIGIT(x) …

/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

#ifndef EBCDIC
static const uint8_t xdigitab[] = …;/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The table has one entry per byte value, eight entries per row. Each row comment
gives the range of byte values in the row (in decimal, or as the EBCDIC
character where there is a printable one) and, on every second row, the
hexadecimal value of the first index of that row. The only entries that are not
0xff are 0x81-0x86 (a-f) and 0xC1-0xC6 (A-F), which hold 10-15, and 0xF0-0xF9
(0-9), which hold 0-9. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
#endif  /* EBCDIC */


/* Table for handling alphanumeric escaped characters. Positive returns are
simple data values; negative values are for special things like \d and so on.
Zero means further processing is needed (for things like \x), or the escape is
invalid. */

/* This is the "normal" table for ASCII systems or for EBCDIC systems running
in UTF-8 mode. It runs from '0' to 'z'. */

#ifndef EBCDIC
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE(c) …

static const short int escapes[] = …;

#else

/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a because it is defined as 'a', which of course picks up the ASCII value.
*/

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE …
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE …
#endif

/* Each row label below is the hexadecimal code of the 8-aligned group that the
row belongs to. The first row has only seven entries and the last row only two:
this matches a table that starts at EBCDIC 'a' (0x81, the value tested by the
#if above) rather than at 0x80, and that stops at 0xF9. */

static const short int escapes[] = {
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  F8 */      0,        0
};

/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters, one for each
of those values, in order. */

static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";

#endif   /* EBCDIC */


/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */

verbitem;

static const char verbnames[] = …;

static const verbitem verbs[] = …;

static const int verbcount = …;

/* Verb opcodes, indexed by their META code offset from META_MARK. */

static const uint32_t verbops[] = …;

/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */

alasitem;

static const char alasnames[] = …;

static const alasitem alasmeta[] = …;

static const int alascount = …;

/* Offsets from OP_STAR for case-independent and negative repeat opcodes.
*/ static uint32_t chartypeoffset[] = …; /* Tables of names of POSIX character classes and their lengths. The names are now all in a single string, to reduce the number of relocations when a shared library is dynamically loaded. The list of lengths is terminated by a zero length entry. The first three must be alpha, lower, upper, as this is assumed for handling case independence. The indices for several classes are needed, so identify them. */ static const char posix_names[] = …; static const uint8_t posix_name_lengths[] = …; #define PC_DIGIT … #define PC_GRAPH … #define PC_PRINT … #define PC_PUNCT … #define PC_XDIGIT … /* Table of class bit maps for each POSIX class. Each class is formed from a base map, with an optional addition or removal of another map. Then, for some classes, there is some additional tweaking: for [:blank:] the vertical space characters are removed, and for [:alpha:] and [:alnum:] the underscore character is removed. The triples in the table consist of the base map offset, second map offset or -1 if no second map, and a non-negative value for map addition or a negative value for map subtraction (if there are two maps). The absolute value of the third field has these meanings: 0 => no tweaking, 1 => remove vertical space characters, 2 => remove underscore. */ static const int posix_class_maps[] = …; #ifdef SUPPORT_UNICODE /* The POSIX class Unicode property substitutes that are used in UCP mode must be in the order of the POSIX class names, defined above. */ static int posix_substitutes[] = …; #define POSIX_SUBSIZE … #endif /* SUPPORT_UNICODE */ /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset are allowed. */ #define PUBLIC_LITERAL_COMPILE_OPTIONS … #define PUBLIC_COMPILE_OPTIONS … #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS … #define PUBLIC_COMPILE_EXTRA_OPTIONS … /* Compile time error code numbers. They are given names so that they can more easily be tracked. 
When a new number is added, the tables called eint1 and eint2 in pcre2posix.c may need to be updated, and a new error text must be added to compile_error_texts in pcre2_error.c. Also, the error codes in pcre2.h.in must be updated - their values are exactly 100 greater than these values. */ enum { … }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is generic and always supported. */ enum { … }; pso; /* NB: STRING_UTFn_RIGHTPAR contains the length as well */ static const pso pso_list[] = …; /* This table is used when converting repeating opcodes into possessified versions as a result of an explicit possessive quantifier such as ++. A zero value means there is no possessified version - in those cases the item in question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT because all relevant opcodes are less than that. */ static const uint8_t opcode_possessify[] = …; #ifdef DEBUG_SHOW_PARSED /************************************************* * Show the parsed pattern for debugging * *************************************************/ /* For debugging the pre-scan, this code, which outputs the parsed data vector, can be enabled. 
*/ static void show_parsed(compile_block *cb) { uint32_t *pptr = cb->parsed_pattern; for (;;) { int max, min; PCRE2_SIZE offset; uint32_t i; uint32_t length; uint32_t meta_arg = META_DATA(*pptr); fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr); if (*pptr < META_END) { if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr); pptr++; } else switch (META_CODE(*pptr++)) { default: fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n"); return; case META_END: fprintf(stderr, "META_END\n"); return; case META_CAPTURE: fprintf(stderr, "META_CAPTURE %d", meta_arg); break; case META_RECURSE: GETOFFSET(offset, pptr); fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); break; case META_BACKREF: if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; else GETOFFSET(offset, pptr); fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); break; case META_ESCAPE: if (meta_arg == ESC_P || meta_arg == ESC_p) { uint32_t ptype = *pptr >> 16; uint32_t pvalue = *pptr++ & 0xffff; fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p', ptype, pvalue); } else { uint32_t cc; /* There's just one escape we might have here that isn't negated in the escapes table. 
*/ if (meta_arg == ESC_g) cc = CHAR_g; else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++) { if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break; } if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK; fprintf(stderr, "META \\%c", cc); } break; case META_MINMAX: min = *pptr++; max = *pptr++; if (max != REPEAT_UNLIMITED) fprintf(stderr, "META {%d,%d}", min, max); else fprintf(stderr, "META {%d,}", min); break; case META_MINMAX_QUERY: min = *pptr++; max = *pptr++; if (max != REPEAT_UNLIMITED) fprintf(stderr, "META {%d,%d}?", min, max); else fprintf(stderr, "META {%d,}?", min); break; case META_MINMAX_PLUS: min = *pptr++; max = *pptr++; if (max != REPEAT_UNLIMITED) fprintf(stderr, "META {%d,%d}+", min, max); else fprintf(stderr, "META {%d,}+", min); break; case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break; case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break; case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break; case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break; case META_DOT: fprintf(stderr, "META_DOT"); break; case META_ASTERISK: fprintf(stderr, "META *"); break; case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break; case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break; case META_PLUS: fprintf(stderr, "META +"); break; case META_PLUS_QUERY: fprintf(stderr, "META +?"); break; case META_PLUS_PLUS: fprintf(stderr, "META ++"); break; case META_QUERY: fprintf(stderr, "META ?"); break; case META_QUERY_QUERY: fprintf(stderr, "META ??"); break; case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break; case META_ATOMIC: fprintf(stderr, "META (?>"); break; case META_NOCAPTURE: fprintf(stderr, "META (?:"); break; case META_LOOKAHEAD: fprintf(stderr, "META (?="); break; case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break; case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break; case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break; case META_KET: fprintf(stderr, "META )"); break; case 
META_ALT: fprintf(stderr, "META | %d", meta_arg); break; case META_CLASS: fprintf(stderr, "META ["); break; case META_CLASS_NOT: fprintf(stderr, "META [^"); break; case META_CLASS_END: fprintf(stderr, "META ]"); break; case META_CLASS_EMPTY: fprintf(stderr, "META []"); break; case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break; case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break; case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break; case META_SKIP: fprintf(stderr, "META (*SKIP)"); break; case META_THEN: fprintf(stderr, "META (*THEN)"); break; case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]); pptr += 2; break; case META_LOOKBEHIND: fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr); pptr += 2; break; case META_LOOKBEHIND_NA: fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr); pptr += 2; break; case META_LOOKBEHINDNOT: fprintf(stderr, "META (?<! 
%d %d", meta_arg, *pptr); pptr += 2; break; case META_CALLOUT_NUMBER: fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0], pptr[1]); pptr += 3; break; case META_CALLOUT_STRING: { uint32_t patoffset = *pptr++; /* Offset of next pattern item */ uint32_t patlength = *pptr++; /* Length of next pattern item */ fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength); } break; case META_RECURSE_BYNAME: fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; case META_BACKREF_BYNAME: fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; case META_COND_NUMBER: fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); pptr++; break; case META_COND_DEFINE: fprintf(stderr, "META (?(DEFINE) offset="); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; case META_COND_VERSION: fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">="); fprintf(stderr, "%d.", *pptr++); fprintf(stderr, "%d)", *pptr++); break; case META_COND_NAME: fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; case META_COND_RNAME: fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; /* This is kept as a name, because it might be. 
*/ case META_COND_RNUMBER: fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); GETOFFSET(offset, pptr); fprintf(stderr, "%zd", offset); break; case META_MARK: fprintf(stderr, "META (*MARK:"); goto SHOWARG; case META_COMMIT_ARG: fprintf(stderr, "META (*COMMIT:"); goto SHOWARG; case META_PRUNE_ARG: fprintf(stderr, "META (*PRUNE:"); goto SHOWARG; case META_SKIP_ARG: fprintf(stderr, "META (*SKIP:"); goto SHOWARG; case META_THEN_ARG: fprintf(stderr, "META (*THEN:"); SHOWARG: length = *pptr++; for (i = 0; i < length; i++) { uint32_t cc = *pptr++; if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc); else fprintf(stderr, "\\x{%x}", cc); } fprintf(stderr, ") length=%u", length); break; } fprintf(stderr, "\n"); } return; } #endif /* DEBUG_SHOW_PARSED */ /************************************************* * Copy compiled code * *************************************************/ /* Compiled JIT code cannot be copied, so the new compiled block has no associated JIT data. */ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION pcre2_code_copy(const pcre2_code *code) { … } /************************************************* * Copy compiled code and character tables * *************************************************/ /* Compiled JIT code cannot be copied, so the new compiled block has no associated JIT data. This version of code_copy also makes a separate copy of the character tables. */ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION pcre2_code_copy_with_tables(const pcre2_code *code) { … } /************************************************* * Free compiled code * *************************************************/ PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION pcre2_code_free(pcre2_code *code) { … } /************************************************* * Read a number, possibly signed * *************************************************/ /* This function is used to read numbers in the pattern. The initial pointer must be at the sign or first digit of the number. 
When relative values (introduced by + or -) are allowed, they are relative group numbers, and the result must be greater than zero. Arguments: ptrptr points to the character pointer variable ptrend points to the end of the input string allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this max_value the largest number allowed max_error the error to give for an over-large number intptr where to put the result errorcodeptr where to put an error code Returns: TRUE - a number was read FALSE - errorcode == 0 => no number was found errorcode != 0 => an error occurred */ static BOOL read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) { … } /************************************************* * Read repeat counts * *************************************************/ /* Read an item of the form {n,m} and return the values when non-NULL pointers are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a larger value is used for "unlimited". We have to use signed arguments for read_number() because it is capable of returning a signed value. As of Perl 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and tabs after { and before } and between the numbers and the comma, so we do too. 
Arguments: ptrptr points to pointer to character after '{' ptrend pointer to end of input minp if not NULL, pointer to int for min maxp if not NULL, pointer to int for max errorcodeptr points to error code variable Returns: FALSE if not a repeat quantifier, errorcode set zero FALSE on error, with errorcode set non-zero TRUE on success, with pointer updated to point after '}' */ static BOOL read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, uint32_t *maxp, int *errorcodeptr) { … } /************************************************* * Handle escapes * *************************************************/ /* This function is called when a \ has been encountered. It either returns a positive value for a simple escape such as \d, or 0 for a data character, which is placed in chptr. A backreference to group n is returned as negative n. On entry, ptr is pointing at the character after \. On exit, it points after the final code unit of the escape sequence. This function is also called from pcre2_substitute() to handle escape sequences in replacement strings. In this case, the cb argument is NULL, and in the case of escapes that have further processing, only sequences that define a data character are recognised. The isclass argument is not relevant; the options argument is the final value of the compiled pattern's options. 
Arguments: ptrptr points to the input position pointer ptrend points to the end of the input chptr points to a returned data character errorcodeptr points to the errorcode variable (containing zero) options the current options bits xoptions the current extra options bits isclass TRUE if inside a character class cb compile data block or NULL when called from pcre2_substitute() Returns: zero => a data character positive => a special escape sequence negative => a numerical back reference on error, errorcodeptr is set non-zero */ int PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass, compile_block *cb) { … } #ifdef SUPPORT_UNICODE /************************************************* * Handle \P and \p * *************************************************/ /* This function is called after \P or \p has been encountered, provided that PCRE2 is compiled with support for UTF and Unicode properties. On entry, the contents of ptrptr are pointing after the P or p. On exit, it is left pointing after the final code unit of the escape sequence. Arguments: ptrptr the pattern position pointer negptr a boolean that is set TRUE for negation else FALSE ptypeptr an unsigned int that is set to the type value pdataptr an unsigned int that is set to the detailed property value errorcodeptr the error code variable cb the compile data Returns: TRUE if the type value was found, or FALSE for an invalid type */ static BOOL get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) { … } #endif /************************************************* * Check for POSIX class syntax * *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is encountered in a character class. It checks whether this is followed by a sequence of characters terminated by a matching ":]" or ".]" or "=]". 
If we reach an unescaped ']' without the special preceding character, return FALSE. Originally, this function only recognized a sequence of letters between the terminators, but it seems that Perl recognizes any sequence of characters, though of course unknown POSIX names are subsequently rejected. Perl gives an "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE didn't consider this to be a POSIX class. Likewise for [:1234:]. The problem in trying to be exactly like Perl is in the handling of escapes. We have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code below handles the special cases \\ and \], but does not try to do any other escape processing. This makes it different from Perl for cases such as [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, I think. A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. It seems that the appearance of a nested POSIX class supersedes an apparent external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or a digit. This is handled by returning FALSE if the start of a new group with the same terminator is encountered, since the next closing sequence must close the nested group, not the outer one. In Perl, unescaped square brackets may also appear as part of class names. For example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not seem right at all. PCRE does not allow closing square brackets in POSIX class names. 
Arguments: ptr pointer to the character after the initial [ (colon, dot, equals) ptrend pointer to the end of the pattern endptr where to return a pointer to the terminating ':', '.', or '=' Returns: TRUE or FALSE */ static BOOL check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) { … } /************************************************* * Check POSIX class name * *************************************************/ /* This function is called to check the name given in a POSIX-style class entry such as [:alnum:]. Arguments: ptr points to the first letter len the length of the name Returns: a value representing the name, or -1 if unknown */ static int check_posix_name(PCRE2_SPTR ptr, int len) { … } /************************************************* * Read a subpattern or VERB name * *************************************************/ /* This function is called from parse_regex() below whenever it needs to read the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial pointer must be to the preceding character. If that character is '*' we are reading a verb or alpha assertion name. The pointer is updated to point after the name, for a VERB or alpha assertion name, or after the name's terminator for a subpattern name. Returning both the offset and the name pointer is redundant information, but some callers use one and some the other, so it is simplest just to return both. When the name is in braces, spaces and tabs are allowed (and ignored) at either end. 
Arguments: ptrptr points to the character pointer variable ptrend points to the end of the input string utf true if the input is UTF-encoded terminator the terminator of a subpattern name must be this offsetptr where to put the offset from the start of the pattern nameptr where to put a pointer to the name in the input namelenptr where to put the length of the name errorcodeptr where to put an error code cb pointer to the compile data block Returns: TRUE if a name was read FALSE otherwise, with error code set */ static BOOL read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator, PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, int *errorcodeptr, compile_block *cb) { … } /************************************************* * Manage callouts at start of cycle * *************************************************/ /* At the start of a new item in parse_regex() we are able to record the details of the previous item in a prior callout, and also to set up an automatic callout if enabled. Avoid having two adjacent automatic callouts, which would otherwise happen for items such as \Q that contribute nothing to the parsed pattern. Arguments: ptr current pattern pointer pcalloutptr points to a pointer to previous callout, or NULL auto_callout TRUE if auto_callouts are enabled parsed_pattern the parsed pattern pointer cb compile block Returns: possibly updated parsed_pattern pointer. */ static uint32_t * manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, uint32_t *parsed_pattern, compile_block *cb) { … } /************************************************* * Handle \d, \D, \s, \S, \w, \W * *************************************************/ /* This function is called from parse_regex() below, both for freestanding escapes, and those within classes, to handle those escapes that may change when Unicode property support is requested. 
Note that PCRE2_UCP will never be set without Unicode support because that is checked when pcre2_compile() is called. Arguments: escape the ESC_... value parsed_pattern where to add the code options options bits xoptions extra options bits Returns: updated value of parsed_pattern */ static uint32_t * handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options, uint32_t xoptions) { … } /************************************************* * Parse regex and identify named groups * *************************************************/ /* This function is called first of all. It scans the pattern and does two things: (1) It identifies capturing groups and makes a table of named capturing groups so that information about them is fully available to both the compiling scans. (2) It writes a parsed version of the pattern with comments omitted and escapes processed into the parsed_pattern vector. Arguments: ptr points to the start of the pattern options compiling dynamic options (may change during the scan) has_lookbehind points to a boolean, set TRUE if a lookbehind is found cb pointer to the compile data block Returns: zero on success or a non-zero error code, with the error offset placed in the cb field */ /* A structure and some flags for dealing with nested groups. */ nest_save; #define NSF_RESET … #define NSF_CONDASSERT … #define NSF_ATOMICSR … /* Options that are changeable within the pattern must be tracked during parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, but all must be tracked so that META_OPTIONS items set the correct values for the main compiling phase. */ #define PARSE_TRACKED_OPTIONS … #define PARSE_TRACKED_EXTRA_OPTIONS … /* States used for analyzing ranges in character classes. The two OK values must be last. */ enum { … }; /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates the storing of literal values in the main parsed pattern, where they can always be quantified. 
*/ #if PCRE2_CODE_UNIT_WIDTH == 32 #define PARSED_LITERAL … #else #define PARSED_LITERAL(c, p) … #endif /* Here's the actual function. */ static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, compile_block *cb) { … } /************************************************* * Find first significant opcode * *************************************************/ /* This is called by several functions that scan a compiled expression looking for a fixed first character, or an anchoring opcode etc. It skips over things that do not influence this. For some calls, it makes sense to skip negative forward and all backward assertions, and also the \b assertion; for others it does not. Arguments: code pointer to the start of the group skipassert TRUE if certain assertions are to be skipped Returns: pointer to the first significant opcode */ static const PCRE2_UCHAR* first_significant_code(PCRE2_SPTR code, BOOL skipassert) { … } #ifdef SUPPORT_UNICODE /************************************************* * Get othercase range * *************************************************/ /* This function is passed the start and end of a class range in UCP mode. For single characters the range may be just one character long. The function searches up the characters, looking for ranges of characters in the "other" case. Each call returns the next one, updating the start address. A character with multiple other cases is returned on its own with a special return value. 
Arguments: cptr points to starting character value; updated d end value ocptr where to put start of othercase range odptr where to put end of othercase range restricted TRUE if caseless restriction applies Yield: -1 when no more 0 when a range is returned >0 the CASESET offset for char with multiple other cases; for this return, *ocptr contains the original */ static int get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, uint32_t *odptr, BOOL restricted) { … } #endif /* SUPPORT_UNICODE */ /************************************************* * Add a character or range to a class (internal) * *************************************************/ /* This function packages up the logic of adding a character or range of characters to a class. The character values in the arguments will be within the valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is called only from within the "add to class" group of functions, some of which are recursive and mutually recursive. The external entry point is add_to_class(). Arguments: classbits the bit map for characters < 256 uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb compile data start start of range character end end of range character Returns: the number of < 256 characters added the pointer to extra data is updated */ static unsigned int add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end) { … } #ifdef SUPPORT_UNICODE /************************************************* * Add a list of characters to a class (internal) * *************************************************/ /* This function is used for adding a list of case-equivalent characters to a class when in UTF mode. This function is called only from within add_to_class_internal(), with which it is mutually recursive. 
Arguments: classbits the bit map for characters < 256 uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb contains pointers to tables etc. p points to row of 32-bit values, terminated by NOTACHAR except character to omit; this is used when adding lists of case-equivalent characters to avoid including the one we already know about Returns: the number of < 256 characters added the pointer to extra data is updated */ static unsigned int add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except) { … } #endif /************************************************* * External entry point for add range to class * *************************************************/ /* This function sets the overall range so that the internal functions can try to avoid duplication when handling case-independence. Arguments: classbits the bit map for characters < 256 uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb compile data start start of range character end end of range character Returns: the number of < 256 characters added the pointer to extra data is updated */ static unsigned int add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end) { … } /************************************************* * External entry point for add list to class * *************************************************/ /* This function is used for adding a list of horizontal or vertical whitespace characters to a class. The list must be in order so that ranges of characters can be detected and handled appropriately. This function sets the overall range so that the internal functions can try to avoid duplication when handling case-independence. 
Arguments: classbits the bit map for characters < 256 uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb contains pointers to tables etc. p points to row of 32-bit values, terminated by NOTACHAR except character to omit; this is used when adding lists of case-equivalent characters to avoid including the one we already know about Returns: the number of < 256 characters added the pointer to extra data is updated */ static unsigned int add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except) { … } /************************************************* * Add characters not in a list to a class * *************************************************/ /* This function is used for adding the complement of a list of horizontal or vertical whitespace to a class. The list must be in order. Arguments: classbits the bit map for characters < 256 uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb contains pointers to tables etc. p points to row of 32-bit values, terminated by NOTACHAR Returns: the number of < 256 characters added the pointer to extra data is updated */ static unsigned int add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p) { … } /************************************************* * Find details of duplicate group names * *************************************************/ /* This is called from compile_branch() when it needs to know the index and count of duplicates in the names table when processing named backreferences, either directly, or as conditions. 
Arguments: name points to the name length the length of the name indexptr where to put the index countptr where to put the count of duplicates errorcodeptr where to put an error code cb the compile block Returns: TRUE if OK, FALSE if not, error code set */ static BOOL find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, int *countptr, int *errorcodeptr, compile_block *cb) { … } /************************************************* * Compile one branch * *************************************************/ /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If the options are changed during the branch, the pointer is used to change the external options bits. This function is used during the pre-compile phase when we are trying to find out the amount of memory needed, as well as during the real compile phase. The value of lengthptr distinguishes the two phases. Arguments: optionsptr pointer to the option bits xoptionsptr pointer to the extra option bits codeptr points to the pointer to the current code point pptrptr points to the current parsed pattern pointer errorcodeptr points to error code variable firstcuptr place to put the first required code unit firstcuflagsptr place to put the first code unit flags reqcuptr place to put the last required code unit reqcuflagsptr place to put the last required code unit flags bcptr points to current branch chain open_caps points to current capitem cb contains pointers to tables etc. 
lengthptr NULL during the real compile phase points to length accumulator during pre-compile phase Returns: 0 There's been an error, *errorcodeptr is non-zero +1 Success, this branch must match at least one character -1 Success, this branch may match an empty string */ static int compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps, compile_block *cb, PCRE2_SIZE *lengthptr) { … } /************************************************* * Compile regex: a sequence of alternatives * *************************************************/ /* On entry, pptr is pointing past the bracket meta, but on return it points to the closing bracket or META_END. The code variable is pointing at the code unit into which the BRA operator has been stored. This function is used during the pre-compile phase when we are trying to find out the amount of memory needed, as well as during the real compile phase. The value of lengthptr distinguishes the two phases. Arguments: options option bits, including any changes for this subpattern xoptions extra option bits, ditto codeptr -> the address of the current code pointer pptrptr -> the address of the current parsed pattern pointer errorcodeptr -> pointer to error code variable skipunits skip this many code units at start (for brackets and OP_COND) firstcuptr place to put the first required code unit firstcuflagsptr place to put the first code unit flags reqcuptr place to put the last required code unit reqcuflagsptr place to put the last required code unit flags bcptr pointer to the chain of currently open branches cb points to the data block with tables pointers etc. 
lengthptr NULL during the real compile phase points to length accumulator during pre-compile phase Returns: 0 There has been an error +1 Success, this group must match at least one character -1 Success, this group may match an empty string */ static int compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps, compile_block *cb, PCRE2_SIZE *lengthptr) { … } /************************************************* * Check for anchored pattern * *************************************************/ /* Try to find out if this is an anchored regular expression. Consider each alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then it's anchored. However, if this is a multiline pattern, then only OP_SOD will be found, because ^ generates OP_CIRCM in that mode. We can also consider a regex to be anchored if OP_SOM starts all its branches. This is the code for \G, which means "match at start of match position, taking into account the match offset". A branch is also implicitly anchored if it starts with .* and DOTALL is set, because that will try the rest of the pattern at all possible matching points, so there is no point trying again.... er .... .... except when the .* appears inside capturing parentheses, and there is a subsequent back reference to those parentheses. We haven't enough information to catch that case precisely. At first, the best we could do was to detect when .* was in capturing brackets and the highest back reference was greater than or equal to that level. However, by keeping a bitmap of the first 31 back references, we can catch some of the more common cases more precisely. ... 
A second exception is when the .* appears inside an atomic group, because this prevents the number of characters it matches from being adjusted. Arguments: code points to start of the compiled pattern bracket_map a bitmap of which brackets we are inside while testing; this handles up to substring 31; after that we just have to take the less precise approach cb points to the compile data block atomcount atomic group level inassert TRUE if in an assertion Returns: TRUE or FALSE */ static BOOL is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb, int atomcount, BOOL inassert) { … } /************************************************* * Check for starting with ^ or .* * *************************************************/ /* This is called to find out if every branch starts with ^ or .* so that "first char" processing can be done to speed things up in multiline matching and for non-DOTALL patterns that start with .* (which must start at the beginning or after \n). As in the case of is_anchored() (see above), we have to take account of back references to capturing brackets that contain .* because in that case we can't make the assumption. Also, the appearance of .* inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE or *SKIP does not count, because once again the assumption no longer holds. 
Arguments: code points to start of the compiled pattern or a group bracket_map a bitmap of which brackets we are inside while testing; this handles up to substring 31; after that we just have to take the less precise approach cb points to the compile data atomcount atomic group level inassert TRUE if in an assertion Returns: TRUE or FALSE */ static BOOL is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, int atomcount, BOOL inassert) { … } /************************************************* * Scan compiled regex for recursion reference * *************************************************/ /* This function scans through a compiled pattern until it finds an instance of OP_RECURSE. Arguments: code points to start of expression utf TRUE in UTF mode Returns: pointer to the opcode for OP_RECURSE, or NULL if not found */ static PCRE2_SPTR find_recurse(PCRE2_SPTR code, BOOL utf) { … } /************************************************* * Check for asserted fixed first code unit * *************************************************/ /* During compilation, the "first code unit" settings from forward assertions are discarded, because they can cause conflicts with actual literals that follow. However, if we end up without a first code unit setting for an unanchored pattern, it is worth scanning the regex to see if there is an initial asserted first code unit. If all branches start with the same asserted code unit, or with a non-conditional bracket all of whose alternatives start with the same asserted code unit (recurse ad lib), then we return that code unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with REQ_NONE in the flags. 
Arguments: code points to start of compiled pattern flags points to the first code unit flags inassert non-zero if in an assertion Returns: the fixed first code unit, or 0 with REQ_NONE in flags */ static uint32_t find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert) { … } /************************************************* * Add an entry to the name/number table * *************************************************/ /* This function is called between compiling passes to add an entry to the name/number table, maintaining alphabetical order. Checking for permitted and forbidden duplicates has already been done. Arguments: cb the compile data block name the name to add length the length of the name groupno the group number tablecount the count of names in the table so far Returns: nothing */ static void add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, unsigned int groupno, uint32_t tablecount) { … } /************************************************* * Skip in parsed pattern * *************************************************/ /* This function is called to skip parts of the parsed pattern when finding the length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find the end of the branch, it is called to skip over an internal lookaround or (DEFINE) group, and it is also called to skip to the end of a class, during which it will never encounter nested groups (but there's no need to have special code for that). When called to find the end of a branch or group, pptr must point to the first meta code inside the branch, not the branch-starting code. In other cases it can point to the item that causes the function to be called. 
Arguments: pptr current pointer to skip from skiptype PSKIP_CLASS when skipping to end of class PSKIP_ALT when META_ALT ends the skip PSKIP_KET when only META_KET ends the skip Returns: new value of pptr NULL if META_END is reached - should never occur or for an unknown meta value - likewise */ static uint32_t * parsed_skip(uint32_t *pptr, uint32_t skiptype) { … } /************************************************* * Find length of a parsed group * *************************************************/ /* This is called for nested groups within a branch of a lookbehind whose length is being computed. On entry, the pointer must be at the first element after the group initializing code. On exit it points to OP_KET. Caching is used to improve processing speed when the same capturing group occurs many times. Arguments: pptrptr pointer to pointer in the parsed pattern minptr where to return the minimum length isinline FALSE if a reference or recursion; TRUE for inline group errcodeptr pointer to the errorcode lcptr pointer to the loop counter group number of captured group or -1 for a non-capturing group recurses chain of recurse_check to catch mutual recursion cb pointer to the compile data Returns: the maximum group length or a negative number */ static int get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb) { … } /************************************************* * Find length of a parsed branch * *************************************************/ /* Return fixed maximum and minimum lengths for a branch in a lookbehind, giving an error if the length is not limited. On entry, *pptrptr points to the first element inside the branch. On exit it is set to point to the ALT or KET. 
Arguments: pptrptr pointer to pointer in the parsed pattern minptr where to return the minimum length errcodeptr pointer to error code lcptr pointer to loop counter recurses chain of recurse_check to catch mutual recursion cb pointer to compile block Returns: the maximum length, or a negative value on error */ static int get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr, parsed_recurse_check *recurses, compile_block *cb) { … } /************************************************* * Set lengths in a lookbehind * *************************************************/ /* This function is called for each lookbehind, to set the lengths in its branches. An error occurs if any branch does not have a limited maximum length that is less than the limit (65535). On exit, the pointer must be left on the final ket. The function also maintains the max_lookbehind value. Any lookbehind branch that contains a nested lookbehind may actually look further back than the length of the branch. The additional amount is passed back from get_branchlength() as an "extra" value. Arguments: pptrptr pointer to pointer in the parsed pattern errcodeptr pointer to error code lcptr pointer to loop counter recurses chain of recurse_check to catch mutual recursion cb pointer to compile block Returns: TRUE if all is well FALSE otherwise, with error code and offset set */ static BOOL set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, parsed_recurse_check *recurses, compile_block *cb) { … } /************************************************* * Check parsed pattern lookbehinds * *************************************************/ /* This function is called at the end of parsing a pattern if any lookbehinds were encountered. It scans the parsed pattern for them, calling set_lookbehind_lengths() for each one. At the start, the errorcode is zero and the error offset is marked unset. This enables the functions above not to override settings from deeper nestings. 
This function is called recursively from get_branchlength() for lookaheads in order to process any lookbehinds that they may contain. It stops when it hits a non-nested closing parenthesis in this case, returning a pointer to it. Arguments: pptr points to where to start (start of pattern or start of lookahead) retptr if not NULL, return the ket pointer here recurses chain of recurse_check to catch mutual recursion cb points to the compile block lcptr points to loop counter Returns: 0 on success, or an errorcode (cb->erroroffset will be set) */ static int check_lookbehinds(uint32_t *pptr, uint32_t **retptr, parsed_recurse_check *recurses, compile_block *cb, int *lcptr) { … } /************************************************* * External function to compile a pattern * *************************************************/ /* This function reads a regular expression in the form of a string and returns a pointer to a block of store holding a compiled version of the expression. Arguments: pattern the regular expression patlen the length of the pattern, or PCRE2_ZERO_TERMINATED options option bits errorptr pointer to errorcode erroroffset pointer to error offset ccontext points to a compile context or is NULL Returns: pointer to compiled data block, or NULL on error, with errorcode and erroroffset set */ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { … } /* These #undefs are here to enable unity builds with CMake. */ #undef NLBLOCK /* Block containing newline information */ #undef PSSTART /* Field containing processed string start */ #undef PSEND /* Field containing processed string end */ /* End of pcre2_compile.c */