pcre2_compile.c | Explore in Territory

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2023 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#define NLBLOCK …
#define PSSTART …
#define PSEND …

#include "pcre2_internal.h"

/* In rare error cases debugging might require calling pcre2_printint(). */

#if 0
#ifdef EBCDIC
#define PRINTABLE …
#else
#define PRINTABLE …
#endif
#include "pcre2_printint.c"
#define DEBUG_CALL_PRINTINT
#endif

/* Other debugging code can be enabled by these defines. */

/* #define DEBUG_SHOW_CAPTURES */
/* #define DEBUG_SHOW_PARSED */

/* There are a few things that vary with different code unit sizes. Handle them
by defining macros in order to minimize #if usage. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR …
#define XDIGIT …

#else  /* Either 16-bit or 32-bit */
#define XDIGIT(c) …

#if PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR …

#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR …
#endif
#endif

/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
them will be able to (i.e. assume a 64-bit world). */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET …
#define GETOFFSET …
#define GETPLUSOFFSET …
#define READPLUSOFFSET …
#define SKIPOFFSET …
#define SIZEOFFSET …
#else
#define PUTOFFSET(s,p) …
#define GETOFFSET(s,p) …
#define GETPLUSOFFSET(s,p) …
#define READPLUSOFFSET(s,p) …
#define SKIPOFFSET(p) …
#define SIZEOFFSET …
#endif

/* Macros for manipulating elements of the parsed pattern vector. */

#define META_CODE(x) …
#define META_DATA(x) …
#define META_DIFF(x,y) …

/* Forward declarations of functions, to allow mutual recursion */

#ifdef SUPPORT_UNICODE
static unsigned int
  add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
    compile_block *, const uint32_t *, unsigned int);
#endif

static int
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
    open_capitem *, compile_block *, PCRE2_SIZE *);

static int
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
    compile_block *);

static BOOL
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
    compile_block *);

static int
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
    compile_block *, int *);


/*************************************************
*      Code parameters and static tables         *
*************************************************/

#define MAX_GROUP_NUMBER …
#define MAX_REPEAT_COUNT …
#define REPEAT_UNLIMITED …

/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
different ways in the different pattern scans. The parsing and group-
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
aligned for this. Having defined the size in code units, we set up
C16_WORK_SIZE as the number of elements in the 16-bit vector.

During the first compiling phase, when determining how much memory is required,
the regex is partly compiled into this space, but the compiled parts are
discarded as soon as they can be, so that hopefully there will never be an
overrun. The code does, however, check for an overrun, which can occur for
pathological patterns. The size of the workspace depends on LINK_SIZE because
the length of compiled items varies with this.

In the real compile phase, this workspace is not currently used. */

#define COMPILE_WORK_SIZE …

#define C16_WORK_SIZE …

/* A uint32_t vector is used for caching information about the size of
capturing groups, to improve performance. A default is created on the stack of
this size. */

#define GROUPINFO_DEFAULT_SIZE …

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN …

/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded, in a similar way to the workspace. The
value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE …

/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
of uint32_t. For short patterns this lives on the stack, with this size. Heap
memory is used for longer patterns. */

#define PARSED_PATTERN_DEFAULT_SIZE …

/* Maximum length value to check against when making sure that the variable
that holds the compiled pattern length does not overflow. We make it a bit less
than INT_MAX to allow for adding in group terminating code units, so that we
don't have to check them every time. */

#define OFLOW_MAX …

/* Code values for parsed patterns, which are stored in a vector of 32-bit
unsigned ints. Values less than META_END are literal data values. The coding
for identifying the item is in the top 16-bits, leaving 16 bits for the
additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
macros are used to manipulate parsed pattern elements.

NOTE: When these definitions are changed, the table of extra lengths for each
code (meta_extra_lengths, just below) must be updated to remain in step. */

#define META_END …

#define META_ALT …
#define META_ATOMIC …
#define META_BACKREF …
#define META_BACKREF_BYNAME …
#define META_BIGVALUE …
#define META_CALLOUT_NUMBER …
#define META_CALLOUT_STRING …
#define META_CAPTURE …
#define META_CIRCUMFLEX …
#define META_CLASS …
#define META_CLASS_EMPTY …
#define META_CLASS_EMPTY_NOT …
#define META_CLASS_END …
#define META_CLASS_NOT …
#define META_COND_ASSERT …
#define META_COND_DEFINE …
#define META_COND_NAME …
#define META_COND_NUMBER …
#define META_COND_RNAME …
#define META_COND_RNUMBER …
#define META_COND_VERSION …
#define META_DOLLAR …
#define META_DOT …
#define META_ESCAPE …
#define META_KET …
#define META_NOCAPTURE …
#define META_OPTIONS …
#define META_POSIX …
#define META_POSIX_NEG …
#define META_RANGE_ESCAPED …
#define META_RANGE_LITERAL …
#define META_RECURSE …
#define META_RECURSE_BYNAME …
#define META_SCRIPT_RUN …

/* These must be kept together to make it easy to check that an assertion
is present where expected in a conditional group. */

#define META_LOOKAHEAD …
#define META_LOOKAHEADNOT …
#define META_LOOKBEHIND …
#define META_LOOKBEHINDNOT …

/* These cannot be conditions */

#define META_LOOKAHEAD_NA …
#define META_LOOKBEHIND_NA …

/* These must be kept in this order, with consecutive values, and the _ARG
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
versions. */

#define META_MARK …
#define META_ACCEPT …
#define META_FAIL …
#define META_COMMIT …
#define META_COMMIT_ARG …
#define META_PRUNE …
#define META_PRUNE_ARG …
#define META_SKIP …
#define META_SKIP_ARG …
#define META_THEN …
#define META_THEN_ARG …

/* These must be kept in groups of adjacent 3 values, and all together. */

#define META_ASTERISK …
#define META_ASTERISK_PLUS …
#define META_ASTERISK_QUERY …
#define META_PLUS …
#define META_PLUS_PLUS …
#define META_PLUS_QUERY …
#define META_QUERY …
#define META_QUERY_PLUS …
#define META_QUERY_QUERY …
#define META_MINMAX …
#define META_MINMAX_PLUS …
#define META_MINMAX_QUERY …

#define META_FIRST_QUANTIFIER …
#define META_LAST_QUANTIFIER …

/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
therefore no need for it to have a length entry, so use a high value. */

#define META_ATOMIC_SCRIPT_RUN …

/* Table of extra lengths for each of the meta codes. Must be kept in step with
the definitions above. For some items these values are a basic length to which
a variable amount has to be added. */

static unsigned char meta_extra_lengths[] = …;

/* Types for skipping parts of a parsed pattern. */

enum { … };

/* Macro for setting individual bits in class bitmaps. It took some
experimenting to figure out how to stop gcc 5.3.0 from warning with
-Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions. */

#define SETBIT(a,b) …

/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
variables, which are concerned with first and required code units. A value
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
matching xxcu variable is set, and the low valued bits are relevant. */

#define REQ_UNSET …
#define REQ_NONE …
#define REQ_CASELESS …
#define REQ_VARY …

/* These flags are used in the groupinfo vector. */

#define GI_SET_FIXED_LENGTH …
#define GI_NOT_FIXED_LENGTH …
#define GI_FIXED_LENGTH_MASK …

/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
comparison). */

#define IS_DIGIT(x) …

/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

#ifndef EBCDIC
static const uint8_t xdigitab[] = …;/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

/* EBCDIC hex-digit lookup, laid out as sixteen entries per row so that each
row corresponds to one value of the high nibble of the code point. An entry is
the numeric value of the hex digit, or 0xFF when the code point is not a hex
digit. Only three rows contain digits: 0x8n (a-f), 0xCn (A-F), 0xFn (0-9). */

static const uint8_t xdigitab[] =
  {
  /* 0x00 - 0x0F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x10 - 0x1F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x20 - 0x2F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x30 - 0x3F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x40 - 0x4F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x50 - 0x5F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x60 - 0x6F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x70 - 0x7F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x80 - 0x8F: lower case a-f occupy 0x81-0x86 */
  0xFF,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0x90 - 0x9F */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xA0 - 0xAF */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xB0 - 0xBF */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xC0 - 0xCF: upper case A-F occupy 0xC1-0xC6 */
  0xFF,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xD0 - 0xDF */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xE0 - 0xEF */
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  /* 0xF0 - 0xFF: decimal digits 0-9 occupy 0xF0-0xF9 */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
  };
#endif  /* EBCDIC */


/* Table for handling alphanumeric escaped characters. Positive returns are
simple data values; negative values are for special things like \d and so on.
Zero means further processing is needed (for things like \x), or the escape is
invalid. */

/* This is the "normal" table for ASCII systems or for EBCDIC systems running
in UTF-8 mode. It runs from '0' to 'z'. */

#ifndef EBCDIC
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE(c) …

static const short int escapes[] = …;

#else

/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
because it is defined as 'a', which of course picks up the ASCII value. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE …
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST …
#define ESCAPES_LAST …
#define UPPER_CASE …
#endif

/* Layout note: each row below covers eight consecutive code points, starting
at the hex value given in its leading comment. The first row has only seven
entries because the table begins at 'a' (0x81) rather than 0x80, and the final
row stops after two entries, at '9' (0xF9). Entries written as -ESC_xxx are the
special escapes, other non-zero entries are literal data values, and zero means
further processing is needed or the escape is invalid. Because the table is
purely positional, any insertion or deletion shifts every later entry. */

static const short int escapes[] = {
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  F8 */      0,        0
};

/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. */

static unsigned char ebcdic_escape_c[] =
  "@ABCDEFGHIJKLMNO"       /* first sixteen characters */
  "PQRSTUVWXYZ[\\]^_";     /* second sixteen, then the terminating NUL */

#endif   /* EBCDIC */


/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */

verbitem;

static const char verbnames[] = …;

static const verbitem verbs[] = …;

static const int verbcount = …;

/* Verb opcodes, indexed by their META code offset from META_MARK. */

static const uint32_t verbops[] = …;

/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */

alasitem;

static const char alasnames[] = …;

static const alasitem alasmeta[] = …;

static const int alascount = …;

/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */

static uint32_t chartypeoffset[] = …;

/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
length entry. The first three must be alpha, lower, upper, as this is assumed
for handling case independence. The indices for several classes are needed, so
identify them. */

static const char posix_names[] = …;

static const uint8_t posix_name_lengths[] = …;

#define PC_DIGIT …
#define PC_GRAPH …
#define PC_PRINT …
#define PC_PUNCT …
#define PC_XDIGIT …

/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore. */

static const int posix_class_maps[] = …;

#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. */

static int posix_substitutes[] = …;
#define POSIX_SUBSIZE …
#endif  /* SUPPORT_UNICODE */

/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
are allowed. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS …

#define PUBLIC_COMPILE_OPTIONS …

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS …

#define PUBLIC_COMPILE_EXTRA_OPTIONS …

/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values. */

enum { … };

/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported. */

enum { … };

pso;

/* NB: STRING_UTFn_RIGHTPAR contains the length as well */

static const pso pso_list[] = …;

/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that. */

static const uint8_t opcode_possessify[] = …;


#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. */

/* Write a human-readable dump of the parsed pattern vector to stderr, one item
per line. Each line starts with "+++", the item's index in the vector and its
raw 32-bit value, followed by a description of the item and any extra data
words that belong to it. The scan stops after META_END has been shown, or as
soon as an unrecognized meta code is seen (the amount of extra data to skip is
then unknown, so continuing would be meaningless).

All uint32_t values are printed with unsigned conversions and PCRE2_SIZE
offsets with %zu (PCRE2_SIZE is taken to be size_t, as in pcre2.h), so that the
conversion specifications match the argument types.

Argument:
  cb          the compile block; only cb->parsed_pattern and
              cb->small_ref_offset are read

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show the printable ones. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* Otherwise it is a meta item. From here on pptr points at the first extra
  data word (if any) of the item; each case must step over exactly the extra
  words that belong to its item. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block
    and no extra data is consumed; otherwise it follows in the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      {
      offset = cb->small_ref_offset[meta_arg];
      }
    else
      {
      GETOFFSET(offset, pptr);
      }
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      /* One extra word: property type in the top 16 bits, value below. */
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. For the others, search the table for the character whose
      entry is the negated escape code; show '?' if it is not found. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three {min,max} forms each carry two extra words. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items show their first extra word and step over two. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    /* Three extra words: offset and length of the next pattern item, then the
    callout number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    /* Items that refer to a name carry a length word followed by an offset. */

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first and the group number after it, so the number
    is fetched by indexing past the offset before the offset is consumed. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three extra words: comparison type (0 is "=", otherwise ">="), then the
    two parts of the version number. Separate calls keep the reads of pptr in
    a defined order. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name (a length and an offset), because it might turn
    out to be one. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Verbs with an argument share the code that prints the argument: a length
    word followed by that many character values. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }

/* Control never gets here: the loop is left only by the returns above. */
}
#endif  /* DEBUG_SHOW_PARSED */



/*************************************************
*               Copy compiled code               *
*************************************************/

/* Compiled JIT code cannot be copied, so the new compiled block has no
associated JIT data. */

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{ … }



/*************************************************
*     Copy compiled code and character tables    *
*************************************************/

/* Compiled JIT code cannot be copied, so the new compiled block has no
associated JIT data. This version of code_copy also makes a separate copy of
the character tables. */

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{ … }



/*************************************************
*               Free compiled code               *
*************************************************/

PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code *code)
{ … }



/*************************************************
*         Read a number, possibly signed         *
*************************************************/

/* This function is used to read numbers in the pattern. The initial pointer
must be at the sign or first digit of the number. When relative values
(introduced by + or -) are allowed, they are relative group numbers, and the
result must be greater than zero.

Arguments:
  ptrptr        points to the character pointer variable
  ptrend        points to the end of the input string
  allow_sign    if < 0, sign not allowed; if >= 0, sign is relative to this
  max_value     the largest number allowed
  max_error     the error to give for an over-large number
  intptr        where to put the result
  errorcodeptr  where to put an error code

Returns:        TRUE  - a number was read
                FALSE - *errorcodeptr == 0 => no number was found
                        *errorcodeptr != 0 => an error occurred
*/

static BOOL
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
{ … }



/*************************************************
*         Read repeat counts                     *
*************************************************/

/* Read an item of the form {n,m} and return the values when non-NULL pointers
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
larger value is used for "unlimited". We have to use signed arguments for
read_number() because it is capable of returning a signed value. As of Perl
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
tabs after { and before } and between the numbers and the comma, so we do too.

Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max
  errorcodeptr   points to error code variable

Returns:         FALSE if not a repeat quantifier, errorcode set zero
                 FALSE on error, with errorcode set non-zero
                 TRUE on success, with pointer updated to point after '}'
*/

static BOOL
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
  uint32_t *maxp, int *errorcodeptr)
{ … }



/*************************************************
*            Handle escapes                      *
*************************************************/

/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \d, or 0 for a data character, which
is placed in chptr. A backreference to group n is returned as negative n. On
entry, ptr is pointing at the character after \. On exit, it points after the
final code unit of the escape sequence.

This function is also called from pcre2_substitute() to handle escape sequences
in replacement strings. In this case, the cb argument is NULL, and in the case
of escapes that have further processing, only sequences that define a data
character are recognised. The isclass argument is not relevant; the options
argument is the final value of the compiled pattern's options.

Arguments:
  ptrptr         points to the input position pointer
  ptrend         points to the end of the input
  chptr          points to a returned data character
  errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  xoptions       the current extra options bits
  isclass        TRUE if inside a character class
  cb             compile data block or NULL when called from pcre2_substitute()

Returns:         zero => a data character
                 positive => a special escape sequence
                 negative => a numerical back reference
                 on error, errorcodeptr is set non-zero
*/

int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
  int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
  compile_block *cb)
{ … }



#ifdef SUPPORT_UNICODE
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
after the final code unit of the escape sequence.

Arguments:
  ptrptr         the pattern position pointer
  negptr         a boolean that is set TRUE for negation else FALSE
  ptypeptr       an unsigned int that is set to the type value
  pdataptr       an unsigned int that is set to the detailed property value
  errorcodeptr   the error code variable
  cb             the compile data

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
{ … }
#endif



/*************************************************
*           Check for POSIX class syntax         *
*************************************************/

/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by a
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
reach an unescaped ']' without the special preceding character, return FALSE.

Originally, this function only recognized a sequence of letters between the
terminators, but it seems that Perl recognizes any sequence of characters,
though of course unknown POSIX names are subsequently rejected. Perl gives an
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
didn't consider this to be a POSIX class. Likewise for [:1234:].

The problem in trying to be exactly like Perl is in the handling of escapes. We
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
below handles the special cases \\ and \], but does not try to do any other
escape processing. This makes it different from Perl for cases such as
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
when Perl does, I think.

A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
a digit. This is handled by returning FALSE if the start of a new group with
the same terminator is encountered, since the next closing sequence must close
the nested group, not the outer one.

In Perl, unescaped square brackets may also appear as part of class names. For
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
seem right at all. PCRE does not allow closing square brackets in POSIX class
names.

Arguments:
  ptr      pointer to the character after the initial [ (colon, dot, equals)
  ptrend   pointer to the end of the pattern
  endptr   where to return a pointer to the terminating ':', '.', or '='

Returns:   TRUE or FALSE
*/

static BOOL
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
{ … }



/*************************************************
*          Check POSIX class name                *
*************************************************/

/* This function is called to check the name given in a POSIX-style class entry
such as [:alnum:].

Arguments:
  ptr        points to the first letter
  len        the length of the name

Returns:     a value representing the name, or -1 if unknown
*/

static int
check_posix_name(PCRE2_SPTR ptr, int len)
{ … }



/*************************************************
*       Read a subpattern or VERB name           *
*************************************************/

/* This function is called from parse_regex() below whenever it needs to read
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
pointer must be to the preceding character. If that character is '*' we are
reading a verb or alpha assertion name. The pointer is updated to point after
the name, for a VERB or alpha assertion name, or after the name's terminator
for a subpattern name. Returning both the offset and the name pointer is
redundant information, but some callers use one and some the other, so it is
simplest just to return both. When the name is in braces, spaces and tabs are
allowed (and ignored) at either end.

Arguments:
  ptrptr        points to the character pointer variable
  ptrend        points to the end of the input string
  utf           true if the input is UTF-encoded
  terminator    the terminator of a subpattern name must be this
  offsetptr     where to put the offset from the start of the pattern
  nameptr       where to put a pointer to the name in the input
  namelenptr    where to put the length of the name
  errorcodeptr  where to put an error code
  cb            pointer to the compile data block

Returns:    TRUE if a name was read
            FALSE otherwise, with error code set
*/

static BOOL
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
  int *errorcodeptr, compile_block *cb)
{ … }



/*************************************************
*          Manage callouts at start of cycle     *
*************************************************/

/* At the start of a new item in parse_regex() we are able to record the
details of the previous item in a prior callout, and also to set up an
automatic callout if enabled. Avoid having two adjacent automatic callouts,
which would otherwise happen for items such as \Q that contribute nothing to
the parsed pattern.

Arguments:
  ptr              current pattern pointer
  pcalloutptr      points to a pointer to previous callout, or NULL
  auto_callout     TRUE if auto_callouts are enabled
  parsed_pattern   the parsed pattern pointer
  cb               compile block

Returns: possibly updated parsed_pattern pointer.
*/

static uint32_t *
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
  uint32_t *parsed_pattern, compile_block *cb)
{ … }



/*************************************************
*          Handle \d, \D, \s, \S, \w, \W         *
*************************************************/

/* This function is called from parse_regex() below, both for freestanding
escapes, and those within classes, to handle those escapes that may change when
Unicode property support is requested. Note that PCRE2_UCP will never be set
without Unicode support because that is checked when pcre2_compile() is called.

Arguments:
  escape          the ESC_... value
  parsed_pattern  where to add the code
  options         options bits
  xoptions        extra options bits

Returns:          updated value of parsed_pattern
*/
static uint32_t *
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
  uint32_t xoptions)
{ … }



/*************************************************
*      Parse regex and identify named groups     *
*************************************************/

/* This function is called first of all. It scans the pattern and does two
things: (1) It identifies capturing groups and makes a table of named capturing
groups so that information about them is fully available to both the compiling
scans. (2) It writes a parsed version of the pattern with comments omitted and
escapes processed into the parsed_pattern vector.

Arguments:
  ptr             points to the start of the pattern
  options         compiling dynamic options (may change during the scan)
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
  cb              pointer to the compile data block

Returns:   zero on success or a non-zero error code, with the
             error offset placed in the cb field
*/

/* A structure and some flags for dealing with nested groups. */

nest_save;

#define NSF_RESET …
#define NSF_CONDASSERT …
#define NSF_ATOMICSR …

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS …

#define PARSE_TRACKED_EXTRA_OPTIONS …

/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum { … };

/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL …
#else
#define PARSED_LITERAL(c, p) …
#endif

/* Here's the actual function. */

static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
  compile_block *cb)
{ … }



/*************************************************
*       Find first significant opcode            *
*************************************************/

/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring opcode etc. It skips over things
that do not influence this. For some calls, it makes sense to skip negative
forward and all backward assertions, and also the \b assertion; for others it
does not.

Arguments:
  code         pointer to the start of the group
  skipassert   TRUE if certain assertions are to be skipped

Returns:       pointer to the first significant opcode
*/

static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
{ … }



#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range in UCP mode. For
single characters the range may be just one character long. The function
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range
  restricted  TRUE if caseless restriction applies

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases;
                 for this return, *ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr, BOOL restricted)
{ … }
#endif  /* SUPPORT_UNICODE */



/*************************************************
* Add a character or range to a class (internal) *
*************************************************/

/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
called only from within the "add to class" group of functions, some of which
are recursive and mutually recursive. The external entry point is
add_to_class().

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            compile data
  start         start of range character
  end           end of range character

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
  uint32_t end)
{ … }



#ifdef SUPPORT_UNICODE
/*************************************************
* Add a list of characters to a class (internal) *
*************************************************/

/* This function is used for adding a list of case-equivalent characters to a
class when in UTF mode. This function is called only from within
add_to_class_internal(), with which it is mutually recursive.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
  unsigned int except)
{ … }
#endif



/*************************************************
*   External entry point for add range to class  *
*************************************************/

/* This function sets the overall range so that the internal functions can try
to avoid duplication when handling case-independence.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            compile data
  start         start of range character
  end           end of range character

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
{ … }


/*************************************************
*   External entry point for add list to class   *
*************************************************/

/* This function is used for adding a list of horizontal or vertical whitespace
characters to a class. The list must be in order so that ranges of characters
can be detected and handled appropriately. This function sets the overall range
so that the internal functions can try to avoid duplication when handling
case-independence.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
{ … }



/*************************************************
*    Add characters not in a list to a class     *
*************************************************/

/* This function is used for adding the complement of a list of horizontal or
vertical whitespace to a class. The list must be in order.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
{ … }



/*************************************************
*    Find details of duplicate group names       *
*************************************************/

/* This is called from compile_branch() when it needs to know the index and
count of duplicates in the names table when processing named backreferences,
either directly, or as conditions.

Arguments:
  name          points to the name
  length        the length of the name
  indexptr      where to put the index
  countptr      where to put the count of duplicates
  errorcodeptr  where to put an error code
  cb            the compile block

Returns:        TRUE if OK, FALSE if not, error code set
*/

static BOOL
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
  int *countptr, int *errorcodeptr, compile_block *cb)
{ … }



/*************************************************
*           Compile one branch                   *
*************************************************/

/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
the options are changed during the branch, the pointer is used to change the
external options bits. This function is used during the pre-compile phase when
we are trying to find out the amount of memory needed, as well as during the
real compile phase. The value of lengthptr distinguishes the two phases.

Arguments:
  optionsptr        pointer to the option bits
  xoptionsptr       pointer to the extra option bits
  codeptr           points to the pointer to the current code point
  pptrptr           points to the current parsed pattern pointer
  errorcodeptr      points to error code variable
  firstcuptr        place to put the first required code unit
  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
  reqcuflagsptr     place to put the last required code unit flags
  bcptr             points to current branch chain
  open_caps         points to current capitem
  cb                contains pointers to tables etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            0 There's been an error, *errorcodeptr is non-zero
                   +1 Success, this branch must match at least one character
                   -1 Success, this branch may match an empty string
*/

static int
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
  compile_block *cb, PCRE2_SIZE *lengthptr)
{ … }



/*************************************************
*   Compile regex: a sequence of alternatives    *
*************************************************/

/* On entry, pptr is pointing past the bracket meta, but on return it points to
the closing bracket or META_END. The code variable is pointing at the code unit
into which the BRA operator has been stored. This function is used during the
pre-compile phase when we are trying to find out the amount of memory needed,
as well as during the real compile phase. The value of lengthptr distinguishes
the two phases.

Arguments:
  options           option bits, including any changes for this subpattern
  xoptions          extra option bits, ditto
  codeptr           -> the address of the current code pointer
  pptrptr           -> the address of the current parsed pattern pointer
  errorcodeptr      -> pointer to error code variable
  skipunits         skip this many code units at start (for brackets and OP_COND)
  firstcuptr        place to put the first required code unit
  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
  reqcuflagsptr     place to put the last required code unit flags
  bcptr             pointer to the chain of currently open branches
  open_caps         points to current capitem
  cb                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            0 There has been an error
                   +1 Success, this group must match at least one character
                   -1 Success, this group may match an empty string
*/

static int
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
  compile_block *cb, PCRE2_SIZE *lengthptr)
{ … }



/*************************************************
*          Check for anchored pattern            *
*************************************************/

/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
be found, because ^ generates OP_CIRCM in that mode.

We can also consider a regex to be anchored if OP_SOM starts all its branches.
This is the code for \G, which means "match at start of match position, taking
into account the match offset".

A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying again.... er ....

.... except when the .* appears inside capturing parentheses, and there is a
subsequent back reference to those parentheses. We haven't enough information
to catch that case precisely.

At first, the best we could do was to detect when .* was in capturing brackets
and the highest back reference was greater than or equal to that level.
However, by keeping a bitmap of the first 31 back references, we can catch some
of the more common cases more precisely.

... A second exception is when the .* appears inside an atomic group, because
this prevents the number of characters it matches from being adjusted.

Arguments:
  code           points to start of the compiled pattern
  bracket_map    a bitmap of which brackets we are inside while testing; this
                   handles up to substring 31; after that we just have to take
                   the less precise approach
  cb             points to the compile data block
  atomcount      atomic group level
  inassert       TRUE if in an assertion

Returns:     TRUE or FALSE
*/

static BOOL
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
  int atomcount, BOOL inassert)
{ … }



/*************************************************
*         Check for starting with ^ or .*        *
*************************************************/

/* This is called to find out if every branch starts with ^ or .* so that
"first char" processing can be done to speed things up in multiline
matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
or *SKIP does not count, because once again the assumption no longer holds.

Arguments:
  code           points to start of the compiled pattern or a group
  bracket_map    a bitmap of which brackets we are inside while testing; this
                   handles up to substring 31; after that we just have to take
                   the less precise approach
  cb             points to the compile data
  atomcount      atomic group level
  inassert       TRUE if in an assertion

Returns:         TRUE or FALSE
*/

static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
  int atomcount, BOOL inassert)
{ … }



/*************************************************
*   Scan compiled regex for recursion reference  *
*************************************************/

/* This function scans through a compiled pattern until it finds an instance of
OP_RECURSE.

Arguments:
  code        points to start of expression
  utf         TRUE in UTF mode

Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
*/

static PCRE2_SPTR
find_recurse(PCRE2_SPTR code, BOOL utf)
{ … }



/*************************************************
*    Check for asserted fixed first code unit    *
*************************************************/

/* During compilation, the "first code unit" settings from forward assertions
are discarded, because they can cause conflicts with actual literals that
follow. However, if we end up without a first code unit setting for an
unanchored pattern, it is worth scanning the regex to see if there is an
initial asserted first code unit. If all branches start with the same asserted
code unit, or with a non-conditional bracket all of whose alternatives start
with the same asserted code unit (recurse ad lib), then we return that code
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
REQ_NONE in the flags.

Arguments:
  code       points to start of compiled pattern
  flags      points to the first code unit flags
  inassert   non-zero if in an assertion

Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
*/

static uint32_t
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
{ … }



/*************************************************
*     Add an entry to the name/number table      *
*************************************************/

/* This function is called between compiling passes to add an entry to the
name/number table, maintaining alphabetical order. Checking for permitted
and forbidden duplicates has already been done.

Arguments:
  cb           the compile data block
  name         the name to add
  length       the length of the name
  groupno      the group number
  tablecount   the count of names in the table so far

Returns:       nothing
*/

static void
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
  unsigned int groupno, uint32_t tablecount)
{ … }



/*************************************************
*             Skip in parsed pattern             *
*************************************************/

/* This function is called to skip parts of the parsed pattern when finding the
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
the end of the branch, it is called to skip over an internal lookaround or
(DEFINE) group, and it is also called to skip to the end of a class, during
which it will never encounter nested groups (but there's no need to have
special code for that).

When called to find the end of a branch or group, pptr must point to the first
meta code inside the branch, not the branch-starting code. In other cases it
can point to the item that causes the function to be called.

Arguments:
  pptr       current pointer to skip from
  skiptype   PSKIP_CLASS when skipping to end of class
             PSKIP_ALT when META_ALT ends the skip
             PSKIP_KET when only META_KET ends the skip

Returns:     new value of pptr
             NULL if META_END is reached - should never occur
               or for an unknown meta value - likewise
*/

static uint32_t *
parsed_skip(uint32_t *pptr, uint32_t skiptype)
{ … }



/*************************************************
*       Find length of a parsed group            *
*************************************************/

/* This is called for nested groups within a branch of a lookbehind whose
length is being computed. On entry, the pointer must be at the first element
after the group initializing code. On exit it points to OP_KET. Caching is used
to improve processing speed when the same capturing group occurs many times.

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  minptr      where to return the minimum length
  isinline    FALSE if a reference or recursion; TRUE for inline group
  errcodeptr  pointer to the errorcode
  lcptr       pointer to the loop counter
  group       number of captured group or -1 for a non-capturing group
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to the compile data

Returns:      the maximum group length or a negative number
*/

static int
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
{ … }



/*************************************************
*        Find length of a parsed branch          *
*************************************************/

/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
giving an error if the length is not limited. On entry, *pptrptr points to the
first element inside the branch. On exit it is set to point to the ALT or KET.

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  minptr      where to return the minimum length
  errcodeptr  pointer to error code
  lcptr       pointer to loop counter
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to compile block

Returns:      the maximum length, or a negative value on error
*/

static int
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
  parsed_recurse_check *recurses, compile_block *cb)
{ … }



/*************************************************
*        Set lengths in a lookbehind             *
*************************************************/

/* This function is called for each lookbehind, to set the lengths in its
branches. An error occurs if any branch does not have a limited maximum length
that is less than the limit (65535). On exit, the pointer must be left on the
final ket.

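The function also maintains the max_lookbehind value. Any lookbehind branch
that contains a nested lookbehind may actually look further back than the
length of the branch. The additional amount is passed back from
get_branchlength() as an "extra" value.
NOTE(review): get_branchlength() as declared in this file has no "extra"
parameter (it takes pptrptr, minptr, errcodeptr, lcptr, recurses, cb), so this
paragraph may be out of date - confirm against the function body.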

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  errcodeptr  pointer to error code
  lcptr       pointer to loop counter
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to compile block

Returns:      TRUE if all is well
              FALSE otherwise, with error code and offset set
*/

static BOOL
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
  parsed_recurse_check *recurses, compile_block *cb)
{ … }



/*************************************************
*         Check parsed pattern lookbehinds       *
*************************************************/

/* This function is called at the end of parsing a pattern if any lookbehinds
were encountered. It scans the parsed pattern for them, calling
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
the error offset is marked unset. This enables the functions above not to
override settings from deeper nestings.

This function is called recursively from get_branchlength() for lookaheads in
order to process any lookbehinds that they may contain. It stops when it hits a
non-nested closing parenthesis in this case, returning a pointer to it.

Arguments:
  pptr      points to where to start (start of pattern or start of lookahead)
  retptr    if not NULL, return the ket pointer here
  recurses  chain of recurse_check to catch mutual recursion
  cb        points to the compile block
  lcptr     points to loop counter

Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
*/

static int
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
{ … }



/*************************************************
*     External function to compile a pattern     *
*************************************************/

/* This function reads a regular expression in the form of a string and returns
a pointer to a block of store holding a compiled version of the expression.

Arguments:
  pattern       the regular expression
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
  options       option bits
  errorptr      pointer to errorcode
  erroroffset   pointer to error offset
  ccontext      points to a compile context or is NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorcode and erroroffset set
*/

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{ … }

/* These #undefs are here to enable unity builds with CMake. */

#undef NLBLOCK /* Block containing newline information */
#undef PSSTART /* Field containing processed string start */
#undef PSEND   /* Field containing processed string end */

/* End of pcre2_compile.c */
godot/thirdparty/pcre2/src/pcre2_compile.c