pcre2_intmodedep.h | Explore in Territory

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2023 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test in order
to have access to the hidden structures at all supported widths.

Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre_printint.c file). We undefine them here so that they can be re-defined for
multiple inclusions. Not all of these are used in pcre2test, but it's easier
just to undefine them all. */

#undef ACROSSCHAR
#undef BACKCHAR
#undef BYTES2CU
#undef CHMAX_255
#undef CU2BYTES
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef GET
#undef GET2
#undef GETCHAR
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef GETCHARTEST
#undef GET_EXTRALEN
#undef HAS_EXTRALEN
#undef IMM2_SIZE
#undef MAX_255
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
#undef PUTCHAR
#undef PUTINC
#undef TABLE_GET



/* -------------------------- MACROS ----------------------------- */

/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
(always stored in big-endian order in 8-bit mode) by default. These are used,
for example, to link from the start of a subpattern to its alternatives and its
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
to around 64K, which is big enough for almost everybody. However, I received a
request for an even bigger limit. For this reason, and also to make the code
easier to maintain, the storing and loading of offsets from the compiled code
unit string is now handled by the macros that are defined here.

The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
values of 3 or 4 are also supported. */

/* ------------------- 8-bit support  ------------------ */

#if PCRE2_CODE_UNIT_WIDTH == 8

#if LINK_SIZE == 2
#define PUT …
#define GET …
#define MAX_PATTERN_SIZE …

#elif LINK_SIZE == 3
#define PUT …
#define GET …
#define MAX_PATTERN_SIZE …

#elif LINK_SIZE == 4
#define PUT …
#define GET …
#define MAX_PATTERN_SIZE …

#else
#error LINK_SIZE must be 2, 3, or 4
#endif


/* ------------------- 16-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 16

#if LINK_SIZE == 2
#undef LINK_SIZE
#define LINK_SIZE …
#define PUT(a,n,d) …
#define GET(a,n) …
#define MAX_PATTERN_SIZE …

#elif LINK_SIZE == 3 || LINK_SIZE == 4
#undef LINK_SIZE
#define LINK_SIZE …
#define PUT …
#define GET …
#define MAX_PATTERN_SIZE …

#else
#error LINK_SIZE must be 2, 3, or 4
#endif


/* ------------------- 32-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 32
#undef LINK_SIZE
#define LINK_SIZE …
#define PUT …
#define GET …
#define MAX_PATTERN_SIZE …

#else
#error Unsupported compiling mode
#endif


/* --------------- Other mode-specific macros ----------------- */

/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. There are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.

Define the number of code units required to hold a 16-bit count/offset, and
macros to load and store such a value. For reasons that I do not understand,
the expression in the 8-bit GET2 macro is treated by gcc as a signed
expression, even when a is declared as unsigned. It seems that any kind of
arithmetic results in a signed value. Hence the cast. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE …
#define GET2 …
#define PUT2 …

#else  /* Code units are 16 or 32 bits */
#define IMM2_SIZE …
#define GET2(a,n) …
#define PUT2(a,n,d) …
#endif

/* Other macros that are different for 8-bit mode. The MAX_255 macro checks
whether its argument, which is assumed to be one code unit, is less than 256.
The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
name must fit in one code unit; currently it is set to 255 or 65535. The
TABLE_GET macro is used to access elements of tables containing exactly 256
items. Its argument is a code unit. When code points can be greater than 255, a
check is needed before accessing these tables. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAX_255 …
#define MAX_MARK …
#define TABLE_GET …
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#define CHMAX_255 …
#else
#define CHMAX_255 …
#endif  /* SUPPORT_UNICODE */

#else  /* Code units are 16 or 32 bits */
#define CHMAX_255(c) …
#define MAX_255(c) …
#define MAX_MARK …
#define SUPPORT_WIDE_CHARS
#define TABLE_GET(c, table, default) …
#endif


/* ----------------- Character-handling macros ----------------- */

/* There is a proposed future special "UTF-21" mode, in which only the lowest
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
high-order bits available to the application for other uses. In preparation for
the future implementation of this mode, there are macros that load a data item
and, if in this special mode, mask it to 21 bits. These macros all have names
starting with UCHAR21. In all other modes, including the normal 32-bit
library, the macros all have the same simple definitions. When the new mode is
implemented, it is expected that these definitions will be varied appropriately
using #ifdef when compiling the library that supports the special mode. */

#define UCHAR21(eptr) …
#define UCHAR21TEST(eptr) …
#define UCHAR21INC(eptr) …
#define UCHAR21INCTEST(eptr) …

/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them. */

#ifndef SUPPORT_UNICODE

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR …
#define GETCHARTEST …
#define GETCHARINC …
#define GETCHARINCTEST …
#define GETCHARLEN …
#define PUTCHAR …
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARCCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */

#else   /* SUPPORT_UNICODE */

/* ------------------- 8-bit support  ------------------ */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAYBE_UTF_MULTI …

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU …

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN …

/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN …

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */

#define NOT_FIRSTCU …

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR …

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST …

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC …

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST …

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN …

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST …

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */

#define BACKCHAR …

/* Same as above, just in the other direction. */
#define FORWARDCHAR …
#define FORWARDCHARTEST …

/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR …

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR …


/* ------------------- 16-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define MAYBE_UTF_MULTI …

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU …

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) …

/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN(c) …

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */

#define NOT_FIRSTCU(c) …

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */

#define GETUTF16(c, eptr) …

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) …

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) …

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */

#define GETUTF16INC(c, eptr) …

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */

#define GETCHARINC(c, eptr) …

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) …

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) …

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) …

/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) …

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code. */

#define BACKCHAR(eptr) …

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) …
#define FORWARDCHARTEST(eptr,end) …

/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) …

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) …


/* ------------------- 32-bit support  ------------------ */

#else

/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE2_UCHAR unit. */

#define MAX_UTF_SINGLE_CU …
#define HAS_EXTRALEN …
#define GET_EXTRALEN …
#define NOT_FIRSTCU …

/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

#define GETCHAR …

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST …

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC …

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST …

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN …

/* Get the next UTF-32character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */

#define GETCHARLENTEST …

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.

These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

#define BACKCHAR …

/* Same as above, just in the other direction. */

#define FORWARDCHAR …
#define FORWARDCHARTEST …

/* Same as above, but it allows a fully customizable form. */

#define ACROSSCHAR …

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR …

#endif  /* UTF-32 character handling */
#endif  /* SUPPORT_UNICODE */


/* Mode-dependent macros that have the same definition in all modes. */

#define CU2BYTES(x) …
#define BYTES2CU(x) …
#define PUTINC(a,n,d) …
#define PUT2INC(a,n,d) …


/* ----------------------- HIDDEN STRUCTURES ----------------------------- */

/* NOTE: All these structures *must* start with a pcre2_memctl structure. The
code that uses them is simpler because it assumes this. */

/* The real general context structure. At present it holds only data for custom
memory control. */

pcre2_real_general_context_16;

/* The real compile context structure */

pcre2_real_compile_context_16;

/* The real match context structure. */

pcre2_real_match_context_16;

/* The real convert context structure. */

pcre2_real_convert_context_16;

/* The real compiled code structure. The type for the blocksize field is
defined specially because it is required in pcre2_serialize_decode() when
copying the size from possibly unaligned memory into a variable of the same
type. Use a macro rather than a typedef to avoid compiler warnings when this
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern
have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a
16-bit field here.) */

#undef  CODE_BLOCKSIZE_TYPE
#define CODE_BLOCKSIZE_TYPE …

#undef  LOOKBEHIND_MAX
#define LOOKBEHIND_MAX …

pcre2_real_code_16;

/* The real match data structure. Define ovector as large as it can ever
actually be so that array bound checkers don't grumble. Memory for this
structure is obtained by calling pcre2_match_data_create(), which sets the size
as the offset of ovector plus a pair of elements for each capturable string, so
the size varies from call to call. As the maximum number of capturing
subpatterns is 65535 we must allow for 65536 strings to include the overall
match. (See also the heapframe structure below.) */

struct heapframe;  /* Forward reference */

pcre2_real_match_data_16;


/* ----------------------- PRIVATE STRUCTURES ----------------------------- */

/* These structures are not needed for pcre2test. */

#ifndef PCRE2_PCRE2TEST

/* Structures for checking for mutual function recursion when scanning compiled
or parsed code. */

recurse_check;

parsed_recurse_check;

/* Structure for building a cache when filling in pattern recursion offsets. */

recurse_cache;

/* Structure for maintaining a chain of pointers to the currently incomplete
branches, for testing for left recursion while compiling. */

branch_chain_16;

/* Structure for building a list of named groups during the first pass of
compiling. */

named_group_16;

/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. */

compile_block_16;

/* Structure for keeping the properties of the in-memory stack used
by the JIT matcher. */

pcre2_real_jit_stack_16;

/* Structure for items in a linked list that represents an explicit recursive
call within the pattern when running pcre2_dfa_match(). */

dfa_recursion_info;

/* Structure for "stack" frames that are used for remembering backtracking
positions during matching. As these are used in a vector, with the ovector item
being extended, the size of the structure must be a multiple of PCRE2_SIZE. The
only way to check this at compile time is to force an error by generating an
array with a negative size. By putting this in a typedef (which is never used),
we don't generate any code when all is well. */

heapframe;

/* This typedef is a check that the size of the heapframe structure is a
multiple of PCRE2_SIZE. See various comments above. */

check_heapframe_size;

/* Structure for computing the alignment of heapframe. */

heapframe_align;

/* This define is the minimum alignment required for a heapframe, in bytes. */

#define HEAPFRAME_ALIGNMENT …

/* Structure for passing "static" information around between the functions
doing traditional NFA matching (pcre2_match() and friends). */

match_block_16;

/* A similar structure is used for the same purpose by the DFA matching
functions. */

dfa_match_block_16;

#endif  /* PCRE2_PCRE2TEST */

/* End of pcre2_intmodedep.h */
godot/thirdparty/pcre2/src/pcre2_intmodedep.h