/************************************************* * Perl-Compatible Regular Expressions * *************************************************/ /* PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge New API code Copyright (c) 2015-2024 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "pcre2_internal.h" /* These defines enable debugging code */ /* #define DEBUG_FRAMES_DISPLAY */ /* #define DEBUG_SHOW_OPS */ /* #define DEBUG_SHOW_RMATCH */ #ifdef DEBUG_FRAMES_DISPLAY #include <stdarg.h> #endif #ifdef DEBUG_SHOW_OPS static const char *OP_names[] = { OP_NAME_LIST }; #endif /* These defines identify the name of the block containing "static" information, and fields within it. */ #define NLBLOCK … #define PSSTART … #define PSEND … #define RECURSE_UNSET … /* Masks for identifying the public options that are permitted at match time. */ #define PUBLIC_MATCH_OPTIONS … #define PUBLIC_JIT_MATCH_OPTIONS … /* Non-error returns from and within the match() function. Error returns are externally defined PCRE2_ERROR_xxx codes, which are all negative. */ #define MATCH_MATCH … #define MATCH_NOMATCH … /* Special internal returns used in the match() function. Make them sufficiently negative to avoid the external error codes. */ #define MATCH_ACCEPT … #define MATCH_KETRPOS … /* The next 5 must be kept together and in sequence so that a test that checks for any one of them can use a range. */ #define MATCH_COMMIT … #define MATCH_PRUNE … #define MATCH_SKIP … #define MATCH_SKIP_ARG … #define MATCH_THEN … #define MATCH_BACKTRACK_MAX … #define MATCH_BACKTRACK_MIN … /* Group frame type values. Zero means the frame is not a group frame. The lower 16 bits are used for data (e.g. the capture number). Group frames are used for most groups so that information about the start is easily available at the end without having to scan back through intermediate frames (backtrack points). */ #define GF_CAPTURE … #define GF_NOCAPTURE … #define GF_CONDASSERT … #define GF_RECURSE … /* Masks for the identity and data parts of the group frame type. */ #define GF_IDMASK(a) … #define GF_DATAMASK(a) … /* Repetition types */ enum { … }; /* Min and max values for the common repeats; a maximum of UINT32_MAX => infinity. */ static const uint32_t rep_min[] = …; /* OP_CRPOS{STAR, PLUS, QUERY} */ static const uint32_t rep_max[] = …; /* OP_CRPOS{STAR, PLUS, QUERY} */ /* Repetition types - must include OP_CRPOSRANGE (not needed above) */ static const uint32_t rep_typ[] = …; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ /* Numbers for RMATCH calls at backtracking points. When these lists are changed, the code at RETURN_SWITCH below must be updated in sync. */ enum { … }; #ifdef SUPPORT_WIDE_CHARS enum { … }; #endif #ifdef SUPPORT_UNICODE enum { … }; #endif /* Define short names for general fields in the current backtrack frame, which is always pointed to by the F variable. Occasional references to fields in other frames are written out explicitly. There are also some fields in the current frame whose names start with "temp" that are used for short-term, localised backtracking memory. These are #defined with Lxxx names at the point of use and undefined afterwards. */ #define Fback_frame … #define Fcapture_last … #define Fcurrent_recurse … #define Fecode … #define Feptr … #define Fgroup_frame_type … #define Flast_group_offset … #define Flength … #define Fmark … #define Frdepth … #define Fstart_match … #define Foffset_top … #define Foccu … #define Fop … #define Fovector … #define Freturn_id … #ifdef DEBUG_FRAMES_DISPLAY /************************************************* * Display current frames and contents * *************************************************/ /* This debugging function displays the current set of frames and their contents. It is not called automatically from anywhere, the intention being that calls can be inserted where necessary when debugging frame-related problems. Arguments: f the file to write to F the current top frame P a previous frame of interest frame_size the frame size mb points to the match block match_data points to the match data block s identification text Returns: nothing */ static void display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, match_block *mb, pcre2_match_data *match_data, const char *s, ...) { uint32_t i; heapframe *Q; va_list ap; va_start(ap, s); fprintf(f, "FRAMES "); vfprintf(f, s, ap); va_end(ap); if (P != NULL) fprintf(f, " P=%lu", ((char *)P - (char *)(match_data->heapframes))/frame_size); fprintf(f, "\n"); for (i = 0, Q = match_data->heapframes; Q <= F; i++, Q = (heapframe *)((char *)Q + frame_size)) { fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), Q->back_frame, Q->return_id); if (Q->last_group_offset == PCRE2_UNSET) fprintf(f, " lgoffset=unset\n"); else fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); } } #endif /************************************************* * Process a callout * *************************************************/ /* This function is called for all callouts, whether "standalone" or at the start of a conditional group. Feptr will be pointing to either OP_CALLOUT or OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized with fixed values. Arguments: F points to the current backtracking frame mb points to the match block lengthptr where to return the length of the callout item Returns: the return from the callout or 0 if no callout function exists */ static int do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) { … } /************************************************* * Match a back-reference * *************************************************/ /* This function is called only when it is known that the offset lies within the offsets that have so far been used in the match. Note that in caseless UTF-8 mode, the number of subject bytes matched may be different to the number of reference bytes. (In theory this could also happen in UTF-16 mode, but it seems unlikely.) Arguments: offset index into the offset vector caseless TRUE if caseless F the current backtracking frame pointer mb points to match block lengthptr pointer for returning the length matched Returns: = 0 sucessful match; number of code units matched is set < 0 no match > 0 partial match */ static int match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) { … } /****************************************************************************** ******************************************************************************* "Recursion" in the match() function The original match() function was highly recursive, but this proved to be the source of a number of problems over the years, mostly because of the relatively small system stacks that are commonly found. As new features were added to patterns, various kludges were invented to reduce the amount of stack used, making the code hard to understand in places. A version did exist that used individual frames on the heap instead of calling match() recursively, but this ran substantially slower. The current version is a refactoring that uses a vector of frames to remember backtracking points. This runs no slower, and possibly even a bit faster than the original recursive implementation. At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50 frames) was allocated on the system stack. If this was not big enough, the heap was used for a larger vector. However, it turns out that there are environments where taking as little as 20KiB from the system stack is an embarrassment. After another refactoring, the heap is used exclusively, but a pointer the frames vector and its size are cached in the match_data block, so that there is no new memory allocation if the same match_data block is used for multiple matches (unless the frames vector has to be extended). ******************************************************************************* ******************************************************************************/ /************************************************* * Macros for the match() function * *************************************************/ /* These macros pack up tests that are used for partial matching several times in the code. The second one is used when we already know we are past the end of the subject. We set the "hit end" flag if the pointer is at the end of the subject and either (a) the pointer is past the earliest inspected character (i.e. something has been matched, even if not part of the actual matched string), or (b) the pattern contains a lookbehind. These are the conditions for which adding more characters may allow the current match to continue. For hard partial matching, we immediately return a partial match. Otherwise, carrying on means that a complete match on the current subject will be sought. A partial match is returned only if no complete match can be found. */ #define CHECK_PARTIAL() … #define SCHECK_PARTIAL() … /* These macros are used to implement backtracking. They simulate a recursive call to the match() function by means of a local vector of frames which remember the backtracking points. */ #define RMATCH(ra,rb) … #define RRETURN(ra) … /************************************************* * Match from current position * *************************************************/ /* This function is called to run one match attempt at a single starting point in the subject. Performance note: It might be tempting to extract commonly used fields from the mb structure (e.g. end_subject) into individual variables to improve performance. Tests using gcc on a SPARC disproved this; in the first case, it made performance worse. Arguments: start_eptr starting character in subject start_ecode starting position in compiled code top_bracket number of capturing parentheses in the pattern frame_size size of each backtracking frame match_data pointer to the match_data block mb pointer to "static" variables block Returns: MATCH_MATCH if matched ) these values are >= 0 MATCH_NOMATCH if failed to match ) negative MATCH_xxx value for PRUNE, SKIP, etc negative PCRE2_ERROR_xxx value if aborted by an error condition (e.g. stopped by repeated call or depth limit) */ static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb) { … } /************************************************* * Match a Regular Expression * *************************************************/ /* This function applies a compiled pattern to a subject string and picks out portions of the string if it matches. Two elements in the vector are set for each substring: the offsets to the start and end of the substring. Arguments: code points to the compiled expression subject points to the subject string length length of subject string (may contain binary zeros) start_offset where to start in the subject string options option bits match_data points to a match_data block mcontext points a PCRE2 context Returns: > 0 => success; value is the number of ovector pairs filled = 0 => success, but ovector is not big enough = -1 => failed to match (PCRE2_ERROR_NOMATCH) = -2 => partial match (PCRE2_ERROR_PARTIAL) < -2 => some kind of unexpected problem */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext) { … } /* These #undefs are here to enable unity builds with CMake. */ #undef NLBLOCK /* Block containing newline information */ #undef PSSTART /* Field containing processed string start */ #undef PSEND /* Field containing processed string end */ /* End of pcre2_match.c */