/************************************************* * Perl-Compatible Regular Expressions * *************************************************/ /* PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge New API code Copyright (c) 2016-2022 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ /* This module contains functions that scan a compiled pattern and change repeats into possessive repeats where possible. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "pcre2_internal.h" /************************************************* * Tables for auto-possessification * *************************************************/ /* This table is used to check whether auto-possessification is possible between adjacent character-type opcodes. The left-hand (repeated) opcode is used to select the row, and the right-hand opcode is use to select the column. A value of 1 means that auto-possessification is OK. For example, the second value in the first row means that \D+\d can be turned into \D++\d. The Unicode property types (\P and \p) have to be present to fill out the table because of what their opcode values are, but the table values should always be zero because property types are handled separately in the code. The last four columns apply to items that cannot be repeated, so there is no need to have rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ #define APTROWS … #define APTCOLS … static const uint8_t autoposstab[APTROWS][APTCOLS] = …; #ifdef SUPPORT_UNICODE /* This table is used to check whether auto-possessification is possible between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The left-hand (repeated) opcode is used to select the row, and the right-hand opcode is used to select the column. The values are as follows: 0 Always return FALSE (never auto-possessify) 1 Character groups are distinct (possessify if both are OP_PROP) 2 Check character categories in the same group (general or particular) 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) 4 Check left general category vs right particular category 5 Check right general category vs left particular category 6 Left alphanum vs right general category 7 Left space vs right general category 8 Left word vs right general category 9 Right alphanum vs left general category 10 Right space vs left general category 11 Right word vs left general category 12 Left alphanum vs right particular category 13 Left space vs right particular category 14 Left word vs right particular category 15 Right alphanum vs left particular category 16 Right space vs left particular category 17 Right word vs left particular category */ static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = …; /* This table is used to check whether auto-possessification is possible between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one specifies a general category and the other specifies a particular category. The row is selected by the general category and the column by the particular category. The value is 1 if the particular category is not part of the general category. */ static const uint8_t catposstab[7][30] = …; /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against a general or particular category. The properties in each row are those that apply to the character set in question. Duplication means that a little unnecessary work is done when checking, but this keeps things much simpler because they can all use the same code. For more details see the comment where this table is used. Note: SPACE and PXSPACE used to be different because Perl excluded VT from "space", but from Perl 5.18 it's included, so both categories are treated the same here. */ static const uint8_t posspropstab[3][4] = …; #endif /* SUPPORT_UNICODE */ #ifdef SUPPORT_UNICODE /************************************************* * Check a character and a property * *************************************************/ /* This function is called by compare_opcodes() when a property item is adjacent to a fixed character. Arguments: c the character ptype the property type pdata the data for the type negated TRUE if it's a negated property (\P or \p{^) Returns: TRUE if auto-possessifying is OK */ static BOOL check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, BOOL negated) { … } #endif /* SUPPORT_UNICODE */ /************************************************* * Base opcode of repeated opcodes * *************************************************/ /* Returns the base opcode for repeated single character type opcodes. If the opcode is not a repeated character type, it returns with the original value. Arguments: c opcode Returns: base opcode for the type */ static PCRE2_UCHAR get_repeat_base(PCRE2_UCHAR c) { … } /************************************************* * Fill the character property list * *************************************************/ /* Checks whether the code points to an opcode that can take part in auto- possessification, and if so, fills a list with its properties. Arguments: code points to start of expression utf TRUE if in UTF mode ucp TRUE if in UCP mode fcc points to the case-flipping table list points to output list list[0] will be filled with the opcode list[1] will be non-zero if this opcode can match an empty character string list[2..7] depends on the opcode Returns: points to the start of the next opcode if *code is accepted NULL if *code is not accepted */ static PCRE2_SPTR get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, uint32_t *list) { … } /************************************************* * Scan further character sets for match * *************************************************/ /* Checks whether the base and the current opcode have a common character, in which case the base cannot be possessified. Arguments: code points to the byte code utf TRUE in UTF mode ucp TRUE in UCP mode cb compile data block base_list the data list of the base opcode base_end the end of the base opcode rec_limit points to recursion depth counter Returns: TRUE if the auto-possessification is possible */ static BOOL compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) { … } /************************************************* * Scan compiled regex for auto-possession * *************************************************/ /* Replaces single character iterations with their possessive alternatives if appropriate. This function modifies the compiled opcode! Hitting a non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches overly complicated or large patterns. In these cases, the check just stops, leaving the remainder of the pattern unpossessified. Arguments: code points to start of the byte code cb compile data block Returns: 0 for success -1 if a non-existant opcode is encountered */ int PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) { … } /* End of pcre2_auto_possess.c */