pcre2_dfa_match.c | Explore in Territory

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2023 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */


/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_match(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#define NLBLOCK …
#define PSSTART …
#define PSEND …

#include "pcre2_internal.h"

#define PUBLIC_DFA_MATCH_OPTIONS …


/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA …
#define OP_EXTUNI_EXTRA …
#define OP_ANYNL_EXTRA …
#define OP_HSPACE_EXTRA …
#define OP_VSPACE_EXTRA …


/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found. ***NOTE*** If the start of this table is
modified, the three tables that follow must also be modified. */

static const uint8_t coptable[] = …;

/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified. */

static const uint8_t poptable[] = …;

/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w */

static const uint8_t toptable1[] = …;

static const uint8_t toptable2[] = …;


/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

stateblock;

#define INTS_PER_STATEBLOCK …


/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

#define OVEC_UNIT …

#define RWS_BASE_SIZE …
#define RWS_RSIZE …
#define RWS_OVEC_RSIZE …
#define RWS_OVEC_OSIZE …

/* This structure is at the start of each workspace block. */

RWS_anchor;

#define RWS_ANCHOR_SIZE …



/*************************************************
*               Process a callout                *
*************************************************/

/* This function is called to perform a callout.

Arguments:
  code              current code pointer
  offsets           points to current capture offsets
  current_subject   start of current subject match
  ptr               current position in subject
  mb                the match block
  extracode         extra code offset when called from condition
  lengthptr         where to return the callout length

Returns:            the return from the callout
*/

static int
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
  PCRE2_SIZE *lengthptr)
{ … }



/*************************************************
*         Expand local workspace memory          *
*************************************************/

/* This function is called when internal_dfa_match() is about to be called
recursively and there is insufficient working space left in the current
workspace block. If there's an existing next block, use it; otherwise get a new
block unless the heap limit is reached.

Arguments:
  rwsptr     pointer to block pointer (updated)
  ovecsize   space needed for an ovector
  mb         the match block

Returns:     0 rwsptr has been updated
            !0 an error code
*/

static int
more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
{ … }



/*************************************************
*     Match a Regular Expression - DFA engine    *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */

#define ADD_ACTIVE(x,y) …

#define ADD_ACTIVE_DATA(x,y,z) …

#define ADD_NEW(x,y) …

#define ADD_NEW_DATA(x,y,z) …

/* And now, here is the code */

static int
internal_dfa_match(
  dfa_match_block *mb,
  PCRE2_SPTR this_start_code,
  PCRE2_SPTR current_subject,
  PCRE2_SIZE start_offset,
  PCRE2_SIZE *offsets,
  uint32_t offsetcount,
  int *workspace,
  int wscount,
  uint32_t rlevel,
  int *RWS)
{ … }



/*************************************************
*     Match a pattern using the DFA algorithm    *
*************************************************/

/* This function matches a compiled pattern to a subject string, using the
alternate matching algorithm that finds all matches at once.

Arguments:
  code          points to the compiled pattern
  subject       subject string
  length        length of subject string
  startoffset   where to start matching in the subject
  options       option bits
  match_data    points to a match data structure
  gcontext      points to a match context
  workspace     pointer to workspace
  wscount       size of workspace

Returns:        > 0 => number of match offset pairs placed in offsets
                = 0 => offsets overflowed; longest matches are present
                 -1 => failed to match
               < -1 => some kind of unexpected problem
*/

PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
  pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
{ … }

/* These #undefs are here to enable unity builds with CMake. */

#undef NLBLOCK /* Block containing newline information */
#undef PSSTART /* Field containing processed string start */
#undef PSEND   /* Field containing processed string end */

/* End of pcre2_dfa_match.c */
godot/thirdparty/pcre2/src/pcre2_dfa_match.c