regcomp.c | Explore in Territory

/*-
 * This code is derived from OpenBSD's libc/regex, original license follows:
 *
 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
 * Copyright (c) 1992, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Henry Spencer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
 */

#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include "regex_impl.h"

#include "regutils.h"
#include "regex2.h"

#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"

/* character-class table */
static struct cclass { … } cclasses[] = …;

/* character-name table */
static struct cname { … } cnames[] = …;

/*
 * parse structure, passed up and down to avoid global variables and
 * other clumsinesses
 */
struct parse { … };

static void p_ere(struct parse *, int);
static void p_ere_exp(struct parse *);
static void p_str(struct parse *);
static void p_bre(struct parse *, int, int);
static int p_simp_re(struct parse *, int);
static int p_count(struct parse *);
static void p_bracket(struct parse *);
static void p_b_term(struct parse *, cset *);
static void p_b_cclass(struct parse *, cset *);
static void p_b_eclass(struct parse *, cset *);
static char p_b_symbol(struct parse *);
static char p_b_coll_elem(struct parse *, int);
static char othercase(int);
static void bothcases(struct parse *, int);
static void ordinary(struct parse *, int);
static void nonnewline(struct parse *);
static void repeat(struct parse *, sopno, int, int);
static int seterr(struct parse *, int);
static cset *allocset(struct parse *);
static void freeset(struct parse *, cset *);
static int freezeset(struct parse *, cset *);
static int firstch(struct parse *, cset *);
static int nch(struct parse *, cset *);
static void mcadd(struct parse *, cset *, const char *);
static void mcinvert(struct parse *, cset *);
static void mccase(struct parse *, cset *);
static int isinsets(struct re_guts *, int);
static int samesets(struct re_guts *, int, int);
static void categorize(struct parse *, struct re_guts *);
static sopno dupl(struct parse *, sopno, sopno);
static void doemit(struct parse *, sop, size_t);
static void doinsert(struct parse *, sop, size_t, sopno);
static void dofwd(struct parse *, sopno, sop);
static void enlarge(struct parse *, sopno);
static void stripsnug(struct parse *, struct re_guts *);
static void findmust(struct parse *, struct re_guts *);
static sopno pluscount(struct parse *, struct re_guts *);

static char nuls[10];		/* place to point scanner in event of error */

/*
 * macros for use with parse structure
 * BEWARE:  these know that the parse structure is named `p' !!!
 */
#define PEEK() …
#define PEEK2() …
#define MORE() …
#define MORE2() …
#define SEE(c) …
#define SEETWO(a, b) …
#define EAT(c) …
#define EATTWO(a, b) …
#define NEXT() …
#define NEXT2() …
#define NEXTn(n) …
#define GETNEXT() …
#define SETERROR(e) …
#define REQUIRE(co, e) …
#define MUSTSEE(c, e) …
#define MUSTEAT(c, e) …
#define MUSTNOTSEE(c, e) …
#define EMIT(op, sopnd) …
#define INSERT(op, pos) …
#define AHEAD(pos) …
#define ASTERN(sop, pos) …
#define HERE() …
#define THERE() …
#define THERETHERE() …
#define DROP(n) …

#ifdef	_POSIX2_RE_DUP_MAX
#define DUPMAX …
#else
#define DUPMAX …
#endif
#define REGINFINITY …

#ifndef NDEBUG
static int never = 0;		/* for use in asserts; shuts lint up */
#else
#define never …
#endif

/*
 - llvm_regcomp - interface for parser and compilation
 */
int				/* 0 success, otherwise REG_something */
llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
{ … }

/*
 - p_ere - ERE parser top level, concatenation and alternation
 */
static void
p_ere(struct parse *p, int stop)	/* character this ERE should end at */
{ … }

/*
 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 */
static void
p_ere_exp(struct parse *p)
{ … }

/*
 - p_str - string (no metacharacters) "parser"
 */
static void
p_str(struct parse *p)
{ … }

/*
 - p_bre - BRE parser top level, anchoring and concatenation
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
 *
 * This implementation is a bit of a kludge, in that a trailing $ is first
 * taken as an ordinary character and then revised to be an anchor.  The
 * only undesirable side effect is that '$' gets included as a character
 * category in such cases.  This is fairly harmless; not worth fixing.
 * The amount of lookahead needed to avoid this kludge is excessive.
 */
static void
p_bre(struct parse *p,
    int end1,		/* first terminating character */
    int end2)		/* second terminating character */
{ … }

/*
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 */
static int			/* was the simple RE an unbackslashed $? */
p_simp_re(struct parse *p,
    int starordinary)		/* is a leading * an ordinary character? */
{ … }

/*
 - p_count - parse a repetition count
 */
static int			/* the value */
p_count(struct parse *p)
{ … }

/*
 - p_bracket - parse a bracketed character list
 *
 * Note a significant property of this code:  if the allocset() did SETERROR,
 * no set operations are done.
 */
static void
p_bracket(struct parse *p)
{ … }

/*
 - p_b_term - parse one term of a bracketed character list
 */
static void
p_b_term(struct parse *p, cset *cs)
{ … }

/*
 - p_b_cclass - parse a character-class name and deal with it
 */
static void
p_b_cclass(struct parse *p, cset *cs)
{ … }

/*
 - p_b_eclass - parse an equivalence-class name and deal with it
 *
 * This implementation is incomplete. xxx
 */
static void
p_b_eclass(struct parse *p, cset *cs)
{ … }

/*
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 */
static char			/* value of symbol */
p_b_symbol(struct parse *p)
{ … }

/*
 - p_b_coll_elem - parse a collating-element name and look it up
 */
static char			/* value of collating element */
p_b_coll_elem(struct parse *p,
    int endc)			/* name ended by endc,']' */
{ … }

/*
 - othercase - return the case counterpart of an alphabetic
 */
static char			/* if no counterpart, return ch */
othercase(int ch)
{ … }

/*
 - bothcases - emit a dualcase version of a two-case character
 *
 * Boy, is this implementation ever a kludge...
 */
static void
bothcases(struct parse *p, int ch)
{ … }

/*
 - ordinary - emit an ordinary character
 */
static void
ordinary(struct parse *p, int ch)
{ … }

/*
 - nonnewline - emit REG_NEWLINE version of OANY
 *
 * Boy, is this implementation ever a kludge...
 */
static void
nonnewline(struct parse *p)
{ … }

/*
 - repeat - generate code for a bounded repetition, recursively if needed
 */
static void
repeat(struct parse *p,
    sopno start,		/* operand from here to end of strip */
    int from,			/* repeated from this number */
    int to)			/* to this number of times (maybe INFINITY) */
{ … }

/*
 - seterr - set an error condition
 */
static int			/* useless but makes type checking happy */
seterr(struct parse *p, int e)
{ … }

/*
 - allocset - allocate a set of characters for []
 */
static cset *
allocset(struct parse *p)
{ … }

/*
 - freeset - free a now-unused set
 */
static void
freeset(struct parse *p, cset *cs)
{ … }

/*
 - freezeset - final processing on a set of characters
 *
 * The main task here is merging identical sets.  This is usually a waste
 * of time (although the hash code minimizes the overhead), but can win
 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
 * is done using addition rather than xor -- all ASCII [aA] sets xor to
 * the same value!
 */
static int			/* set number */
freezeset(struct parse *p, cset *cs)
{ … }

/*
 - firstch - return first character in a set (which must have at least one)
 */
static int			/* character; there is no "none" value */
firstch(struct parse *p, cset *cs)
{ … }

/*
 - nch - number of characters in a set
 */
static int
nch(struct parse *p, cset *cs)
{ … }

/*
 - mcadd - add a collating element to a cset
 */
static void
mcadd( struct parse *p, cset *cs, const char *cp)
{ … }

/*
 - mcinvert - invert the list of collating elements in a cset
 *
 * This would have to know the set of possibilities.  Implementation
 * is deferred.
 */
/* ARGSUSED */
static void
mcinvert(struct parse *p, cset *cs)
{ … }

/*
 - mccase - add case counterparts of the list of collating elements in a cset
 *
 * This would have to know the set of possibilities.  Implementation
 * is deferred.
 */
/* ARGSUSED */
static void
mccase(struct parse *p, cset *cs)
{ … }

/*
 - isinsets - is this character in any sets?
 */
static int			/* predicate */
isinsets(struct re_guts *g, int c)
{ … }

/*
 - samesets - are these two characters in exactly the same sets?
 */
static int			/* predicate */
samesets(struct re_guts *g, int c1, int c2)
{ … }

/*
 - categorize - sort out character categories
 */
static void
categorize(struct parse *p, struct re_guts *g)
{ … }

/*
 - dupl - emit a duplicate of a bunch of sops
 */
static sopno			/* start of duplicate */
dupl(struct parse *p,
    sopno start,		/* from here */
    sopno finish)		/* to this less one */
{ … }

/*
 - doemit - emit a strip operator
 *
 * It might seem better to implement this as a macro with a function as
 * hard-case backup, but it's just too big and messy unless there are
 * some changes to the data structures.  Maybe later.
 */
static void
doemit(struct parse *p, sop op, size_t opnd)
{ … }

/*
 - doinsert - insert a sop into the strip
 */
static void
doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
{ … }

/*
 - dofwd - complete a forward reference
 */
static void
dofwd(struct parse *p, sopno pos, sop value)
{ … }

/*
 - enlarge - enlarge the strip
 */
static void
enlarge(struct parse *p, sopno size)
{ … }

/*
 - stripsnug - compact the strip
 */
static void
stripsnug(struct parse *p, struct re_guts *g)
{ … }

/*
 - findmust - fill in must and mlen with longest mandatory literal string
 *
 * This algorithm could do fancy things like analyzing the operands of |
 * for common subsequences.  Someday.  This code is simple and finds most
 * of the interesting cases.
 *
 * Note that must and mlen got initialized during setup.
 */
static void
findmust(struct parse *p, struct re_guts *g)
{ … }

/*
 - pluscount - count + nesting
 */
static sopno			/* nesting depth */
pluscount(struct parse *p, struct re_guts *g)
{ … }
llvm/llvm/lib/Support/regcomp.c