/*- * This code is derived from OpenBSD's libc/regex, original license follows: * * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 */ #include <sys/types.h> #include <stdint.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include <limits.h> #include <stdlib.h> #include "regex_impl.h" #include "regutils.h" #include "regex2.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" /* character-class table */ static struct cclass { … } cclasses[] = …; /* character-name table */ static struct cname { … } cnames[] = …; /* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ struct parse { … }; static void p_ere(struct parse *, int); static void p_ere_exp(struct parse *); static void p_str(struct parse *); static void p_bre(struct parse *, int, int); static int p_simp_re(struct parse *, int); static int p_count(struct parse *); static void p_bracket(struct parse *); static void p_b_term(struct parse *, cset *); static void p_b_cclass(struct parse *, cset *); static void p_b_eclass(struct parse *, cset *); static char p_b_symbol(struct parse *); static char p_b_coll_elem(struct parse *, int); static char othercase(int); static void bothcases(struct parse *, int); static void ordinary(struct parse *, int); static void nonnewline(struct parse *); static void repeat(struct parse *, sopno, int, int); static int seterr(struct parse *, int); static cset *allocset(struct parse *); static void freeset(struct parse *, cset *); static int freezeset(struct parse *, cset *); static int firstch(struct parse *, cset *); static int nch(struct parse *, cset *); static void mcadd(struct parse *, cset *, const char *); static void mcinvert(struct parse *, cset *); static void mccase(struct parse *, cset *); static int isinsets(struct re_guts *, int); static int samesets(struct re_guts *, int, int); static void categorize(struct parse *, struct re_guts *); static sopno dupl(struct parse *, sopno, sopno); static void doemit(struct parse *, sop, size_t); static void doinsert(struct parse *, sop, size_t, sopno); static void dofwd(struct parse *, sopno, sop); static void enlarge(struct parse *, sopno); static void stripsnug(struct parse *, struct re_guts *); static void findmust(struct parse *, struct re_guts *); static sopno pluscount(struct parse *, struct re_guts *); static char nuls[10]; /* place to point scanner in event of error */ /* * macros for use with parse structure * BEWARE: these know that the parse structure is named `p' !!! */ #define PEEK() … #define PEEK2() … #define MORE() … #define MORE2() … #define SEE(c) … #define SEETWO(a, b) … #define EAT(c) … #define EATTWO(a, b) … #define NEXT() … #define NEXT2() … #define NEXTn(n) … #define GETNEXT() … #define SETERROR(e) … #define REQUIRE(co, e) … #define MUSTSEE(c, e) … #define MUSTEAT(c, e) … #define MUSTNOTSEE(c, e) … #define EMIT(op, sopnd) … #define INSERT(op, pos) … #define AHEAD(pos) … #define ASTERN(sop, pos) … #define HERE() … #define THERE() … #define THERETHERE() … #define DROP(n) … #ifdef _POSIX2_RE_DUP_MAX #define DUPMAX … #else #define DUPMAX … #endif #define REGINFINITY … #ifndef NDEBUG static int never = 0; /* for use in asserts; shuts lint up */ #else #define never … #endif /* - llvm_regcomp - interface for parser and compilation */ int /* 0 success, otherwise REG_something */ llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) { … } /* - p_ere - ERE parser top level, concatenation and alternation */ static void p_ere(struct parse *p, int stop) /* character this ERE should end at */ { … } /* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op */ static void p_ere_exp(struct parse *p) { … } /* - p_str - string (no metacharacters) "parser" */ static void p_str(struct parse *p) { … } /* - p_bre - BRE parser top level, anchoring and concatenation * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first * taken as an ordinary character and then revised to be an anchor. The * only undesirable side effect is that '$' gets included as a character * category in such cases. This is fairly harmless; not worth fixing. * The amount of lookahead needed to avoid this kludge is excessive. */ static void p_bre(struct parse *p, int end1, /* first terminating character */ int end2) /* second terminating character */ { … } /* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition */ static int /* was the simple RE an unbackslashed $? */ p_simp_re(struct parse *p, int starordinary) /* is a leading * an ordinary character? */ { … } /* - p_count - parse a repetition count */ static int /* the value */ p_count(struct parse *p) { … } /* - p_bracket - parse a bracketed character list * * Note a significant property of this code: if the allocset() did SETERROR, * no set operations are done. */ static void p_bracket(struct parse *p) { … } /* - p_b_term - parse one term of a bracketed character list */ static void p_b_term(struct parse *p, cset *cs) { … } /* - p_b_cclass - parse a character-class name and deal with it */ static void p_b_cclass(struct parse *p, cset *cs) { … } /* - p_b_eclass - parse an equivalence-class name and deal with it * * This implementation is incomplete. xxx */ static void p_b_eclass(struct parse *p, cset *cs) { … } /* - p_b_symbol - parse a character or [..]ed multicharacter collating symbol */ static char /* value of symbol */ p_b_symbol(struct parse *p) { … } /* - p_b_coll_elem - parse a collating-element name and look it up */ static char /* value of collating element */ p_b_coll_elem(struct parse *p, int endc) /* name ended by endc,']' */ { … } /* - othercase - return the case counterpart of an alphabetic */ static char /* if no counterpart, return ch */ othercase(int ch) { … } /* - bothcases - emit a dualcase version of a two-case character * * Boy, is this implementation ever a kludge... */ static void bothcases(struct parse *p, int ch) { … } /* - ordinary - emit an ordinary character */ static void ordinary(struct parse *p, int ch) { … } /* - nonnewline - emit REG_NEWLINE version of OANY * * Boy, is this implementation ever a kludge... */ static void nonnewline(struct parse *p) { … } /* - repeat - generate code for a bounded repetition, recursively if needed */ static void repeat(struct parse *p, sopno start, /* operand from here to end of strip */ int from, /* repeated from this number */ int to) /* to this number of times (maybe INFINITY) */ { … } /* - seterr - set an error condition */ static int /* useless but makes type checking happy */ seterr(struct parse *p, int e) { … } /* - allocset - allocate a set of characters for [] */ static cset * allocset(struct parse *p) { … } /* - freeset - free a now-unused set */ static void freeset(struct parse *p, cset *cs) { … } /* - freezeset - final processing on a set of characters * * The main task here is merging identical sets. This is usually a waste * of time (although the hash code minimizes the overhead), but can win * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash * is done using addition rather than xor -- all ASCII [aA] sets xor to * the same value! */ static int /* set number */ freezeset(struct parse *p, cset *cs) { … } /* - firstch - return first character in a set (which must have at least one) */ static int /* character; there is no "none" value */ firstch(struct parse *p, cset *cs) { … } /* - nch - number of characters in a set */ static int nch(struct parse *p, cset *cs) { … } /* - mcadd - add a collating element to a cset */ static void mcadd( struct parse *p, cset *cs, const char *cp) { … } /* - mcinvert - invert the list of collating elements in a cset * * This would have to know the set of possibilities. Implementation * is deferred. */ /* ARGSUSED */ static void mcinvert(struct parse *p, cset *cs) { … } /* - mccase - add case counterparts of the list of collating elements in a cset * * This would have to know the set of possibilities. Implementation * is deferred. */ /* ARGSUSED */ static void mccase(struct parse *p, cset *cs) { … } /* - isinsets - is this character in any sets? */ static int /* predicate */ isinsets(struct re_guts *g, int c) { … } /* - samesets - are these two characters in exactly the same sets? */ static int /* predicate */ samesets(struct re_guts *g, int c1, int c2) { … } /* - categorize - sort out character categories */ static void categorize(struct parse *p, struct re_guts *g) { … } /* - dupl - emit a duplicate of a bunch of sops */ static sopno /* start of duplicate */ dupl(struct parse *p, sopno start, /* from here */ sopno finish) /* to this less one */ { … } /* - doemit - emit a strip operator * * It might seem better to implement this as a macro with a function as * hard-case backup, but it's just too big and messy unless there are * some changes to the data structures. Maybe later. */ static void doemit(struct parse *p, sop op, size_t opnd) { … } /* - doinsert - insert a sop into the strip */ static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos) { … } /* - dofwd - complete a forward reference */ static void dofwd(struct parse *p, sopno pos, sop value) { … } /* - enlarge - enlarge the strip */ static void enlarge(struct parse *p, sopno size) { … } /* - stripsnug - compact the strip */ static void stripsnug(struct parse *p, struct re_guts *g) { … } /* - findmust - fill in must and mlen with longest mandatory literal string * * This algorithm could do fancy things like analyzing the operands of | * for common subsequences. Someday. This code is simple and finds most * of the interesting cases. * * Note that must and mlen got initialized during setup. */ static void findmust(struct parse *p, struct re_guts *g) { … } /* - pluscount - count + nesting */ static sopno /* nesting depth */ pluscount(struct parse *p, struct re_guts *g) { … }