"Fossies" - the Fresh Open Source Software Archive

Member "heaplayers-351/benchmarks/perl/regcomp.h" (20 Sep 2003, 7241 Bytes) of package /linux/misc/old/heaplayers_3_5_1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "regcomp.h" see the Fossies "Dox" file reference documentation.

    1 /* $Header: /nfs/loki/data3/emery/REPOSITORY/heaplayers/benchmarks/perl/regcomp.h,v 1.1 2003/09/20 12:59:59 emery Exp $
    2  *
    3  * $Log: regcomp.h,v $
    4  * Revision 1.1  2003/09/20 12:59:59  emery
    5  * Wilson's perl benchmarks.
    6  *
    7  * Revision 4.0  91/03/20  01:39:09  lwall
    8  * 4.0 baseline.
    9  * 
   10  */
   11 
   12 /*
   13  * The "internal use only" fields in regexp.h are present to pass info from
   14  * compile to execute that permits the execute phase to run lots faster on
   15  * simple cases.  They are:
   16  *
   17  * regstart str that must begin a match; Nullch if none obvious
   18  * reganch  is the match anchored (at beginning-of-line only)?
   19  * regmust  string (pointer into program) that match must include, or NULL
   20  *  [regmust changed to STR* for bminstr()--law]
   21  * regmlen  length of regmust string
   22  *  [regmlen not used currently]
   23  *
   24  * Regstart and reganch permit very fast decisions on suitable starting points
   25  * for a match, cutting down the work a lot.  Regmust permits fast rejection
   26  * of lines that cannot possibly match.  The regmust tests are costly enough
   27  * that regcomp() supplies a regmust only if the r.e. contains something
   28  * potentially expensive (at present, the only such thing detected is * or +
   29  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
   30  * supplied because the test in regexec() needs it and regcomp() is computing
   31  * it anyway.
   32  * [regmust is now supplied always.  The tests that use regmust have a
   33  * heuristic that disables the test if it usually matches.]
   34  *
   35  * [In fact, we now use regmust in many cases to locate where the search
   36  * starts in the string, so if regback is >= 0, the regmust search is never
   37  * wasted effort.  The regback variable says how many characters back from
   38  * where regmust matched is the earliest possible start of the match.
   39  * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
   40  */
   41 
   42 /*
   43  * Structure for regexp "program".  This is essentially a linear encoding
   44  * of a nondeterministic finite-state machine (aka syntax charts or
   45  * "railroad normal form" in parsing technology).  Each node is an opcode
   46  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
   47  * all nodes except BRANCH implement concatenation; a "next" pointer with
   48  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
   49  * have one of the subtle syntax dependencies:  an individual BRANCH (as
   50  * opposed to a collection of them) is never concatenated with anything
   51  * because of operator precedence.)  The operand of some types of node is
   52  * a literal string; for others, it is a node leading into a sub-FSM.  In
   53  * particular, the operand of a BRANCH node is the first node of the branch.
   54  * (NB this is *not* a tree structure:  the tail of the branch connects
   55  * to the thing following the set of BRANCHes.)  The opcodes are:
   56  */
   57 
   58 /* definition   number  opnd?   meaning */
   59 #define END 0   /* no   End of program. */
   60 #define BOL 1   /* no   Match "" at beginning of line. */
   61 #define EOL 2   /* no   Match "" at end of line. */
   62 #define ANY 3   /* no   Match any one character. */
   63 #define ANYOF   4   /* str  Match character in (or not in) this class. */
   64 #define CURLY   5   /* str  Match this simple thing {n,m} times. */
   65 #define BRANCH  6   /* node Match this alternative, or the next... */
   66 #define BACK    7   /* no   Match "", "next" ptr points backward. */
   67 #define EXACTLY 8   /* str  Match this string (preceded by length). */
   68 #define NOTHING 9   /* no   Match empty string. */
   69 #define STAR    10  /* node Match this (simple) thing 0 or more times. */
   70 #define PLUS    11  /* node Match this (simple) thing 1 or more times. */
   71 #define ALNUM   12  /* no   Match any alphanumeric character */
   72 #define NALNUM  13  /* no   Match any non-alphanumeric character */
   73 #define BOUND   14  /* no   Match "" at any word boundary */
   74 #define NBOUND  15  /* no   Match "" at any word non-boundary */
   75 #define SPACE   16  /* no   Match any whitespace character */
   76 #define NSPACE  17  /* no   Match any non-whitespace character */
   77 #define DIGIT   18  /* no   Match any numeric character */
   78 #define NDIGIT  19  /* no   Match any non-numeric character */
   79 #define REF 20  /* num  Match some already matched string */
   80 #define OPEN    21  /* num  Mark this point in input as start of #n. */
   81 #define CLOSE   22  /* num  Analogous to OPEN. */
   82 
   83 /*
   84  * Opcode notes:
   85  *
   86  * BRANCH   The set of branches constituting a single choice are hooked
   87  *      together with their "next" pointers, since precedence prevents
   88  *      anything being concatenated to any individual branch.  The
   89  *      "next" pointer of the last BRANCH in a choice points to the
   90  *      thing following the whole choice.  This is also where the
   91  *      final "next" pointer of each individual branch points; each
   92  *      branch starts with the operand node of a BRANCH node.
   93  *
   94  * BACK     Normal "next" pointers all implicitly point forward; BACK
   95  *      exists to make loop structures possible.
   96  *
   97  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
   98  *      BRANCH structures using BACK.  Simple cases (one character
   99  *      per match) are implemented with STAR and PLUS for speed
  100  *      and to minimize recursive plunges.
  101  *
  102  * OPEN,CLOSE   ...are numbered at compile time.
  103  */
  104 
  105 #ifndef DOINIT
  106 extern char regarglen[];
  107 #else
  108 char regarglen[] = {0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2};
  109 #endif
  110 
  111 /* The following have no fixed length. */
  112 #ifndef DOINIT
  113 extern char varies[];
  114 #else
  115 char varies[] = {BRANCH,BACK,STAR,PLUS,CURLY,REF,0};
  116 #endif
  117 
  118 /* The following always have a length of 1. */
  119 #ifndef DOINIT
  120 extern char simple[];
  121 #else
  122 char simple[] = {ANY,ANYOF,ALNUM,NALNUM,SPACE,NSPACE,DIGIT,NDIGIT,0};
  123 #endif
  124 
  125 EXT char regdummy;
  126 
  127 /*
  128  * A node is one char of opcode followed by two chars of "next" pointer.
  129  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
  130  * value is a positive offset from the opcode of the node containing it.
  131  * An operand, if any, simply follows the node.  (Note that much of the
  132  * code generation knows about this implicit relationship.)
  133  *
  134  * Using two bytes for the "next" pointer is vast overkill for most things,
  135  * but allows patterns to get big without disasters.
  136  *
  137  * [If REGALIGN is defined, the "next" pointer is always aligned on an even
  138  * boundary, and reads the offset directly as a short.  Also, there is no
  139  * special test to reverse the sign of BACK pointers since the offset is
  140  * stored negative.]
  141  */
  142 
  143 #ifndef gould
  144 #ifndef cray
  145 #ifndef eta10
  146 #define REGALIGN
  147 #endif
  148 #endif
  149 #endif
  150 
  151 #define OP(p)   (*(p))
  152 
  153 #ifndef lint
  154 #ifdef REGALIGN
  155 #define NEXT(p) (*(short*)(p+1))
  156 #define ARG1(p) (*(unsigned short*)(p+3))
  157 #define ARG2(p) (*(unsigned short*)(p+5))
  158 #else
  159 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  160 #define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
  161 #define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
  162 #endif
  163 #else /* lint */
  164 #define NEXT(p) 0
  165 #endif /* lint */
  166 
  167 #define OPERAND(p)  ((p) + 3)
  168 
  169 #ifdef REGALIGN
  170 #define NEXTOPER(p) ((p) + 4)
  171 #else
  172 #define NEXTOPER(p) ((p) + 3)
  173 #endif
  174 
  175 #define MAGIC 0234
  176 
  177 /*
  178  * Utility definitions.
  179  */
  180 #ifndef lint
  181 #ifndef CHARBITS
  182 #define UCHARAT(p)  ((int)*(unsigned char *)(p))
  183 #else
  184 #define UCHARAT(p)  ((int)*(p)&CHARBITS)
  185 #endif
  186 #else /* lint */
  187 #define UCHARAT(p)  regdummy
  188 #endif /* lint */
  189 
  190 #define FAIL(m) fatal("/%s/: %s",regprecomp,m)
  191 
  192 char *regnext();
  193 #ifdef DEBUGGING
  194 void regdump();
  195 char *regprop();
  196 #endif
  197