"Fossies" - the Fresh Open Source Software Archive

Member "gawk-5.1.0/support/dfa.c" (15 Mar 2020, 136797 Bytes) of package /linux/misc/gawk-5.1.0.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "dfa.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /* dfa.c - deterministic extended regexp routines for GNU
    2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
    3    Foundation, Inc.
    4 
    5    This program is free software; you can redistribute it and/or modify
    6    it under the terms of the GNU General Public License as published by
    7    the Free Software Foundation; either version 3, or (at your option)
    8    any later version.
    9 
   10    This program is distributed in the hope that it will be useful,
   11    but WITHOUT ANY WARRANTY; without even the implied warranty of
   12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13    GNU General Public License for more details.
   14 
   15    You should have received a copy of the GNU General Public License
   16    along with this program; if not, write to the Free Software
   17    Foundation, Inc.,
   18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
   19 
   20 /* Written June, 1988 by Mike Haertel
   21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
   22 
   23 #include <config.h>
   24 
   25 #include <assert.h>
   26 #include <ctype.h>
   27 #include <stdint.h>
   28 #include <stdio.h>
   29 #include <stdlib.h>
   30 #include <limits.h>
   31 #include <string.h>
   32 
   33 #include "dfa.h"
   34 #include "flexmember.h"
   35 
   36 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
   37    indexes into objects.  It is signed to help catch integer overflow.
   38    It has its own name because it is for nonnegative values only.  */
   39 typedef ptrdiff_t idx_t;
   40 static idx_t const IDX_MAX = PTRDIFF_MAX;
   41 
   42 static bool
   43 streq (char const *a, char const *b)
   44 {
   45   return strcmp (a, b) == 0;
   46 }
   47 
   48 static bool
   49 isasciidigit (char c)
   50 {
   51   return '0' <= c && c <= '9';
   52 }
   53 
   54 #include "gettext.h"
   55 #define _(str) gettext (str)
   56 
   57 #include <wchar.h>
   58 
   59 #include "intprops.h"
   60 #include "xalloc.h"
   61 #include "localeinfo.h"
   62 
   63 #ifndef FALLTHROUGH
   64 # if __GNUC__ < 7
   65 #  define FALLTHROUGH ((void) 0)
   66 # else
   67 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
   68 # endif
   69 #endif
   70 
   71 #ifndef MIN
   72 # define MIN(a,b) ((a) < (b) ? (a) : (b))
   73 #endif
   74 
   75 /* HPUX defines these as macros in sys/param.h.  */
   76 #ifdef setbit
   77 # undef setbit
   78 #endif
   79 #ifdef clrbit
   80 # undef clrbit
   81 #endif
   82 
   83 /* For code that does not use Gnulib’s isblank module.  */
   84 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
   85 # define isblank dfa_isblank
   86 static int
   87 isblank (int c)
   88 {
   89   return c == ' ' || c == '\t';
   90 }
   91 #endif
   92 
   93 /* First integer value that is greater than any character code.  */
   94 enum { NOTCHAR = 1 << CHAR_BIT };
   95 
   96 #ifdef UINT_LEAST64_MAX
   97 
   98 /* Number of bits used in a charclass word.  */
   99 enum { CHARCLASS_WORD_BITS = 64 };
  100 
  101 /* This represents part of a character class.  It must be unsigned and
  102    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
  103 typedef uint_least64_t charclass_word;
  104 
  105 /* Part of a charclass initializer that represents 64 bits' worth of a
  106    charclass, where LO and HI are the low and high-order 32 bits of
  107    the 64-bit quantity.  */
  108 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
  109 
  110 #else
  111 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
  112 enum { CHARCLASS_WORD_BITS = 32 };
  113 typedef unsigned long charclass_word;
  114 # define CHARCLASS_PAIR(lo, hi) lo, hi
  115 #endif
  116 
  117 /* An initializer for a charclass whose 32-bit words are A through H.  */
  118 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)      \
  119    {{                           \
  120       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
  121       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)  \
  122    }}
  123 
  124 /* The maximum useful value of a charclass_word; all used bits are 1.  */
  125 static charclass_word const CHARCLASS_WORD_MASK
  126   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
  127 
  128 /* Number of words required to hold a bit for every character.  */
  129 enum
  130 {
  131   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
  132 };
  133 
  134 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
  135 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
  136 
  137 /* Convert a possibly-signed character to an unsigned character.  This is
  138    a bit safer than casting to unsigned char, since it catches some type
  139    errors that the cast doesn't.  */
  140 static unsigned char
  141 to_uchar (char ch)
  142 {
  143   return ch;
  144 }
  145 
  146 /* Contexts tell us whether a character is a newline or a word constituent.
  147    Word-constituent characters are those that satisfy iswalnum, plus '_'.
  148    Each character has a single CTX_* value; bitmasks of CTX_* values denote
  149    a particular character class.
  150 
  151    A state also stores a context value, which is a bitmask of CTX_* values.
  152    A state's context represents a set of characters that the state's
  153    predecessors must match.  For example, a state whose context does not
  154    include CTX_LETTER will never have transitions where the previous
  155    character is a word constituent.  A state whose context is CTX_ANY
  156    might have transitions from any character.  */
  157 
  158 enum
  159   {
  160     CTX_NONE = 1,
  161     CTX_LETTER = 2,
  162     CTX_NEWLINE = 4,
  163     CTX_ANY = 7
  164   };
  165 
  166 /* Sometimes characters can only be matched depending on the surrounding
  167    context.  Such context decisions depend on what the previous character
  168    was, and the value of the current (lookahead) character.  Context
  169    dependent constraints are encoded as 9-bit integers.  Each bit that
  170    is set indicates that the constraint succeeds in the corresponding
  171    context.
  172 
  173    bit 6-8  - valid contexts when next character is CTX_NEWLINE
  174    bit 3-5  - valid contexts when next character is CTX_LETTER
  175    bit 0-2  - valid contexts when next character is CTX_NONE
  176 
  177    succeeds_in_context determines whether a given constraint
  178    succeeds in a particular context.  Prev is a bitmask of possible
  179    context values for the previous character, curr is the (single-bit)
  180    context value for the lookahead character.  */
  181 static int
  182 newline_constraint (int constraint)
  183 {
  184   return (constraint >> 6) & 7;
  185 }
  186 static int
  187 letter_constraint (int constraint)
  188 {
  189   return (constraint >> 3) & 7;
  190 }
  191 static int
  192 other_constraint (int constraint)
  193 {
  194   return constraint & 7;
  195 }
  196 
  197 static bool
  198 succeeds_in_context (int constraint, int prev, int curr)
  199 {
  200   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
  201               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
  202               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
  203              & prev);
  204 }
  205 
  206 /* The following describe what a constraint depends on.  */
  207 static bool
  208 prev_newline_dependent (int constraint)
  209 {
  210   return ((constraint ^ constraint >> 2) & 0111) != 0;
  211 }
  212 static bool
  213 prev_letter_dependent (int constraint)
  214 {
  215   return ((constraint ^ constraint >> 1) & 0111) != 0;
  216 }
  217 
  218 /* Tokens that match the empty string subject to some constraint actually
  219    work by applying that constraint to determine what may follow them,
  220    taking into account what has gone before.  The following values are
  221    the constraints corresponding to the special tokens previously defined.  */
  222 enum
  223   {
  224     NO_CONSTRAINT = 0777,
  225     BEGLINE_CONSTRAINT = 0444,
  226     ENDLINE_CONSTRAINT = 0700,
  227     BEGWORD_CONSTRAINT = 0050,
  228     ENDWORD_CONSTRAINT = 0202,
  229     LIMWORD_CONSTRAINT = 0252,
  230     NOTLIMWORD_CONSTRAINT = 0525
  231   };
  232 
  233 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
  234    are operators and others are terminal symbols.  Most (but not all) of these
  235    codes are returned by the lexical analyzer.  */
  236 
  237 typedef ptrdiff_t token;
  238 static token const TOKEN_MAX = PTRDIFF_MAX;
  239 
  240 /* States are indexed by state_num values.  These are normally
  241    nonnegative but -1 is used as a special value.  */
  242 typedef ptrdiff_t state_num;
  243 
  244 /* Predefined token values.  */
  245 enum
  246 {
  247   END = -1,                     /* END is a terminal symbol that matches the
  248                                    end of input; any value of END or less in
  249                                    the parse tree is such a symbol.  Accepting
  250                                    states of the DFA are those that would have
  251                                    a transition on END.  This is -1, not some
  252                                    more-negative value, to tweak the speed of
  253                                    comparisons to END.  */
  254 
  255   /* Ordinary character values are terminal symbols that match themselves.  */
  256 
  257   /* CSET must come last in the following list of special tokens.  Otherwise,
  258      the list order matters only for performance.  Related special tokens
  259      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
  260      || CSET <= t) can be done with a single machine-level comparison.  */
  261 
  262   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
  263                                    the empty string.  */
  264 
  265   QMARK,                        /* QMARK is an operator of one argument that
  266                                    matches zero or one occurrences of its
  267                                    argument.  */
  268 
  269   STAR,                         /* STAR is an operator of one argument that
  270                                    matches the Kleene closure (zero or more
  271                                    occurrences) of its argument.  */
  272 
  273   PLUS,                         /* PLUS is an operator of one argument that
  274                                    matches the positive closure (one or more
  275                                    occurrences) of its argument.  */
  276 
  277   REPMN,                        /* REPMN is a lexical token corresponding
  278                                    to the {m,n} construct.  REPMN never
  279                                    appears in the compiled token vector.  */
  280 
  281   CAT,                          /* CAT is an operator of two arguments that
  282                                    matches the concatenation of its
  283                                    arguments.  CAT is never returned by the
  284                                    lexical analyzer.  */
  285 
  286   OR,                           /* OR is an operator of two arguments that
  287                                    matches either of its arguments.  */
  288 
  289   LPAREN,                       /* LPAREN never appears in the parse tree,
  290                                    it is only a lexeme.  */
  291 
  292   RPAREN,                       /* RPAREN never appears in the parse tree.  */
  293 
  294   WCHAR,                        /* Only returned by lex.  wctok contains
  295                                    the wide character representation.  */
  296 
  297   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
  298                                    a valid multibyte (or single byte) character.
  299                                    It is used only if MB_CUR_MAX > 1.  */
  300 
  301   BEG,                          /* BEG is an initial symbol that matches the
  302                                    beginning of input.  */
  303 
  304   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
  305                                    the empty string at the beginning of a
  306                                    line.  */
  307 
  308   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
  309                                    the empty string at the end of a line.  */
  310 
  311   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
  312                                    the empty string at the beginning of a
  313                                    word.  */
  314 
  315   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
  316                                    the empty string at the end of a word.  */
  317 
  318   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
  319                                    the empty string at the beginning or the
  320                                    end of a word.  */
  321 
  322   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
  323                                    matches the empty string not at
  324                                    the beginning or end of a word.  */
  325 
  326   BACKREF,                      /* BACKREF is generated by \<digit>
  327                                    or by any other construct that
  328                                    is not completely handled.  If the scanner
  329                                    detects a transition on backref, it returns
  330                                    a kind of "semi-success" indicating that
  331                                    the match will have to be verified with
  332                                    a backtracking matcher.  */
  333 
  334   MBCSET,                       /* MBCSET is similar to CSET, but for
  335                                    multibyte characters.  */
  336 
  337   CSET                          /* CSET and (and any value greater) is a
  338                                    terminal symbol that matches any of a
  339                                    class of characters.  */
  340 };
  341 
  342 
  343 /* States of the recognizer correspond to sets of positions in the parse
  344    tree, together with the constraints under which they may be matched.
  345    So a position is encoded as an index into the parse tree together with
  346    a constraint.  */
  347 typedef struct
  348 {
  349   idx_t index;          /* Index into the parse array.  */
  350   unsigned int constraint;      /* Constraint for matching this position.  */
  351 } position;
  352 
  353 /* Sets of positions are stored as arrays.  */
  354 typedef struct
  355 {
  356   position *elems;              /* Elements of this position set.  */
  357   idx_t nelem;          /* Number of elements in this set.  */
  358   idx_t alloc;          /* Number of elements allocated in ELEMS.  */
  359 } position_set;
  360 
  361 /* A state of the dfa consists of a set of positions, some flags,
  362    and the token value of the lowest-numbered position of the state that
  363    contains an END token.  */
  364 typedef struct
  365 {
  366   size_t hash;                  /* Hash of the positions of this state.  */
  367   position_set elems;           /* Positions this state could match.  */
  368   unsigned char context;        /* Context from previous state.  */
  369   unsigned short constraint;    /* Constraint for this state to accept.  */
  370   token first_end;              /* Token value of the first END in elems.  */
  371   position_set mbps;            /* Positions which can match multibyte
  372                                    characters or the follows, e.g., period.
  373                                    Used only if MB_CUR_MAX > 1.  */
  374   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
  375                                    negative if the state does not have
  376                                    ANYCHAR.  */
  377 } dfa_state;
  378 
  379 /* Maximum for any transition table count.  This should be at least 3,
  380    for the initial state setup.  */
  381 enum { MAX_TRCOUNT = 1024 };
  382 
  383 /* A bracket operator.
  384    e.g., [a-c], [[:alpha:]], etc.  */
  385 struct mb_char_classes
  386 {
  387   ptrdiff_t cset;
  388   bool invert;
  389   wchar_t *chars;               /* Normal characters.  */
  390   idx_t nchars;
  391   idx_t nchars_alloc;
  392 };
  393 
  394 struct regex_syntax
  395 {
  396   /* Syntax bits controlling the behavior of the lexical analyzer.  */
  397   reg_syntax_t syntax_bits;
  398   bool syntax_bits_set;
  399 
  400   /* Flag for case-folding letters into sets.  */
  401   bool case_fold;
  402 
  403   /* True if ^ and $ match only the start and end of data, and do not match
  404      end-of-line within data.  */
  405   bool anchor;
  406 
  407   /* End-of-line byte in data.  */
  408   unsigned char eolbyte;
  409 
  410   /* Cache of char-context values.  */
  411   char sbit[NOTCHAR];
  412 
  413   /* If never_trail[B], the byte B cannot be a non-initial byte in a
  414      multibyte character.  */
  415   bool never_trail[NOTCHAR];
  416 
  417   /* Set of characters considered letters.  */
  418   charclass letters;
  419 
  420   /* Set of characters that are newline.  */
  421   charclass newline;
  422 };
  423 
  424 /* Lexical analyzer.  All the dross that deals with the obnoxious
  425    GNU Regex syntax bits is located here.  The poor, suffering
  426    reader is referred to the GNU Regex documentation for the
  427    meaning of the @#%!@#%^!@ syntax bits.  */
  428 struct lexer_state
  429 {
  430   char const *ptr;  /* Pointer to next input character.  */
  431   idx_t left;       /* Number of characters remaining.  */
  432   token lasttok;    /* Previous token returned; initially END.  */
  433   idx_t parens;     /* Count of outstanding left parens.  */
  434   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
  435 
  436   /* Wide character representation of the current multibyte character,
  437      or WEOF if there was an encoding error.  Used only if
  438      MB_CUR_MAX > 1.  */
  439   wint_t wctok;
  440 
  441   /* The most recently analyzed multibyte bracket expression.  */
  442   struct mb_char_classes brack;
  443 
  444   /* We're separated from beginning or (, | only by zero-width characters.  */
  445   bool laststart;
  446 };
  447 
  448 /* Recursive descent parser for regular expressions.  */
  449 
  450 struct parser_state
  451 {
  452   token tok;               /* Lookahead token.  */
  453   idx_t depth;         /* Current depth of a hypothetical stack
  454                               holding deferred productions.  This is
  455                               used to determine the depth that will be
  456                               required of the real stack later on in
  457                               dfaanalyze.  */
  458 };
  459 
  460 /* A compiled regular expression.  */
  461 struct dfa
  462 {
  463   /* Fields filled by the scanner.  */
  464   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
  465   idx_t cindex;         /* Index for adding new charclasses.  */
  466   idx_t calloc;         /* Number of charclasses allocated.  */
  467   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
  468 
  469   /* Scanner state */
  470   struct lexer_state lex;
  471 
  472   /* Parser state */
  473   struct parser_state parse;
  474 
  475   /* Fields filled by the parser.  */
  476   token *tokens;                /* Postfix parse array.  */
  477   idx_t tindex;         /* Index for adding new tokens.  */
  478   idx_t talloc;         /* Number of tokens currently allocated.  */
  479   idx_t depth;          /* Depth required of an evaluation stack
  480                                    used for depth-first traversal of the
  481                                    parse tree.  */
  482   idx_t nleaves;        /* Number of leaves on the parse tree.  */
  483   idx_t nregexps;       /* Count of parallel regexps being built
  484                                    with dfaparse.  */
  485   bool fast;            /* The DFA is fast.  */
  486   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
  487   mbstate_t mbs;        /* Multibyte conversion state.  */
  488 
  489   /* The following are valid only if MB_CUR_MAX > 1.  */
  490 
  491   /* The value of multibyte_prop[i] is defined by following rule.
  492      if tokens[i] < NOTCHAR
  493      bit 0 : tokens[i] is the first byte of a character, including
  494      single-byte characters.
  495      bit 1 : tokens[i] is the last byte of a character, including
  496      single-byte characters.
  497 
  498      e.g.
  499      tokens
  500      = 'single_byte_a', 'multi_byte_A', single_byte_b'
  501      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
  502      multibyte_prop
  503      = 3     , 1               ,  0              ,  2              , 3
  504    */
  505   char *multibyte_prop;
  506 
  507   /* Fields filled by the superset.  */
  508   struct dfa *superset;             /* Hint of the dfa.  */
  509 
  510   /* Fields filled by the state builder.  */
  511   dfa_state *states;            /* States of the dfa.  */
  512   state_num sindex;             /* Index for adding new states.  */
  513   idx_t salloc;         /* Number of states currently allocated.  */
  514 
  515   /* Fields filled by the parse tree->NFA conversion.  */
  516   position_set *follows;        /* Array of follow sets, indexed by position
  517                                    index.  The follow of a position is the set
  518                                    of positions containing characters that
  519                                    could conceivably follow a character
  520                                    matching the given position in a string
  521                                    matching the regexp.  Allocated to the
  522                                    maximum possible position index.  */
  523   bool searchflag;      /* We are supposed to build a searching
  524                                    as opposed to an exact matcher.  A searching
  525                                    matcher finds the first and shortest string
  526                                    matching a regexp anywhere in the buffer,
  527                                    whereas an exact matcher finds the longest
  528                                    string matching, but anchored to the
  529                                    beginning of the buffer.  */
  530 
  531   /* Fields filled by dfaanalyze.  */
  532   int *constraints;             /* Array of union of accepting constraints
  533                                    in the follow of a position.  */
  534   int *separates;               /* Array of contexts on follow of a
  535                                    position.  */
  536 
  537   /* Fields filled by dfaexec.  */
  538   state_num tralloc;            /* Number of transition tables that have
  539                                    slots so far, not counting trans[-1] and
  540                                    trans[-2].  */
  541   int trcount;                  /* Number of transition tables that have
  542                                    been built, other than for initial
  543                                    states.  */
  544   int min_trcount;              /* Number of initial states.  Equivalently,
  545                                    the minimum state number for which trcount
  546                                    counts transitions.  */
  547   state_num **trans;            /* Transition tables for states that can
  548                                    never accept.  If the transitions for a
  549                                    state have not yet been computed, or the
  550                                    state could possibly accept, its entry in
  551                                    this table is NULL.  This points to two
  552                                    past the start of the allocated array,
  553                                    and trans[-1] and trans[-2] are always
  554                                    NULL.  */
  555   state_num **fails;            /* Transition tables after failing to accept
  556                                    on a state that potentially could do so.
  557                                    If trans[i] is non-null, fails[i] must
  558                                    be null.  */
  559   char *success;                /* Table of acceptance conditions used in
  560                                    dfaexec and computed in build_state.  */
  561   state_num *newlines;          /* Transitions on newlines.  The entry for a
  562                                    newline in any transition table is always
  563                                    -1 so we can count lines without wasting
  564                                    too many cycles.  The transition for a
  565                                    newline is stored separately and handled
  566                                    as a special case.  Newline is also used
  567                                    as a sentinel at the end of the buffer.  */
  568   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
  569                                    context in multibyte locales, in which we
  570                                    do not distinguish between their contexts,
  571                                    as not supported word.  */
  572   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
  573   state_num **mb_trans;         /* Transition tables for states with
  574                                    ANYCHAR.  */
  575   state_num mb_trcount;         /* Number of transition tables for states with
  576                                    ANYCHAR that have actually been built.  */
  577 
  578   /* Syntax configuration.  This is near the end so that dfacopysyntax
  579      can memset up to here.  */
  580   struct regex_syntax syntax;
  581 
  582   /* Information derived from the locale.  This is at the end so that
  583      a quick memset need not clear it specially.  */
  584 
  585   /* dfaexec implementation.  */
  586   char *(*dfaexec) (struct dfa *, char const *, char *,
  587                     bool, ptrdiff_t *, bool *);
  588 
  589   /* Other cached information derived from the locale.  */
  590   struct localeinfo localeinfo;
  591 };
  592 
  593 /* User access to dfa internals.  */
  594 
  595 /* S could possibly be an accepting state of R.  */
  596 static bool
  597 accepting (state_num s, struct dfa const *r)
  598 {
  599   return r->states[s].constraint != 0;
  600 }
  601 
  602 /* STATE accepts in the specified context.  */
  603 static bool
  604 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
  605 {
  606   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
  607 }
  608 
  609 static void regexp (struct dfa *dfa);
  610 
  611 /* Store into *PWC the result of converting the leading bytes of the
  612    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
  613    and updating the conversion state in *D.  On conversion error,
  614    convert just a single byte, to WEOF.  Return the number of bytes
  615    converted.
  616 
  617    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
  618 
  619    * PWC points to wint_t, not to wchar_t.
  620    * The last arg is a dfa *D instead of merely a multibyte conversion
  621      state D->mbs.
  622    * N must be at least 1.
  623    * S[N - 1] must be a sentinel byte.
  624    * Shift encodings are not supported.
  625    * The return value is always in the range 1..N.
  626    * D->mbs is always valid afterwards.
  627    * *PWC is always set to something.  */
  628 static int
  629 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
  630 {
  631   unsigned char uc = s[0];
  632   wint_t wc = d->localeinfo.sbctowc[uc];
  633 
  634   if (wc == WEOF)
  635     {
  636       wchar_t wch;
  637       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
  638       if (0 < nbytes && nbytes < (size_t) -2)
  639         {
  640           *pwc = wch;
  641           return nbytes;
  642         }
  643       memset (&d->mbs, 0, sizeof d->mbs);
  644     }
  645 
  646   *pwc = wc;
  647   return 1;
  648 }
  649 
  650 #ifdef DEBUG
  651 
  652 static void
  653 prtok (token t)
  654 {
  655   if (t <= END)
  656     fprintf (stderr, "END");
  657   else if (0 <= t && t < NOTCHAR)
  658     {
  659       unsigned int ch = t;
  660       fprintf (stderr, "0x%02x", ch);
  661     }
  662   else
  663     {
  664       char const *s;
  665       switch (t)
  666         {
  667         case BEG:
  668           s = "BEG";
  669           break;
  670         case EMPTY:
  671           s = "EMPTY";
  672           break;
  673         case BACKREF:
  674           s = "BACKREF";
  675           break;
  676         case BEGLINE:
  677           s = "BEGLINE";
  678           break;
  679         case ENDLINE:
  680           s = "ENDLINE";
  681           break;
  682         case BEGWORD:
  683           s = "BEGWORD";
  684           break;
  685         case ENDWORD:
  686           s = "ENDWORD";
  687           break;
  688         case LIMWORD:
  689           s = "LIMWORD";
  690           break;
  691         case NOTLIMWORD:
  692           s = "NOTLIMWORD";
  693           break;
  694         case QMARK:
  695           s = "QMARK";
  696           break;
  697         case STAR:
  698           s = "STAR";
  699           break;
  700         case PLUS:
  701           s = "PLUS";
  702           break;
  703         case CAT:
  704           s = "CAT";
  705           break;
  706         case OR:
  707           s = "OR";
  708           break;
  709         case LPAREN:
  710           s = "LPAREN";
  711           break;
  712         case RPAREN:
  713           s = "RPAREN";
  714           break;
  715         case ANYCHAR:
  716           s = "ANYCHAR";
  717           break;
  718         case MBCSET:
  719           s = "MBCSET";
  720           break;
  721         default:
  722           s = "CSET";
  723           break;
  724         }
  725       fprintf (stderr, "%s", s);
  726     }
  727 }
  728 #endif /* DEBUG */
  729 
  730 /* Stuff pertaining to charclasses.  */
  731 
  732 static bool
  733 tstbit (unsigned int b, charclass const *c)
  734 {
  735   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
  736 }
  737 
  738 static void
  739 setbit (unsigned int b, charclass *c)
  740 {
  741   charclass_word one = 1;
  742   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
  743 }
  744 
  745 static void
  746 clrbit (unsigned int b, charclass *c)
  747 {
  748   charclass_word one = 1;
  749   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
  750 }
  751 
  752 static void
  753 zeroset (charclass *s)
  754 {
  755   memset (s, 0, sizeof *s);
  756 }
  757 
  758 static void
  759 fillset (charclass *s)
  760 {
  761   for (int i = 0; i < CHARCLASS_WORDS; i++)
  762     s->w[i] = CHARCLASS_WORD_MASK;
  763 }
  764 
  765 static void
  766 notset (charclass *s)
  767 {
  768   for (int i = 0; i < CHARCLASS_WORDS; ++i)
  769     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
  770 }
  771 
  772 static bool
  773 equal (charclass const *s1, charclass const *s2)
  774 {
  775   charclass_word w = 0;
  776   for (int i = 0; i < CHARCLASS_WORDS; i++)
  777     w |= s1->w[i] ^ s2->w[i];
  778   return w == 0;
  779 }
  780 
  781 static bool
  782 emptyset (charclass const *s)
  783 {
  784   charclass_word w = 0;
  785   for (int i = 0; i < CHARCLASS_WORDS; i++)
  786     w |= s->w[i];
  787   return w == 0;
  788 }
  789 
  790 /* Grow PA, which points to an array of *NITEMS items, and return the
  791    location of the reallocated array, updating *NITEMS to reflect its
  792    new size.  The new array will contain at least NITEMS_INCR_MIN more
  793    items, but will not contain more than NITEMS_MAX items total.
  794    ITEM_SIZE is the size of each item, in bytes.
  795 
  796    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
  797    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
  798    infinity.
  799 
  800    If PA is null, then allocate a new array instead of reallocating
  801    the old one.
  802 
  803    Thus, to grow an array A without saving its old contents, do
  804    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
  805 
  806 static void *
  807 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
  808          ptrdiff_t nitems_max, idx_t item_size)
  809 {
  810   idx_t n0 = *nitems;
  811 
  812   /* The approximate size to use for initial small allocation
  813      requests.  This is the largest "small" request for the GNU C
  814      library malloc.  */
  815   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
  816 
  817   /* If the array is tiny, grow it to about (but no greater than)
  818      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
  819      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
  820      NITEMS_MAX, and what the C language can represent safely.  */
  821 
  822   idx_t n, nbytes;
  823   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
  824     n = IDX_MAX;
  825   if (0 <= nitems_max && nitems_max < n)
  826     n = nitems_max;
  827 
  828   idx_t adjusted_nbytes
  829     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
  830        ? MIN (IDX_MAX, SIZE_MAX)
  831        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
  832   if (adjusted_nbytes)
  833     {
  834       n = adjusted_nbytes / item_size;
  835       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
  836     }
  837 
  838   if (! pa)
  839     *nitems = 0;
  840   if (n - n0 < nitems_incr_min
  841       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
  842           || (0 <= nitems_max && nitems_max < n)
  843           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
  844     xalloc_die ();
  845   pa = xrealloc (pa, nbytes);
  846   *nitems = n;
  847   return pa;
  848 }
  849 
  850 /* Ensure that the array addressed by PA holds at least I + 1 items.
  851    Either return PA, or reallocate the array and return its new address.
  852    Although PA may be null, the returned value is never null.
  853 
  854    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
  855    is updated on reallocation.  If PA is null, *NITEMS must be zero.
  856    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
  857    ITEM_SIZE is the size of one item; it must be positive.
  858    Avoid O(N**2) behavior on arrays growing linearly.  */
  859 static void *
  860 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
  861                ptrdiff_t nitems_max, idx_t item_size)
  862 {
  863   if (i < *nitems)
  864     return pa;
  865   return xpalloc (pa, nitems, 1, nitems_max, item_size);
  866 }
  867 
  868 /* In DFA D, find the index of charclass S, or allocate a new one.  */
  869 static idx_t
  870 charclass_index (struct dfa *d, charclass const *s)
  871 {
  872   idx_t i;
  873 
  874   for (i = 0; i < d->cindex; ++i)
  875     if (equal (s, &d->charclasses[i]))
  876       return i;
  877   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
  878                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
  879   ++d->cindex;
  880   d->charclasses[i] = *s;
  881   return i;
  882 }
  883 
  884 static bool
  885 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
  886 {
  887   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
  888 }
  889 
  890 static int
  891 char_context (struct dfa const *dfa, unsigned char c)
  892 {
  893   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
  894     return CTX_NEWLINE;
  895   if (unibyte_word_constituent (dfa, c))
  896     return CTX_LETTER;
  897   return CTX_NONE;
  898 }
  899 
  900 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
  901    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
  902    this may happen when folding case in weird Turkish locales where
  903    dotless i/dotted I are not included in the chosen character set.
  904    Return whether a bit was set in the charclass.  */
  905 static bool
  906 setbit_wc (wint_t wc, charclass *c)
  907 {
  908   int b = wctob (wc);
  909   if (b < 0)
  910     return false;
  911 
  912   setbit (b, c);
  913   return true;
  914 }
  915 
  916 /* Set a bit for B and its case variants in the charclass C.
  917    MB_CUR_MAX must be 1.  */
  918 static void
  919 setbit_case_fold_c (int b, charclass *c)
  920 {
  921   int ub = toupper (b);
  922   for (int i = 0; i < NOTCHAR; i++)
  923     if (toupper (i) == ub)
  924       setbit (i, c);
  925 }
  926 
  927 /* Fetch the next lexical input character from the pattern.  There
  928    must at least one byte of pattern input.  Set DFA->lex.wctok to the
  929    value of the character or to WEOF depending on whether the input is
  930    a valid multibyte character (possibly of length 1).  Then return
  931    the next input byte value, except return EOF if the input is a
  932    multibyte character of length greater than 1.  */
  933 static int
  934 fetch_wc (struct dfa *dfa)
  935 {
  936   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
  937                              dfa);
  938   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
  939   dfa->lex.ptr += nbytes;
  940   dfa->lex.left -= nbytes;
  941   return c;
  942 }
  943 
  944 /* If there is no more input, report an error about unbalanced brackets.
  945    Otherwise, behave as with fetch_wc (DFA).  */
  946 static int
  947 bracket_fetch_wc (struct dfa *dfa)
  948 {
  949   if (! dfa->lex.left)
  950     dfaerror (_("unbalanced ["));
  951   return fetch_wc (dfa);
  952 }
  953 
  954 typedef int predicate (int);
  955 
  956 /* The following list maps the names of the Posix named character classes
  957    to predicate functions that determine whether a given character is in
  958    the class.  The leading [ has already been eaten by the lexical
  959    analyzer.  */
  960 struct dfa_ctype
  961 {
  962   const char *name;
  963   predicate *func;
  964   bool single_byte_only;
  965 };
  966 
  967 static const struct dfa_ctype prednames[] = {
  968   {"alpha", isalpha, false},
  969   {"upper", isupper, false},
  970   {"lower", islower, false},
  971   {"digit", isdigit, true},
  972   {"xdigit", isxdigit, false},
  973   {"space", isspace, false},
  974   {"punct", ispunct, false},
  975   {"alnum", isalnum, false},
  976   {"print", isprint, false},
  977   {"graph", isgraph, false},
  978   {"cntrl", iscntrl, false},
  979   {"blank", isblank, false},
  980   {NULL, NULL, false}
  981 };
  982 
  983 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
  984 find_pred (const char *str)
  985 {
  986   for (int i = 0; prednames[i].name; i++)
  987     if (streq (str, prednames[i].name))
  988       return &prednames[i];
  989   return NULL;
  990 }
  991 
  992 /* Parse a bracket expression, which possibly includes multibyte
  993    characters.  */
  994 static token
  995 parse_bracket_exp (struct dfa *dfa)
  996 {
  997   /* This is a bracket expression that dfaexec is known to
  998      process correctly.  */
  999   bool known_bracket_exp = true;
 1000 
 1001   /* Used to warn about [:space:].
 1002      Bit 0 = first character is a colon.
 1003      Bit 1 = last character is a colon.
 1004      Bit 2 = includes any other character but a colon.
 1005      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 1006   int colon_warning_state;
 1007 
 1008   dfa->lex.brack.nchars = 0;
 1009   charclass ccl;
 1010   zeroset (&ccl);
 1011   int c = bracket_fetch_wc (dfa);
 1012   bool invert = c == '^';
 1013   if (invert)
 1014     {
 1015       c = bracket_fetch_wc (dfa);
 1016       known_bracket_exp = dfa->localeinfo.simple;
 1017     }
 1018   wint_t wc = dfa->lex.wctok;
 1019   int c1;
 1020   wint_t wc1;
 1021   colon_warning_state = (c == ':');
 1022   do
 1023     {
 1024       c1 = NOTCHAR; /* Mark c1 as not initialized.  */
 1025       colon_warning_state &= ~2;
 1026 
 1027       /* Note that if we're looking at some other [:...:] construct,
 1028          we just treat it as a bunch of ordinary characters.  We can do
 1029          this because we assume regex has checked for syntax errors before
 1030          dfa is ever called.  */
 1031       if (c == '[')
 1032         {
 1033           c1 = bracket_fetch_wc (dfa);
 1034           wc1 = dfa->lex.wctok;
 1035 
 1036           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 1037               || c1 == '.' || c1 == '=')
 1038             {
 1039               enum { MAX_BRACKET_STRING_LEN = 32 };
 1040               char str[MAX_BRACKET_STRING_LEN + 1];
 1041               int len = 0;
 1042               for (;;)
 1043                 {
 1044                   c = bracket_fetch_wc (dfa);
 1045                   if (dfa->lex.left == 0
 1046                       || (c == c1 && dfa->lex.ptr[0] == ']'))
 1047                     break;
 1048                   if (len < MAX_BRACKET_STRING_LEN)
 1049                     str[len++] = c;
 1050                   else
 1051                     /* This is in any case an invalid class name.  */
 1052                     str[0] = '\0';
 1053                 }
 1054               str[len] = '\0';
 1055 
 1056               /* Fetch bracket.  */
 1057               c = bracket_fetch_wc (dfa);
 1058               wc = dfa->lex.wctok;
 1059               if (c1 == ':')
 1060                 /* Build character class.  POSIX allows character
 1061                    classes to match multicharacter collating elements,
 1062                    but the regex code does not support that, so do not
 1063                    worry about that possibility.  */
 1064                 {
 1065                   char const *class
 1066                     = (dfa->syntax.case_fold && (streq (str, "upper")
 1067                                                  || streq (str, "lower"))
 1068                        ? "alpha" : str);
 1069                   const struct dfa_ctype *pred = find_pred (class);
 1070                   if (!pred)
 1071                     dfaerror (_("invalid character class"));
 1072 
 1073                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
 1074                     known_bracket_exp = false;
 1075                   else
 1076                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
 1077                       if (pred->func (c2))
 1078                         setbit (c2, &ccl);
 1079                 }
 1080               else
 1081                 known_bracket_exp = false;
 1082 
 1083               colon_warning_state |= 8;
 1084 
 1085               /* Fetch new lookahead character.  */
 1086               c1 = bracket_fetch_wc (dfa);
 1087               wc1 = dfa->lex.wctok;
 1088               continue;
 1089             }
 1090 
 1091           /* We treat '[' as a normal character here.  c/c1/wc/wc1
 1092              are already set up.  */
 1093         }
 1094 
 1095       if (c == '\\'
 1096           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 1097         {
 1098           c = bracket_fetch_wc (dfa);
 1099           wc = dfa->lex.wctok;
 1100         }
 1101 
 1102       if (c1 == NOTCHAR)
 1103         {
 1104           c1 = bracket_fetch_wc (dfa);
 1105           wc1 = dfa->lex.wctok;
 1106         }
 1107 
 1108       if (c1 == '-')
 1109         /* build range characters.  */
 1110         {
 1111           int c2 = bracket_fetch_wc (dfa);
 1112           wint_t wc2 = dfa->lex.wctok;
 1113 
 1114           /* A bracket expression like [a-[.aa.]] matches an unknown set.
 1115              Treat it like [-a[.aa.]] while parsing it, and
 1116              remember that the set is unknown.  */
 1117           if (c2 == '[' && dfa->lex.ptr[0] == '.')
 1118             {
 1119               known_bracket_exp = false;
 1120               c2 = ']';
 1121             }
 1122 
 1123           if (c2 == ']')
 1124             {
 1125               /* In the case [x-], the - is an ordinary hyphen,
 1126                  which is left in c1, the lookahead character.  */
 1127               dfa->lex.ptr--;
 1128               dfa->lex.left++;
 1129             }
 1130           else
 1131             {
 1132               if (c2 == '\\' && (dfa->syntax.syntax_bits
 1133                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
 1134                 {
 1135                   c2 = bracket_fetch_wc (dfa);
 1136                   wc2 = dfa->lex.wctok;
 1137                 }
 1138 
 1139               colon_warning_state |= 8;
 1140               c1 = bracket_fetch_wc (dfa);
 1141               wc1 = dfa->lex.wctok;
 1142 
 1143               /* Treat [x-y] as a range if x != y.  */
 1144               if (wc != wc2 || wc == WEOF)
 1145                 {
 1146                   if (dfa->localeinfo.simple
 1147                       || (isasciidigit (c) & isasciidigit (c2)))
 1148                     {
 1149                       for (int ci = c; ci <= c2; ci++)
 1150                         if (dfa->syntax.case_fold && isalpha (ci))
 1151                           setbit_case_fold_c (ci, &ccl);
 1152                         else
 1153                           setbit (ci, &ccl);
 1154                     }
 1155                   else
 1156                     known_bracket_exp = false;
 1157 
 1158                   continue;
 1159                 }
 1160             }
 1161         }
 1162 
 1163       colon_warning_state |= (c == ':') ? 2 : 4;
 1164 
 1165       if (!dfa->localeinfo.multibyte)
 1166         {
 1167           if (dfa->syntax.case_fold && isalpha (c))
 1168             setbit_case_fold_c (c, &ccl);
 1169           else
 1170             setbit (c, &ccl);
 1171           continue;
 1172         }
 1173 
 1174       if (wc == WEOF)
 1175         known_bracket_exp = false;
 1176       else
 1177         {
 1178           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
 1179           int n = (dfa->syntax.case_fold
 1180                    ? case_folded_counterparts (wc, folded + 1) + 1
 1181                    : 1);
 1182           folded[0] = wc;
 1183           for (int i = 0; i < n; i++)
 1184             if (!setbit_wc (folded[i], &ccl))
 1185               {
 1186                 dfa->lex.brack.chars
 1187                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
 1188                                    &dfa->lex.brack.nchars_alloc, -1,
 1189                                    sizeof *dfa->lex.brack.chars);
 1190                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
 1191               }
 1192         }
 1193     }
 1194   while ((wc = wc1, (c = c1) != ']'));
 1195 
 1196   if (colon_warning_state == 7)
 1197     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
 1198 
 1199   if (! known_bracket_exp)
 1200     return BACKREF;
 1201 
 1202   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
 1203     {
 1204       dfa->lex.brack.invert = invert;
 1205       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
 1206       return MBCSET;
 1207     }
 1208 
 1209   if (invert)
 1210     {
 1211       notset (&ccl);
 1212       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
 1213         clrbit ('\n', &ccl);
 1214     }
 1215 
 1216   return CSET + charclass_index (dfa, &ccl);
 1217 }
 1218 
 1219 struct lexptr
 1220 {
 1221   char const *ptr;
 1222   idx_t left;
 1223 };
 1224 
 1225 static void
 1226 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
 1227 {
 1228   ls->ptr = dfa->lex.ptr;
 1229   ls->left = dfa->lex.left;
 1230   dfa->lex.ptr = s;
 1231   dfa->lex.left = strlen (s);
 1232 }
 1233 
 1234 static void
 1235 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
 1236 {
 1237   dfa->lex.ptr = ls->ptr;
 1238   dfa->lex.left = ls->left;
 1239 }
 1240 
 1241 static token
 1242 lex (struct dfa *dfa)
 1243 {
 1244   bool backslash = false;
 1245 
 1246   /* Basic plan: We fetch a character.  If it's a backslash,
 1247      we set the backslash flag and go through the loop again.
 1248      On the plus side, this avoids having a duplicate of the
 1249      main switch inside the backslash case.  On the minus side,
 1250      it means that just about every case begins with
 1251      "if (backslash) ...".  */
 1252   for (int i = 0; i < 2; ++i)
 1253     {
 1254       if (! dfa->lex.left)
 1255         return dfa->lex.lasttok = END;
 1256       int c = fetch_wc (dfa);
 1257 
 1258       switch (c)
 1259         {
 1260         case '\\':
 1261           if (backslash)
 1262             goto normal_char;
 1263           if (dfa->lex.left == 0)
 1264             dfaerror (_("unfinished \\ escape"));
 1265           backslash = true;
 1266           break;
 1267 
 1268         case '^':
 1269           if (backslash)
 1270             goto normal_char;
 1271           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 1272               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
 1273               || dfa->lex.lasttok == OR)
 1274             return dfa->lex.lasttok = BEGLINE;
 1275           goto normal_char;
 1276 
 1277         case '$':
 1278           if (backslash)
 1279             goto normal_char;
 1280           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 1281               || dfa->lex.left == 0
 1282               || ((dfa->lex.left
 1283                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
 1284                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
 1285                                    & (dfa->lex.ptr[0] == '\\')]
 1286                       == ')'))
 1287               || ((dfa->lex.left
 1288                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
 1289                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
 1290                                    & (dfa->lex.ptr[0] == '\\')]
 1291                       == '|'))
 1292               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
 1293                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
 1294             return dfa->lex.lasttok = ENDLINE;
 1295           goto normal_char;
 1296 
 1297         case '1':
 1298         case '2':
 1299         case '3':
 1300         case '4':
 1301         case '5':
 1302         case '6':
 1303         case '7':
 1304         case '8':
 1305         case '9':
 1306           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
 1307             {
 1308               dfa->lex.laststart = false;
 1309               return dfa->lex.lasttok = BACKREF;
 1310             }
 1311           goto normal_char;
 1312 
 1313         case '`':
 1314           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1315             {
 1316               /* FIXME: should be beginning of string */
 1317               return dfa->lex.lasttok = BEGLINE;
 1318             }
 1319           goto normal_char;
 1320 
 1321         case '\'':
 1322           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1323             {
 1324               /* FIXME: should be end of string */
 1325               return dfa->lex.lasttok = ENDLINE;
 1326             }
 1327           goto normal_char;
 1328 
 1329         case '<':
 1330           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1331             return dfa->lex.lasttok = BEGWORD;
 1332           goto normal_char;
 1333 
 1334         case '>':
 1335           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1336             return dfa->lex.lasttok = ENDWORD;
 1337           goto normal_char;
 1338 
 1339         case 'b':
 1340           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1341             return dfa->lex.lasttok = LIMWORD;
 1342           goto normal_char;
 1343 
 1344         case 'B':
 1345           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1346             return dfa->lex.lasttok = NOTLIMWORD;
 1347           goto normal_char;
 1348 
 1349         case '?':
 1350           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
 1351             goto normal_char;
 1352           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
 1353             goto normal_char;
 1354           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
 1355               && dfa->lex.laststart)
 1356             goto normal_char;
 1357           return dfa->lex.lasttok = QMARK;
 1358 
 1359         case '*':
 1360           if (backslash)
 1361             goto normal_char;
 1362           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
 1363               && dfa->lex.laststart)
 1364             goto normal_char;
 1365           return dfa->lex.lasttok = STAR;
 1366 
 1367         case '+':
 1368           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
 1369             goto normal_char;
 1370           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
 1371             goto normal_char;
 1372           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
 1373               && dfa->lex.laststart)
 1374             goto normal_char;
 1375           return dfa->lex.lasttok = PLUS;
 1376 
 1377         case '{':
 1378           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
 1379             goto normal_char;
 1380           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
 1381             goto normal_char;
 1382           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
 1383               && dfa->lex.laststart)
 1384             goto normal_char;
 1385 
 1386           /* Cases:
 1387              {M} - exact count
 1388              {M,} - minimum count, maximum is infinity
 1389              {,N} - 0 through N
 1390              {,} - 0 to infinity (same as '*')
 1391              {M,N} - M through N */
 1392           {
 1393             char const *p = dfa->lex.ptr;
 1394             char const *lim = p + dfa->lex.left;
 1395             dfa->lex.minrep = dfa->lex.maxrep = -1;
 1396             for (; p != lim && isasciidigit (*p); p++)
 1397               dfa->lex.minrep = (dfa->lex.minrep < 0
 1398                                  ? *p - '0'
 1399                                  : MIN (RE_DUP_MAX + 1,
 1400                                         dfa->lex.minrep * 10 + *p - '0'));
 1401             if (p != lim)
 1402               {
 1403                 if (*p != ',')
 1404                   dfa->lex.maxrep = dfa->lex.minrep;
 1405                 else
 1406                   {
 1407                     if (dfa->lex.minrep < 0)
 1408                       dfa->lex.minrep = 0;
 1409                     while (++p != lim && isasciidigit (*p))
 1410                       dfa->lex.maxrep
 1411                         = (dfa->lex.maxrep < 0
 1412                            ? *p - '0'
 1413                            : MIN (RE_DUP_MAX + 1,
 1414                                   dfa->lex.maxrep * 10 + *p - '0'));
 1415                   }
 1416               }
 1417             if (! ((! backslash || (p != lim && *p++ == '\\'))
 1418                    && p != lim && *p++ == '}'
 1419                    && 0 <= dfa->lex.minrep
 1420                    && (dfa->lex.maxrep < 0
 1421                        || dfa->lex.minrep <= dfa->lex.maxrep)))
 1422               {
 1423                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
 1424                   goto normal_char;
 1425                 dfaerror (_("invalid content of \\{\\}"));
 1426               }
 1427             if (RE_DUP_MAX < dfa->lex.maxrep)
 1428               dfaerror (_("regular expression too big"));
 1429             dfa->lex.ptr = p;
 1430             dfa->lex.left = lim - p;
 1431           }
 1432           dfa->lex.laststart = false;
 1433           return dfa->lex.lasttok = REPMN;
 1434 
 1435         case '|':
 1436           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
 1437             goto normal_char;
 1438           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
 1439             goto normal_char;
 1440           dfa->lex.laststart = true;
 1441           return dfa->lex.lasttok = OR;
 1442 
 1443         case '\n':
 1444           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
 1445               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
 1446             goto normal_char;
 1447           dfa->lex.laststart = true;
 1448           return dfa->lex.lasttok = OR;
 1449 
 1450         case '(':
 1451           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
 1452             goto normal_char;
 1453           dfa->lex.parens++;
 1454           dfa->lex.laststart = true;
 1455           return dfa->lex.lasttok = LPAREN;
 1456 
 1457         case ')':
 1458           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
 1459             goto normal_char;
 1460           if (dfa->lex.parens == 0
 1461               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
 1462             goto normal_char;
 1463           dfa->lex.parens--;
 1464           dfa->lex.laststart = false;
 1465           return dfa->lex.lasttok = RPAREN;
 1466 
 1467         case '.':
 1468           if (backslash)
 1469             goto normal_char;
 1470           if (dfa->canychar < 0)
 1471             {
 1472               charclass ccl;
 1473               fillset (&ccl);
 1474               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
 1475                 clrbit ('\n', &ccl);
 1476               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
 1477                 clrbit ('\0', &ccl);
 1478               if (dfa->localeinfo.multibyte)
 1479                 for (int c2 = 0; c2 < NOTCHAR; c2++)
 1480                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
 1481                     clrbit (c2, &ccl);
 1482               dfa->canychar = charclass_index (dfa, &ccl);
 1483             }
 1484           dfa->lex.laststart = false;
 1485           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
 1486                                      ? ANYCHAR
 1487                                      : CSET + dfa->canychar);
 1488 
 1489         case 's':
 1490         case 'S':
 1491           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1492             goto normal_char;
 1493           if (!dfa->localeinfo.multibyte)
 1494             {
 1495               charclass ccl;
 1496               zeroset (&ccl);
 1497               for (int c2 = 0; c2 < NOTCHAR; ++c2)
 1498                 if (isspace (c2))
 1499                   setbit (c2, &ccl);
 1500               if (c == 'S')
 1501                 notset (&ccl);
 1502               dfa->lex.laststart = false;
 1503               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
 1504             }
 1505 
 1506           /* FIXME: see if optimizing this, as is done with ANYCHAR and
 1507              add_utf8_anychar, makes sense.  */
 1508 
 1509           /* \s and \S are documented to be equivalent to [[:space:]] and
 1510              [^[:space:]] respectively, so tell the lexer to process those
 1511              strings, each minus its "already processed" '['.  */
 1512           {
 1513             struct lexptr ls;
 1514             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
 1515             dfa->lex.lasttok = parse_bracket_exp (dfa);
 1516             pop_lex_state (dfa, &ls);
 1517           }
 1518 
 1519           dfa->lex.laststart = false;
 1520           return dfa->lex.lasttok;
 1521 
 1522         case 'w':
 1523         case 'W':
 1524           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
 1525             goto normal_char;
 1526 
 1527           if (!dfa->localeinfo.multibyte)
 1528             {
 1529               charclass ccl;
 1530               zeroset (&ccl);
 1531               for (int c2 = 0; c2 < NOTCHAR; ++c2)
 1532                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
 1533                   setbit (c2, &ccl);
 1534               if (c == 'W')
 1535                 notset (&ccl);
 1536               dfa->lex.laststart = false;
 1537               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
 1538             }
 1539 
 1540           /* FIXME: see if optimizing this, as is done with ANYCHAR and
 1541              add_utf8_anychar, makes sense.  */
 1542 
 1543           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
 1544              [^_[:alnum:]] respectively, so tell the lexer to process those
 1545              strings, each minus its "already processed" '['.  */
 1546           {
 1547             struct lexptr ls;
 1548             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
 1549             dfa->lex.lasttok = parse_bracket_exp (dfa);
 1550             pop_lex_state (dfa, &ls);
 1551           }
 1552 
 1553           dfa->lex.laststart = false;
 1554           return dfa->lex.lasttok;
 1555 
 1556         case '[':
 1557           if (backslash)
 1558             goto normal_char;
 1559           dfa->lex.laststart = false;
 1560           return dfa->lex.lasttok = parse_bracket_exp (dfa);
 1561 
 1562         default:
 1563         normal_char:
 1564           dfa->lex.laststart = false;
 1565           /* For multibyte character sets, folding is done in atom.  Always
 1566              return WCHAR.  */
 1567           if (dfa->localeinfo.multibyte)
 1568             return dfa->lex.lasttok = WCHAR;
 1569 
 1570           if (dfa->syntax.case_fold && isalpha (c))
 1571             {
 1572               charclass ccl;
 1573               zeroset (&ccl);
 1574               setbit_case_fold_c (c, &ccl);
 1575               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
 1576             }
 1577 
 1578           return dfa->lex.lasttok = c;
 1579         }
 1580     }
 1581 
 1582   /* The above loop should consume at most a backslash
 1583      and some other character.  */
 1584   abort ();
 1585   return END;                   /* keeps pedantic compilers happy.  */
 1586 }
 1587 
 1588 static void
 1589 addtok_mb (struct dfa *dfa, token t, char mbprop)
 1590 {
 1591   if (dfa->talloc == dfa->tindex)
 1592     {
 1593       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
 1594                              sizeof *dfa->tokens);
 1595       if (dfa->localeinfo.multibyte)
 1596         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
 1597                                          sizeof *dfa->multibyte_prop);
 1598     }
 1599   if (dfa->localeinfo.multibyte)
 1600     dfa->multibyte_prop[dfa->tindex] = mbprop;
 1601   dfa->tokens[dfa->tindex++] = t;
 1602 
 1603   switch (t)
 1604     {
 1605     case QMARK:
 1606     case STAR:
 1607     case PLUS:
 1608       break;
 1609 
 1610     case CAT:
 1611     case OR:
 1612       dfa->parse.depth--;
 1613       break;
 1614 
 1615     case BACKREF:
 1616       dfa->fast = false;
 1617       FALLTHROUGH;
 1618     default:
 1619       dfa->nleaves++;
 1620       FALLTHROUGH;
 1621     case EMPTY:
 1622       dfa->parse.depth++;
 1623       break;
 1624     }
 1625   if (dfa->parse.depth > dfa->depth)
 1626     dfa->depth = dfa->parse.depth;
 1627 }
 1628 
 1629 static void addtok_wc (struct dfa *dfa, wint_t wc);
 1630 
 1631 /* Add the given token to the parse tree, maintaining the depth count and
 1632    updating the maximum depth if necessary.  */
 1633 static void
 1634 addtok (struct dfa *dfa, token t)
 1635 {
 1636   if (dfa->localeinfo.multibyte && t == MBCSET)
 1637     {
 1638       bool need_or = false;
 1639 
 1640       /* Extract wide characters into alternations for better performance.
 1641          This does not require UTF-8.  */
 1642       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
 1643         {
 1644           addtok_wc (dfa, dfa->lex.brack.chars[i]);
 1645           if (need_or)
 1646             addtok (dfa, OR);
 1647           need_or = true;
 1648         }
 1649       dfa->lex.brack.nchars = 0;
 1650 
 1651       /* Wide characters have been handled above, so it is possible
 1652          that the set is empty now.  Do nothing in that case.  */
 1653       if (dfa->lex.brack.cset != -1)
 1654         {
 1655           addtok (dfa, CSET + dfa->lex.brack.cset);
 1656           if (need_or)
 1657             addtok (dfa, OR);
 1658         }
 1659     }
 1660   else
 1661     {
 1662       addtok_mb (dfa, t, 3);
 1663     }
 1664 }
 1665 
 1666 /* We treat a multibyte character as a single atom, so that DFA
 1667    can treat a multibyte character as a single expression.
 1668 
 1669    e.g., we construct the following tree from "<mb1><mb2>".
 1670    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
 1671    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
 1672 static void
 1673 addtok_wc (struct dfa *dfa, wint_t wc)
 1674 {
 1675   unsigned char buf[MB_LEN_MAX];
 1676   mbstate_t s = { 0 };
 1677   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
 1678   int buflen;
 1679 
 1680   if (stored_bytes != (size_t) -1)
 1681     buflen = stored_bytes;
 1682   else
 1683     {
 1684       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
 1685          the addtok_mb call altogether can corrupt the heap.  */
 1686       buflen = 1;
 1687       buf[0] = 0;
 1688     }
 1689 
 1690   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
 1691   for (int i = 1; i < buflen; i++)
 1692     {
 1693       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
 1694       addtok (dfa, CAT);
 1695     }
 1696 }
 1697 
 1698 static void
 1699 add_utf8_anychar (struct dfa *dfa)
 1700 {
 1701   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
 1702      UTF-8 byte sequence has been defined as follows:
 1703 
 1704      ([\x00-\x7f]
 1705      |[\xc2-\xdf][\x80-\xbf]
 1706      |[\xe0][\xa0-\xbf][\x80-\xbf]
 1707      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
 1708      |[\xed][\x80-\x9f][\x80-\xbf]
 1709      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
 1710      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
 1711      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
 1712 
 1713      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
 1714      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
 1715      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
 1716      H = [\x80-\x9f], I = [\xf0],
 1717      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
 1718 
 1719      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
 1720 
 1721   /* Mnemonics for classes containing two or more bytes.  */
 1722   enum { A, B, C, E, F, H, J, K, M };
 1723 
 1724   /* Mnemonics for single-byte tokens.  */
 1725   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
 1726 
 1727   static charclass const utf8_classes[] = {
 1728     /* A. 00-7f: 1-byte sequence.  */
 1729     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
 1730 
 1731     /* B. c2-df: 1st byte of a 2-byte sequence.  */
 1732     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
 1733 
 1734     /* C. 80-bf: non-leading bytes.  */
 1735     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
 1736 
 1737     /* D. e0 (just a token).  */
 1738 
 1739     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
 1740     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
 1741 
 1742     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
 1743     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
 1744 
 1745     /* G. ed (just a token).  */
 1746 
 1747     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
 1748     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
 1749 
 1750     /* I. f0 (just a token).  */
 1751 
 1752     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
 1753     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
 1754 
 1755     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
 1756     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
 1757 
 1758     /* L. f4 (just a token).  */
 1759 
 1760     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
 1761     CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
 1762   };
 1763 
 1764   /* Define the character classes that are needed below.  */
 1765   if (dfa->utf8_anychar_classes[0] == 0)
 1766     {
 1767       charclass c = utf8_classes[0];
 1768       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
 1769         clrbit ('\n', &c);
 1770       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
 1771         clrbit ('\0', &c);
 1772       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
 1773 
 1774       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
 1775         dfa->utf8_anychar_classes[i]
 1776           = CSET + charclass_index (dfa, &utf8_classes[i]);
 1777     }
 1778 
 1779   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
 1780      The token buffer is in reverse Polish order, so we get
 1781      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
 1782       C CAT OR C CAT OR C CAT OR".  */
 1783   addtok (dfa, dfa->utf8_anychar_classes[A]);
 1784   addtok (dfa, dfa->utf8_anychar_classes[B]);
 1785   addtok (dfa, D_token);
 1786   addtok (dfa, dfa->utf8_anychar_classes[E]);
 1787   addtok (dfa, CAT);
 1788   addtok (dfa, OR);
 1789   addtok (dfa, G_token);
 1790   addtok (dfa, dfa->utf8_anychar_classes[H]);
 1791   addtok (dfa, CAT);
 1792   addtok (dfa, OR);
 1793   addtok (dfa, dfa->utf8_anychar_classes[F]);
 1794   addtok (dfa, I_token);
 1795   addtok (dfa, dfa->utf8_anychar_classes[J]);
 1796   addtok (dfa, CAT);
 1797   addtok (dfa, OR);
 1798   addtok (dfa, L_token);
 1799   addtok (dfa, dfa->utf8_anychar_classes[M]);
 1800   addtok (dfa, CAT);
 1801   addtok (dfa, OR);
 1802   addtok (dfa, dfa->utf8_anychar_classes[K]);
 1803   for (int i = 0; i < 3; i++)
 1804     {
 1805       addtok (dfa, dfa->utf8_anychar_classes[C]);
 1806       addtok (dfa, CAT);
 1807       addtok (dfa, OR);
 1808     }
 1809 }
 1810 
 1811 /* The grammar understood by the parser is as follows.
 1812 
 1813    regexp:
 1814      regexp OR branch
 1815      branch
 1816 
 1817    branch:
 1818      branch closure
 1819      closure
 1820 
 1821    closure:
 1822      closure QMARK
 1823      closure STAR
 1824      closure PLUS
 1825      closure REPMN
 1826      atom
 1827 
 1828    atom:
 1829      <normal character>
 1830      <multibyte character>
 1831      ANYCHAR
 1832      MBCSET
 1833      CSET
 1834      BACKREF
 1835      BEGLINE
 1836      ENDLINE
 1837      BEGWORD
 1838      ENDWORD
 1839      LIMWORD
 1840      NOTLIMWORD
 1841      LPAREN regexp RPAREN
 1842      <empty>
 1843 
 1844    The parser builds a parse tree in postfix form in an array of tokens.  */
 1845 
 1846 static void
 1847 atom (struct dfa *dfa)
 1848 {
 1849   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
 1850       || dfa->parse.tok >= CSET
 1851       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
 1852       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
 1853       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
 1854       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
 1855       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
 1856     {
 1857       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
 1858         {
 1859           /* For UTF-8 expand the period to a series of CSETs that define a
 1860              valid UTF-8 character.  This avoids using the slow multibyte
 1861              path.  I'm pretty sure it would be both profitable and correct to
 1862              do it for any encoding; however, the optimization must be done
 1863              manually as it is done above in add_utf8_anychar.  So, let's
 1864              start with UTF-8: it is the most used, and the structure of the
 1865              encoding makes the correctness more obvious.  */
 1866           add_utf8_anychar (dfa);
 1867         }
 1868       else
 1869         addtok (dfa, dfa->parse.tok);
 1870       dfa->parse.tok = lex (dfa);
 1871     }
 1872   else if (dfa->parse.tok == WCHAR)
 1873     {
 1874       if (dfa->lex.wctok == WEOF)
 1875         addtok (dfa, BACKREF);
 1876       else
 1877         {
 1878           addtok_wc (dfa, dfa->lex.wctok);
 1879 
 1880           if (dfa->syntax.case_fold)
 1881             {
 1882               wchar_t folded[CASE_FOLDED_BUFSIZE];
 1883               int n = case_folded_counterparts (dfa->lex.wctok, folded);
 1884               for (int i = 0; i < n; i++)
 1885                 {
 1886                   addtok_wc (dfa, folded[i]);
 1887                   addtok (dfa, OR);
 1888                 }
 1889             }
 1890         }
 1891 
 1892       dfa->parse.tok = lex (dfa);
 1893     }
 1894   else if (dfa->parse.tok == LPAREN)
 1895     {
 1896       dfa->parse.tok = lex (dfa);
 1897       regexp (dfa);
 1898       if (dfa->parse.tok != RPAREN)
 1899         dfaerror (_("unbalanced ("));
 1900       dfa->parse.tok = lex (dfa);
 1901     }
 1902   else
 1903     addtok (dfa, EMPTY);
 1904 }
 1905 
 1906 /* Return the number of tokens in the given subexpression.  */
 1907 static idx_t _GL_ATTRIBUTE_PURE
 1908 nsubtoks (struct dfa const *dfa, idx_t tindex)
 1909 {
 1910   switch (dfa->tokens[tindex - 1])
 1911     {
 1912     default:
 1913       return 1;
 1914     case QMARK:
 1915     case STAR:
 1916     case PLUS:
 1917       return 1 + nsubtoks (dfa, tindex - 1);
 1918     case CAT:
 1919     case OR:
 1920       {
 1921         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
 1922         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
 1923       }
 1924     }
 1925 }
 1926 
 1927 /* Copy the given subexpression to the top of the tree.  */
 1928 static void
 1929 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
 1930 {
 1931   if (dfa->localeinfo.multibyte)
 1932     for (idx_t i = 0; i < ntokens; i++)
 1933       addtok_mb (dfa, dfa->tokens[tindex + i],
 1934                  dfa->multibyte_prop[tindex + i]);
 1935   else
 1936     for (idx_t i = 0; i < ntokens; i++)
 1937       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
 1938 }
 1939 
 1940 static void
 1941 closure (struct dfa *dfa)
 1942 {
 1943   atom (dfa);
 1944   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
 1945          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
 1946     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
 1947       {
 1948         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
 1949         idx_t tindex = dfa->tindex - ntokens;
 1950         if (dfa->lex.maxrep < 0)
 1951           addtok (dfa, PLUS);
 1952         if (dfa->lex.minrep == 0)
 1953           addtok (dfa, QMARK);
 1954         int i;
 1955         for (i = 1; i < dfa->lex.minrep; i++)
 1956           {
 1957             copytoks (dfa, tindex, ntokens);
 1958             addtok (dfa, CAT);
 1959           }
 1960         for (; i < dfa->lex.maxrep; i++)
 1961           {
 1962             copytoks (dfa, tindex, ntokens);
 1963             addtok (dfa, QMARK);
 1964             addtok (dfa, CAT);
 1965           }
 1966         dfa->parse.tok = lex (dfa);
 1967       }
 1968     else if (dfa->parse.tok == REPMN)
 1969       {
 1970         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
 1971         dfa->parse.tok = lex (dfa);
 1972         closure (dfa);
 1973       }
 1974     else
 1975       {
 1976         addtok (dfa, dfa->parse.tok);
 1977         dfa->parse.tok = lex (dfa);
 1978       }
 1979 }
 1980 
 1981 static void
 1982 branch (struct dfa* dfa)
 1983 {
 1984   closure (dfa);
 1985   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
 1986          && dfa->parse.tok >= 0)
 1987     {
 1988       closure (dfa);
 1989       addtok (dfa, CAT);
 1990     }
 1991 }
 1992 
 1993 static void
 1994 regexp (struct dfa *dfa)
 1995 {
 1996   branch (dfa);
 1997   while (dfa->parse.tok == OR)
 1998     {
 1999       dfa->parse.tok = lex (dfa);
 2000       branch (dfa);
 2001       addtok (dfa, OR);
 2002     }
 2003 }
 2004 
 2005 /* Parse a string S of length LEN into D.  S can include NUL characters.
 2006    This is the main entry point for the parser.  */
 2007 void
 2008 dfaparse (char const *s, idx_t len, struct dfa *d)
 2009 {
 2010   d->lex.ptr = s;
 2011   d->lex.left = len;
 2012   d->lex.lasttok = END;
 2013   d->lex.laststart = true;
 2014 
 2015   if (!d->syntax.syntax_bits_set)
 2016     dfaerror (_("no syntax specified"));
 2017 
 2018   if (!d->nregexps)
 2019     addtok (d, BEG);
 2020 
 2021   d->parse.tok = lex (d);
 2022   d->parse.depth = d->depth;
 2023 
 2024   regexp (d);
 2025 
 2026   if (d->parse.tok != END)
 2027     dfaerror (_("unbalanced )"));
 2028 
 2029   addtok (d, END - d->nregexps);
 2030   addtok (d, CAT);
 2031 
 2032   if (d->nregexps)
 2033     addtok (d, OR);
 2034 
 2035   ++d->nregexps;
 2036 }
 2037 
 2038 /* Some primitives for operating on sets of positions.  */
 2039 
 2040 /* Copy one set to another.  */
 2041 static void
 2042 copy (position_set const *src, position_set *dst)
 2043 {
 2044   if (dst->alloc < src->nelem)
 2045     {
 2046       free (dst->elems);
 2047       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
 2048                             sizeof *dst->elems);
 2049     }
 2050   dst->nelem = src->nelem;
 2051   if (src->nelem != 0)
 2052     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
 2053 }
 2054 
 2055 static void
 2056 alloc_position_set (position_set *s, idx_t size)
 2057 {
 2058   s->elems = xnmalloc (size, sizeof *s->elems);
 2059   s->alloc = size;
 2060   s->nelem = 0;
 2061 }
 2062 
 2063 /* Insert position P in set S.  S is maintained in sorted order on
 2064    decreasing index.  If there is already an entry in S with P.index
 2065    then merge (logically-OR) P's constraints into the one in S.
 2066    S->elems must point to an array large enough to hold the resulting set.  */
 2067 static void
 2068 insert (position p, position_set *s)
 2069 {
 2070   idx_t count = s->nelem;
 2071   idx_t lo = 0, hi = count;
 2072   while (lo < hi)
 2073     {
 2074       idx_t mid = (lo + hi) >> 1;
 2075       if (s->elems[mid].index < p.index)
 2076         lo = mid + 1;
 2077       else if (s->elems[mid].index == p.index)
 2078         {
 2079           s->elems[mid].constraint |= p.constraint;
 2080           return;
 2081         }
 2082       else
 2083         hi = mid;
 2084     }
 2085 
 2086   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
 2087   for (idx_t i = count; i > lo; i--)
 2088     s->elems[i] = s->elems[i - 1];
 2089   s->elems[lo] = p;
 2090   ++s->nelem;
 2091 }
 2092 
 2093 static void
 2094 append (position p, position_set *s)
 2095 {
 2096   idx_t count = s->nelem;
 2097   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
 2098   s->elems[s->nelem++] = p;
 2099 }
 2100 
 2101 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
 2102    result is as if the positions of S1, and of S2 with the additional
 2103    constraint C2, were inserted into an initially empty set.  */
 2104 static void
 2105 merge_constrained (position_set const *s1, position_set const *s2,
 2106                    unsigned int c2, position_set *m)
 2107 {
 2108   idx_t i = 0, j = 0;
 2109 
 2110   if (m->alloc - s1->nelem < s2->nelem)
 2111     {
 2112       free (m->elems);
 2113       m->alloc = s1->nelem;
 2114       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
 2115     }
 2116   m->nelem = 0;
 2117   while (i < s1->nelem || j < s2->nelem)
 2118     if (! (j < s2->nelem)
 2119         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
 2120       {
 2121         unsigned int c = ((i < s1->nelem && j < s2->nelem
 2122                            && s1->elems[i].index == s2->elems[j].index)
 2123                           ? s2->elems[j++].constraint & c2
 2124                           : 0);
 2125         m->elems[m->nelem].index = s1->elems[i].index;
 2126         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
 2127       }
 2128     else
 2129       {
 2130         if (s2->elems[j].constraint & c2)
 2131           {
 2132             m->elems[m->nelem].index = s2->elems[j].index;
 2133             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
 2134           }
 2135         j++;
 2136       }
 2137 }
 2138 
 2139 /* Merge two sets of positions into a third.  The result is exactly as if
 2140    the positions of both sets were inserted into an initially empty set.  */
 2141 static void
 2142 merge (position_set const *s1, position_set const *s2, position_set *m)
 2143 {
 2144   merge_constrained (s1, s2, -1, m);
 2145 }
 2146 
 2147 static void
 2148 merge2 (position_set *dst, position_set const *src, position_set *m)
 2149 {
 2150   if (src->nelem < 4)
 2151     {
 2152       for (idx_t i = 0; i < src->nelem; i++)
 2153         insert (src->elems[i], dst);
 2154     }
 2155    else
 2156     {
 2157       merge (src, dst, m);
 2158       copy (m, dst);
 2159     }
 2160 }
 2161 
 2162 /* Delete a position from a set.  Return the nonzero constraint of the
 2163    deleted position, or zero if there was no such position.  */
 2164 static unsigned int
 2165 delete (idx_t del, position_set *s)
 2166 {
 2167   idx_t count = s->nelem;
 2168   idx_t lo = 0, hi = count;
 2169   while (lo < hi)
 2170     {
 2171       idx_t mid = (lo + hi) >> 1;
 2172       if (s->elems[mid].index < del)
 2173         lo = mid + 1;
 2174       else if (s->elems[mid].index == del)
 2175         {
 2176           unsigned int c = s->elems[mid].constraint;
 2177           idx_t i;
 2178           for (i = mid; i + 1 < count; i++)
 2179             s->elems[i] = s->elems[i + 1];
 2180           s->nelem = i;
 2181           return c;
 2182         }
 2183       else
 2184         hi = mid;
 2185     }
 2186   return 0;
 2187 }
 2188 
 2189 /* Replace a position with the followed set.  */
 2190 static void
 2191 replace (position_set *dst, idx_t del, position_set *add,
 2192          unsigned int constraint, position_set *tmp)
 2193 {
 2194   unsigned int c = delete (del, dst) & constraint;
 2195 
 2196   if (c)
 2197     {
 2198       copy (dst, tmp);
 2199       merge_constrained (tmp, add, c, dst);
 2200     }
 2201 }
 2202 
 2203 /* Find the index of the state corresponding to the given position set with
 2204    the given preceding context, or create a new state if there is no such
 2205    state.  Context tells whether we got here on a newline or letter.  */
 2206 static state_num
 2207 state_index (struct dfa *d, position_set const *s, int context)
 2208 {
 2209   size_t hash = 0;
 2210   int constraint = 0;
 2211   state_num i;
 2212   token first_end = 0;
 2213 
 2214   for (i = 0; i < s->nelem; ++i)
 2215     {
 2216       size_t ind = s->elems[i].index;
 2217       hash ^= ind + s->elems[i].constraint;
 2218     }
 2219 
 2220   /* Try to find a state that exactly matches the proposed one.  */
 2221   for (i = 0; i < d->sindex; ++i)
 2222     {
 2223       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
 2224           || context != d->states[i].context)
 2225         continue;
 2226       state_num j;
 2227       for (j = 0; j < s->nelem; ++j)
 2228         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
 2229             || s->elems[j].index != d->states[i].elems.elems[j].index)
 2230           break;
 2231       if (j == s->nelem)
 2232         return i;
 2233     }
 2234 
 2235 #ifdef DEBUG
 2236   fprintf (stderr, "new state %td\n nextpos:", i);
 2237   for (state_num j = 0; j < s->nelem; j++)
 2238     {
 2239       fprintf (stderr, " %td:", s->elems[j].index);
 2240       prtok (d->tokens[s->elems[j].index]);
 2241     }
 2242   fprintf (stderr, "\n context:");
 2243   if (context ^ CTX_ANY)
 2244     {
 2245       if (context & CTX_NONE)
 2246         fprintf (stderr, " CTX_NONE");
 2247       if (context & CTX_LETTER)
 2248         fprintf (stderr, " CTX_LETTER");
 2249       if (context & CTX_NEWLINE)
 2250         fprintf (stderr, " CTX_NEWLINE");
 2251     }
 2252   else
 2253     fprintf (stderr, " CTX_ANY");
 2254   fprintf (stderr, "\n");
 2255 #endif
 2256 
 2257   for (state_num j = 0; j < s->nelem; j++)
 2258     {
 2259       int c = d->constraints[s->elems[j].index];
 2260 
 2261       if (c != 0)
 2262         {
 2263           if (succeeds_in_context (c, context, CTX_ANY))
 2264             constraint |= c;
 2265           if (!first_end)
 2266             first_end = d->tokens[s->elems[j].index];
 2267         }
 2268       else if (d->tokens[s->elems[j].index] == BACKREF)
 2269         constraint = NO_CONSTRAINT;
 2270     }
 2271 
 2272 
 2273   /* Create a new state.  */
 2274   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
 2275                              sizeof *d->states);
 2276   d->states[i].hash = hash;
 2277   alloc_position_set (&d->states[i].elems, s->nelem);
 2278   copy (s, &d->states[i].elems);
 2279   d->states[i].context = context;
 2280   d->states[i].constraint = constraint;
 2281   d->states[i].first_end = first_end;
 2282   d->states[i].mbps.nelem = 0;
 2283   d->states[i].mbps.elems = NULL;
 2284   d->states[i].mb_trindex = -1;
 2285 
 2286   ++d->sindex;
 2287 
 2288   return i;
 2289 }
 2290 
 2291 /* Find the epsilon closure of a set of positions.  If any position of the set
 2292    contains a symbol that matches the empty string in some context, replace
 2293    that position with the elements of its follow labeled with an appropriate
 2294    constraint.  Repeat exhaustively until no funny positions are left.
 2295    S->elems must be large enough to hold the result.  */
 2296 static void
 2297 epsclosure (struct dfa const *d)
 2298 {
 2299   position_set tmp;
 2300   alloc_position_set (&tmp, d->nleaves);
 2301   for (idx_t i = 0; i < d->tindex; i++)
 2302     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
 2303         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
 2304         && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
 2305       {
 2306         unsigned int constraint;
 2307         switch (d->tokens[i])
 2308           {
 2309           case BEGLINE:
 2310             constraint = BEGLINE_CONSTRAINT;
 2311             break;
 2312           case ENDLINE:
 2313             constraint = ENDLINE_CONSTRAINT;
 2314             break;
 2315           case BEGWORD:
 2316             constraint = BEGWORD_CONSTRAINT;
 2317             break;
 2318           case ENDWORD:
 2319             constraint = ENDWORD_CONSTRAINT;
 2320             break;
 2321           case LIMWORD:
 2322             constraint = LIMWORD_CONSTRAINT;
 2323             break;
 2324           case NOTLIMWORD:
 2325             constraint = NOTLIMWORD_CONSTRAINT;
 2326             break;
 2327           default:
 2328             constraint = NO_CONSTRAINT;
 2329             break;
 2330           }
 2331 
 2332         delete (i, &d->follows[i]);
 2333 
 2334         for (idx_t j = 0; j < d->tindex; j++)
 2335           if (i != j && d->follows[j].nelem > 0)
 2336             replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
 2337       }
 2338   free (tmp.elems);
 2339 }
 2340 
 2341 /* Returns the set of contexts for which there is at least one
 2342    character included in C.  */
 2343 
 2344 static int
 2345 charclass_context (struct dfa const *dfa, charclass const *c)
 2346 {
 2347   int context = 0;
 2348 
 2349   for (int j = 0; j < CHARCLASS_WORDS; j++)
 2350     {
 2351       if (c->w[j] & dfa->syntax.newline.w[j])
 2352         context |= CTX_NEWLINE;
 2353       if (c->w[j] & dfa->syntax.letters.w[j])
 2354         context |= CTX_LETTER;
 2355       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
 2356         context |= CTX_NONE;
 2357     }
 2358 
 2359   return context;
 2360 }
 2361 
 2362 /* Returns the contexts on which the position set S depends.  Each context
 2363    in the set of returned contexts (let's call it SC) may have a different
 2364    follow set than other contexts in SC, and also different from the
 2365    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
 2366    in the complement set will have the same follow set.  */
 2367 
 2368 static int _GL_ATTRIBUTE_PURE
 2369 state_separate_contexts (struct dfa *d, position_set const *s)
 2370 {
 2371   int separate_contexts = 0;
 2372 
 2373   for (idx_t j = 0; j < s->nelem; j++)
 2374     separate_contexts |= d->separates[s->elems[j].index];
 2375 
 2376   return separate_contexts;
 2377 }
 2378 
 2379 enum
 2380 {
 2381   /* Single token is repeated.  It is distinguished from non-repeated.  */
 2382   OPT_REPEAT = (1 << 0),
 2383 
 2384   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
 2385      node is not merged.  */
 2386   OPT_LPAREN = (1 << 1),
 2387 
 2388   /* Multiple branches are joined.  The node is not merged.  */
 2389   OPT_RPAREN = (1 << 2),
 2390 
 2391   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
 2392      flag is turned on. */
 2393   OPT_WALKED = (1 << 3),
 2394 
 2395   /* The node is queued.  The node is not queued again.  */
 2396   OPT_QUEUED = (1 << 4)
 2397 };
 2398 
 2399 static void
 2400 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
 2401                  position_set *merged)
 2402 {
 2403   position_set *follows = d->follows;
 2404   idx_t nelem = 0;
 2405 
 2406   d->constraints[tindex] = 0;
 2407 
 2408   for (idx_t i = 0; i < follows[tindex].nelem; i++)
 2409     {
 2410       idx_t sindex = follows[tindex].elems[i].index;
 2411 
 2412       /* Skip the node as pruned in future.  */
 2413       unsigned int iconstraint = follows[tindex].elems[i].constraint;
 2414       if (iconstraint == 0)
 2415         continue;
 2416 
 2417       if (d->tokens[follows[tindex].elems[i].index] <= END)
 2418         {
 2419           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
 2420           continue;
 2421         }
 2422 
 2423       if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
 2424         {
 2425           idx_t j;
 2426 
 2427           for (j = 0; j < nelem; j++)
 2428             {
 2429               idx_t dindex = follows[tindex].elems[j].index;
 2430 
 2431               if (follows[tindex].elems[j].constraint != iconstraint)
 2432                 continue;
 2433 
 2434               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
 2435                 continue;
 2436 
 2437               if (d->tokens[sindex] != d->tokens[dindex])
 2438                 continue;
 2439 
 2440               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
 2441                 continue;
 2442 
 2443               if (flags[sindex] & OPT_REPEAT)
 2444                 delete (sindex, &follows[sindex]);
 2445 
 2446               merge2 (&follows[dindex], &follows[sindex], merged);
 2447 
 2448               break;
 2449             }
 2450 
 2451           if (j < nelem)
 2452             continue;
 2453         }
 2454 
 2455       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
 2456       flags[sindex] |= OPT_QUEUED;
 2457     }
 2458 
 2459   follows[tindex].nelem = nelem;
 2460 }
 2461 
 2462 static int
 2463 compare (const void *a, const void *b)
 2464 {
 2465   position const *p = a, *q = b;
 2466   return p->index < q->index ? -1 : p->index > q->index;
 2467 }
 2468 
 2469 static void
 2470 reorder_tokens (struct dfa *d)
 2471 {
 2472   idx_t nleaves;
 2473   ptrdiff_t *map;
 2474   token *tokens;
 2475   position_set *follows;
 2476   int *constraints;
 2477   char *multibyte_prop;
 2478 
 2479   nleaves = 0;
 2480 
 2481   map = xnmalloc (d->tindex, sizeof *map);
 2482 
 2483   map[0] = nleaves++;
 2484 
 2485   for (idx_t i = 1; i < d->tindex; i++)
 2486     map[i] = -1;
 2487 
 2488   tokens = xnmalloc (d->nleaves, sizeof *tokens);
 2489   follows = xnmalloc (d->nleaves, sizeof *follows);
 2490   constraints = xnmalloc (d->nleaves, sizeof *constraints);
 2491 
 2492   if (d->localeinfo.multibyte)
 2493     multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
 2494   else
 2495     multibyte_prop = NULL;
 2496 
 2497   for (idx_t i = 0; i < d->tindex; i++)
 2498     {
 2499       if (map[i] == -1)
 2500         {
 2501           free (d->follows[i].elems);
 2502           d->follows[i].elems = NULL;
 2503           d->follows[i].nelem = 0;
 2504           continue;
 2505         }
 2506 
 2507       tokens[map[i]] = d->tokens[i];
 2508       follows[map[i]] = d->follows[i];
 2509       constraints[map[i]] = d->constraints[i];
 2510 
 2511       if (multibyte_prop != NULL)
 2512         multibyte_prop[map[i]] = d->multibyte_prop[i];
 2513 
 2514       for (idx_t j = 0; j < d->follows[i].nelem; j++)
 2515         {
 2516           if (map[d->follows[i].elems[j].index] == -1)
 2517             map[d->follows[i].elems[j].index] = nleaves++;
 2518 
 2519           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
 2520         }
 2521 
 2522       qsort (d->follows[i].elems, d->follows[i].nelem,
 2523              sizeof *d->follows[i].elems, compare);
 2524     }
 2525 
 2526   for (idx_t i = 0; i < nleaves; i++)
 2527     {
 2528       d->tokens[i] = tokens[i];
 2529       d->follows[i] = follows[i];
 2530       d->constraints[i] = constraints[i];
 2531 
 2532       if (multibyte_prop != NULL)
 2533         d->multibyte_prop[i] = multibyte_prop[i];
 2534     }
 2535 
 2536   d->tindex = d->nleaves = nleaves;
 2537 
 2538   free (tokens);
 2539   free (follows);
 2540   free (constraints);
 2541   free (multibyte_prop);
 2542   free (map);
 2543 }
 2544 
 2545 static void
 2546 dfaoptimize (struct dfa *d)
 2547 {
 2548   char *flags = xzalloc (d->tindex);
 2549 
 2550   for (idx_t i = 0; i < d->tindex; i++)
 2551     {
 2552       for (idx_t j = 0; j < d->follows[i].nelem; j++)
 2553         {
 2554           if (d->follows[i].elems[j].index == i)
 2555             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
 2556           else if (d->follows[i].elems[j].index < i)
 2557             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
 2558           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
 2559             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
 2560           else
 2561             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
 2562         }
 2563     }
 2564 
 2565   flags[0] |= OPT_QUEUED;
 2566 
 2567   position_set merged0;
 2568   position_set *merged = &merged0;
 2569   alloc_position_set (merged, d->nleaves);
 2570 
 2571   d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
 2572 
 2573   for (idx_t i = 0; i < d->tindex; i++)
 2574     if (flags[i] & OPT_QUEUED)
 2575       merge_nfa_state (d, i, flags, merged);
 2576 
 2577   reorder_tokens (d);
 2578 
 2579   free (merged->elems);
 2580   free (flags);
 2581 }
 2582 
 2583 /* Perform bottom-up analysis on the parse tree, computing various functions.
 2584    Note that at this point, we're pretending constructs like \< are real
 2585    characters rather than constraints on what can follow them.
 2586 
 2587    Nullable:  A node is nullable if it is at the root of a regexp that can
 2588    match the empty string.
 2589    *  EMPTY leaves are nullable.
 2590    * No other leaf is nullable.
 2591    * A QMARK or STAR node is nullable.
 2592    * A PLUS node is nullable if its argument is nullable.
 2593    * A CAT node is nullable if both its arguments are nullable.
 2594    * An OR node is nullable if either argument is nullable.
 2595 
 2596    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
 2597    that could correspond to the first character of a string matching the
 2598    regexp rooted at the given node.
 2599    * EMPTY leaves have empty firstpos.
 2600    * The firstpos of a nonempty leaf is that leaf itself.
 2601    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
 2602      argument.
 2603    * The firstpos of a CAT node is the firstpos of the left argument, union
 2604      the firstpos of the right if the left argument is nullable.
 2605    * The firstpos of an OR node is the union of firstpos of each argument.
 2606 
 2607    Lastpos:  The lastpos of a node is the set of positions that could
 2608    correspond to the last character of a string matching the regexp at
 2609    the given node.
 2610    * EMPTY leaves have empty lastpos.
 2611    * The lastpos of a nonempty leaf is that leaf itself.
 2612    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
 2613      argument.
 2614    * The lastpos of a CAT node is the lastpos of its right argument, union
 2615      the lastpos of the left if the right argument is nullable.
 2616    * The lastpos of an OR node is the union of the lastpos of each argument.
 2617 
 2618    Follow:  The follow of a position is the set of positions that could
 2619    correspond to the character following a character matching the node in
 2620    a string matching the regexp.  At this point we consider special symbols
 2621    that match the empty string in some context to be just normal characters.
 2622    Later, if we find that a special symbol is in a follow set, we will
 2623    replace it with the elements of its follow, labeled with an appropriate
 2624    constraint.
 2625    * Every node in the firstpos of the argument of a STAR or PLUS node is in
 2626      the follow of every node in the lastpos.
 2627    * Every node in the firstpos of the second argument of a CAT node is in
 2628      the follow of every node in the lastpos of the first argument.
 2629 
 2630    Because of the postfix representation of the parse tree, the depth-first
 2631    analysis is conveniently done by a linear scan with the aid of a stack.
 2632    Sets are stored as arrays of the elements, obeying a stack-like allocation
 2633    scheme; the number of elements in each set deeper in the stack can be
 2634    used to determine the address of a particular set's array.  */
 2635 static void
 2636 dfaanalyze (struct dfa *d, bool searchflag)
 2637 {
 2638   /* Array allocated to hold position sets.  */
 2639   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
 2640   /* Firstpos and lastpos elements.  */
 2641   position *firstpos = posalloc;
 2642   position *lastpos = firstpos + d->nleaves;
 2643   position pos;
 2644   position_set tmp;
 2645 
 2646   /* Stack for element counts and nullable flags.  */
 2647   struct
 2648   {
 2649     /* Whether the entry is nullable.  */
 2650     bool nullable;
 2651 
 2652     /* Counts of firstpos and lastpos sets.  */
 2653     idx_t nfirstpos;
 2654     idx_t nlastpos;
 2655   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
 2656 
 2657   position_set merged;          /* Result of merging sets.  */
 2658 
 2659   addtok (d, CAT);
 2660 
 2661 #ifdef DEBUG
 2662   fprintf (stderr, "dfaanalyze:\n");
 2663   for (idx_t i = 0; i < d->tindex; i++)
 2664     {
 2665       fprintf (stderr, " %td:", i);
 2666       prtok (d->tokens[i]);
 2667     }
 2668   putc ('\n', stderr);
 2669 #endif
 2670 
 2671   d->searchflag = searchflag;
 2672   alloc_position_set (&merged, d->nleaves);
 2673   d->follows = xcalloc (d->tindex, sizeof *d->follows);
 2674 
 2675   for (idx_t i = 0; i < d->tindex; i++)
 2676     {
 2677       switch (d->tokens[i])
 2678         {
 2679         case EMPTY:
 2680           /* The empty set is nullable.  */
 2681           stk->nullable = true;
 2682 
 2683           /* The firstpos and lastpos of the empty leaf are both empty.  */
 2684           stk->nfirstpos = stk->nlastpos = 0;
 2685           stk++;
 2686           break;
 2687 
 2688         case STAR:
 2689         case PLUS:
 2690           /* Every element in the firstpos of the argument is in the follow
 2691              of every element in the lastpos.  */
 2692           {
 2693             tmp.elems = firstpos - stk[-1].nfirstpos;
 2694             tmp.nelem = stk[-1].nfirstpos;
 2695             position *p = lastpos - stk[-1].nlastpos;
 2696             for (idx_t j = 0; j < stk[-1].nlastpos; j++)
 2697               {
 2698                 merge (&tmp, &d->follows[p[j].index], &merged);
 2699                 copy (&merged, &d->follows[p[j].index]);
 2700               }
 2701           }
 2702           FALLTHROUGH;
 2703         case QMARK:
 2704           /* A QMARK or STAR node is automatically nullable.  */
 2705           if (d->tokens[i] != PLUS)
 2706             stk[-1].nullable = true;
 2707           break;
 2708 
 2709         case CAT:
 2710           /* Every element in the firstpos of the second argument is in the
 2711              follow of every element in the lastpos of the first argument.  */
 2712           {
 2713             tmp.nelem = stk[-1].nfirstpos;
 2714             tmp.elems = firstpos - stk[-1].nfirstpos;
 2715             position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
 2716             for (idx_t j = 0; j < stk[-2].nlastpos; j++)
 2717               {
 2718                 merge (&tmp, &d->follows[p[j].index], &merged);
 2719                 copy (&merged, &d->follows[p[j].index]);
 2720               }
 2721           }
 2722 
 2723           /* The firstpos of a CAT node is the firstpos of the first argument,
 2724              union that of the second argument if the first is nullable.  */
 2725           if (stk[-2].nullable)
 2726             stk[-2].nfirstpos += stk[-1].nfirstpos;
 2727           else
 2728             firstpos -= stk[-1].nfirstpos;
 2729 
 2730           /* The lastpos of a CAT node is the lastpos of the second argument,
 2731              union that of the first argument if the second is nullable.  */
 2732           if (stk[-1].nullable)
 2733             stk[-2].nlastpos += stk[-1].nlastpos;
 2734           else
 2735             {
 2736               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
 2737               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
 2738                 p[j] = p[j + stk[-2].nlastpos];
 2739               lastpos -= stk[-2].nlastpos;
 2740               stk[-2].nlastpos = stk[-1].nlastpos;
 2741             }
 2742 
 2743           /* A CAT node is nullable if both arguments are nullable.  */
 2744           stk[-2].nullable &= stk[-1].nullable;
 2745           stk--;
 2746           break;
 2747 
 2748         case OR:
 2749           /* The firstpos is the union of the firstpos of each argument.  */
 2750           stk[-2].nfirstpos += stk[-1].nfirstpos;
 2751 
 2752           /* The lastpos is the union of the lastpos of each argument.  */
 2753           stk[-2].nlastpos += stk[-1].nlastpos;
 2754 
 2755           /* An OR node is nullable if either argument is nullable.  */
 2756           stk[-2].nullable |= stk[-1].nullable;
 2757           stk--;
 2758           break;
 2759 
 2760         default:
 2761           /* Anything else is a nonempty position.  (Note that special
 2762              constructs like \< are treated as nonempty strings here;
 2763              an "epsilon closure" effectively makes them nullable later.
 2764              Backreferences have to get a real position so we can detect
 2765              transitions on them later.  But they are nullable.  */
 2766           stk->nullable = d->tokens[i] == BACKREF;
 2767 
 2768           /* This position is in its own firstpos and lastpos.  */
 2769           stk->nfirstpos = stk->nlastpos = 1;
 2770           stk++;
 2771 
 2772           firstpos->index = lastpos->index = i;
 2773           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
 2774           firstpos++, lastpos++;
 2775 
 2776           break;
 2777         }
 2778 #ifdef DEBUG
 2779       /* ... balance the above nonsyntactic #ifdef goo...  */
 2780       fprintf (stderr, "node %td:", i);
 2781       prtok (d->tokens[i]);
 2782       putc ('\n', stderr);
 2783       fprintf (stderr,
 2784                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
 2785       fprintf (stderr, " firstpos:");
 2786       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
 2787         {
 2788           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
 2789           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
 2790         }
 2791       fprintf (stderr, "\n lastpos:");
 2792       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
 2793         {
 2794           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
 2795           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
 2796         }
 2797       putc ('\n', stderr);
 2798 #endif
 2799     }
 2800 
 2801   /* For each follow set that is the follow set of a real position, replace
 2802      it with its epsilon closure.  */
 2803   epsclosure (d);
 2804 
 2805   dfaoptimize (d);
 2806 
 2807 #ifdef DEBUG
 2808   for (idx_t i = 0; i < d->tindex; i++)
 2809     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
 2810         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
 2811         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
 2812       {
 2813         fprintf (stderr, "follows(%td:", i);
 2814         prtok (d->tokens[i]);
 2815         fprintf (stderr, "):");
 2816         for (idx_t j = 0; j < d->follows[i].nelem; j++)
 2817           {
 2818             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
 2819             prtok (d->tokens[d->follows[i].elems[j].index]);
 2820           }
 2821         putc ('\n', stderr);
 2822       }
 2823 #endif
 2824 
 2825   pos.index = 0;
 2826   pos.constraint = NO_CONSTRAINT;
 2827 
 2828   alloc_position_set (&tmp, 1);
 2829 
 2830   append (pos, &tmp);
 2831 
 2832   d->separates = xnmalloc (d->tindex, sizeof *d->separates);
 2833 
 2834   for (idx_t i = 0; i < d->tindex; i++)
 2835     {
 2836       d->separates[i] = 0;
 2837 
 2838       if (prev_newline_dependent (d->constraints[i]))
 2839         d->separates[i] |= CTX_NEWLINE;
 2840       if (prev_letter_dependent (d->constraints[i]))
 2841         d->separates[i] |= CTX_LETTER;
 2842 
 2843       for (idx_t j = 0; j < d->follows[i].nelem; j++)
 2844         {
 2845           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
 2846             d->separates[i] |= CTX_NEWLINE;
 2847           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
 2848             d->separates[i] |= CTX_LETTER;
 2849         }
 2850     }
 2851 
 2852   /* Context wanted by some position.  */
 2853   int separate_contexts = state_separate_contexts (d, &tmp);
 2854 
 2855   /* Build the initial state.  */
 2856   if (separate_contexts & CTX_NEWLINE)
 2857     state_index (d, &tmp, CTX_NEWLINE);
 2858   d->initstate_notbol = d->min_trcount
 2859     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
 2860   if (separate_contexts & CTX_LETTER)
 2861     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
 2862   d->min_trcount++;
 2863   d->trcount = 0;
 2864 
 2865   free (posalloc);
 2866   free (stkalloc);
 2867   free (merged.elems);
 2868   free (tmp.elems);
 2869 }
 2870 
 2871 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
 2872 static void
 2873 realloc_trans_if_necessary (struct dfa *d)
 2874 {
 2875   state_num oldalloc = d->tralloc;
 2876   if (oldalloc < d->sindex)
 2877     {
 2878       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
 2879       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
 2880       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
 2881                            -1, sizeof *realtrans);
 2882       realtrans[0] = realtrans[1] = NULL;
 2883       d->trans = realtrans + 2;
 2884       idx_t newalloc = d->tralloc = newalloc1 - 2;
 2885       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
 2886       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
 2887       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
 2888       if (d->localeinfo.multibyte)
 2889         {
 2890           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
 2891           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
 2892           if (oldalloc == 0)
 2893             realtrans[0] = realtrans[1] = NULL;
 2894           d->mb_trans = realtrans + 2;
 2895         }
 2896       for (; oldalloc < newalloc; oldalloc++)
 2897         {
 2898           d->trans[oldalloc] = NULL;
 2899           d->fails[oldalloc] = NULL;
 2900           if (d->localeinfo.multibyte)
 2901             d->mb_trans[oldalloc] = NULL;
 2902         }
 2903     }
 2904 }
 2905 
 2906 /*
 2907    Calculate the transition table for a new state derived from state s
 2908    for a compiled dfa d after input character uc, and return the new
 2909    state number.
 2910 
 2911    Do not worry about all possible input characters; calculate just the group
 2912    of positions that match uc.  Label it with the set of characters that
 2913    every position in the group matches (taking into account, if necessary,
 2914    preceding context information of s).  Then find the union
 2915    of these positions' follows, i.e., the set of positions of the
 2916    new state.  For each character in the group's label, set the transition
 2917    on this character to be to a state corresponding to the set's positions,
 2918    and its associated backward context information, if necessary.
 2919 
 2920    When building a searching matcher, include the positions of state
 2921    0 in every state.
 2922 
 2923    The group is constructed by building an equivalence-class
 2924    partition of the positions of s.
 2925 
 2926    For each position, find the set of characters C that it matches.  Eliminate
 2927    any characters from C that fail on grounds of backward context.
 2928 
 2929    Check whether the group's label L has nonempty
 2930    intersection with C.  If L - C is nonempty, create a new group labeled
 2931    L - C and having the same positions as the current group, and set L to
 2932    the intersection of L and C.  Insert the position in the group, set
 2933    C = C - L, and resume scanning.
 2934 
 2935    If after comparing with every group there are characters remaining in C,
 2936    create a new group labeled with the characters of C and insert this
 2937    position in that group.  */
 2938 
 2939 static state_num
 2940 build_state (state_num s, struct dfa *d, unsigned char uc)
 2941 {
 2942   position_set follows;         /* Union of the follows for each
 2943                                    position of the current state.  */
 2944   position_set group;           /* Positions that match the input char.  */
 2945   position_set tmp;             /* Temporary space for merging sets.  */
 2946   state_num state;              /* New state.  */
 2947   state_num state_newline;      /* New state on a newline transition.  */
 2948   state_num state_letter;       /* New state on a letter transition.  */
 2949 
 2950 #ifdef DEBUG
 2951   fprintf (stderr, "build state %td\n", s);
 2952 #endif
 2953 
 2954   /* A pointer to the new transition table, and the table itself.  */
 2955   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
 2956   state_num *trans = *ptrans;
 2957 
 2958   if (!trans)
 2959     {
 2960       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
 2961          transition tables that can exist at once, other than for
 2962          initial states.  Often-used transition tables are quickly
 2963          rebuilt, whereas rarely-used ones are cleared away.  */
 2964       if (MAX_TRCOUNT <= d->trcount)
 2965         {
 2966           for (state_num i = d->min_trcount; i < d->tralloc; i++)
 2967             {
 2968               free (d->trans[i]);
 2969               free (d->fails[i]);
 2970               d->trans[i] = d->fails[i] = NULL;
 2971             }
 2972           d->trcount = 0;
 2973         }
 2974 
 2975       d->trcount++;
 2976       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
 2977 
 2978       /* Fill transition table with a default value which means that the
 2979          transited state has not been calculated yet.  */
 2980       for (int i = 0; i < NOTCHAR; i++)
 2981         trans[i] = -2;
 2982     }
 2983 
 2984   /* Set up the success bits for this state.  */
 2985   d->success[s] = 0;
 2986   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
 2987     d->success[s] |= CTX_NEWLINE;
 2988   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
 2989     d->success[s] |= CTX_LETTER;
 2990   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
 2991     d->success[s] |= CTX_NONE;
 2992 
 2993   alloc_position_set (&follows, d->nleaves);
 2994 
 2995   /* Find the union of the follows of the positions of the group.
 2996      This is a hideously inefficient loop.  Fix it someday.  */
 2997   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
 2998     for (idx_t k = 0;
 2999          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
 3000       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
 3001               &follows);
 3002 
 3003   /* Positions that match the input char.  */
 3004   alloc_position_set (&group, d->nleaves);
 3005 
 3006   /* The group's label.  */
 3007   charclass label;
 3008   fillset (&label);
 3009 
 3010   for (idx_t i = 0; i < follows.nelem; i++)
 3011     {
 3012       charclass matches;            /* Set of matching characters.  */
 3013       position pos = follows.elems[i];
 3014       bool matched = false;
 3015       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
 3016         {
 3017           zeroset (&matches);
 3018           setbit (d->tokens[pos.index], &matches);
 3019           if (d->tokens[pos.index] == uc)
 3020             matched = true;
 3021         }
 3022       else if (d->tokens[pos.index] >= CSET)
 3023         {
 3024           matches = d->charclasses[d->tokens[pos.index] - CSET];
 3025           if (tstbit (uc, &matches))
 3026             matched = true;
 3027         }
 3028       else if (d->tokens[pos.index] == ANYCHAR)
 3029         {
 3030           matches = d->charclasses[d->canychar];
 3031           if (tstbit (uc, &matches))
 3032             matched = true;
 3033 
 3034           /* ANYCHAR must match with a single character, so we must put
 3035              it to D->states[s].mbps which contains the positions which
 3036              can match with a single character not a byte.  If all
 3037              positions which has ANYCHAR does not depend on context of
 3038              next character, we put the follows instead of it to
 3039              D->states[s].mbps to optimize.  */
 3040           if (succeeds_in_context (pos.constraint, d->states[s].context,
 3041                                    CTX_NONE))
 3042             {
 3043               if (d->states[s].mbps.nelem == 0)
 3044                 alloc_position_set (&d->states[s].mbps, 1);
 3045               insert (pos, &d->states[s].mbps);
 3046             }
 3047         }
 3048       else
 3049         continue;
 3050 
 3051       /* Some characters may need to be eliminated from matches because
 3052          they fail in the current context.  */
 3053       if (pos.constraint != NO_CONSTRAINT)
 3054         {
 3055           if (!succeeds_in_context (pos.constraint,
 3056                                     d->states[s].context, CTX_NEWLINE))
 3057             for (int j = 0; j < CHARCLASS_WORDS; j++)
 3058               matches.w[j] &= ~d->syntax.newline.w[j];
 3059           if (!succeeds_in_context (pos.constraint,
 3060                                     d->states[s].context, CTX_LETTER))
 3061             for (int j = 0; j < CHARCLASS_WORDS; ++j)
 3062               matches.w[j] &= ~d->syntax.letters.w[j];
 3063           if (!succeeds_in_context (pos.constraint,
 3064                                     d->states[s].context, CTX_NONE))
 3065             for (int j = 0; j < CHARCLASS_WORDS; ++j)
 3066               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
 3067 
 3068           /* If there are no characters left, there's no point in going on.  */
 3069           if (emptyset (&matches))
 3070             continue;
 3071 
 3072           /* If we have reset the bit that made us declare "matched", reset
 3073              that indicator, too.  This is required to avoid an infinite loop
 3074              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
 3075           if (!tstbit (uc, &matches))
 3076             matched = false;
 3077         }
 3078 
 3079 #ifdef DEBUG
 3080       fprintf (stderr, " nextpos %td:", pos.index);
 3081       prtok (d->tokens[pos.index]);
 3082       fprintf (stderr, " of");
 3083       for (unsigned j = 0; j < NOTCHAR; j++)
 3084         if (tstbit (j, &matches))
 3085           fprintf (stderr, " 0x%02x", j);
 3086       fprintf (stderr, "\n");
 3087 #endif
 3088 
 3089       if (matched)
 3090         {
 3091           for (int k = 0; k < CHARCLASS_WORDS; ++k)
 3092             label.w[k] &= matches.w[k];
 3093           append (pos, &group);
 3094         }
 3095       else
 3096         {
 3097           for (int k = 0; k < CHARCLASS_WORDS; ++k)
 3098             label.w[k] &= ~matches.w[k];
 3099         }
 3100     }
 3101 
 3102   alloc_position_set (&tmp, d->nleaves);
 3103 
 3104   if (group.nelem > 0)
 3105     {
 3106       /* If we are building a searching matcher, throw in the positions
 3107          of state 0 as well, if possible.  */
 3108       if (d->searchflag)
 3109         {
 3110           /* If a token in follows.elems is not 1st byte of a multibyte
 3111              character, or the states of follows must accept the bytes
 3112              which are not 1st byte of the multibyte character.
 3113              Then, if a state of follows encounters a byte, it must not be
 3114              a 1st byte of a multibyte character nor a single byte character.
 3115              In this case, do not add state[0].follows to next state, because
 3116              state[0] must accept 1st-byte.
 3117 
 3118              For example, suppose <sb a> is a certain single byte character,
 3119              <mb A> is a certain multibyte character, and the codepoint of
 3120              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
 3121              state[0] accepts <sb a>, state[i] transits to state[i+1] by
 3122              accepting the 1st byte of <mb A>, and state[i+1] accepts the
 3123              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
 3124              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
 3125              not add state[0].  */
 3126 
 3127           bool mergeit = !d->localeinfo.multibyte;
 3128           if (!mergeit)
 3129             {
 3130               mergeit = true;
 3131               for (idx_t j = 0; mergeit && j < group.nelem; j++)
 3132                 mergeit &= d->multibyte_prop[group.elems[j].index];
 3133             }
 3134           if (mergeit)
 3135             {
 3136               merge (&d->states[0].elems, &group, &tmp);
 3137               copy (&tmp, &group);
 3138             }
 3139         }
 3140 
 3141       /* Find out if the new state will want any context information,
 3142          by calculating possible contexts that the group can match,
 3143          and separate contexts that the new state wants to know.  */
 3144       int possible_contexts = charclass_context (d, &label);
 3145       int separate_contexts = state_separate_contexts (d, &group);
 3146 
 3147       /* Find the state(s) corresponding to the union of the follows.  */
 3148       if (possible_contexts & ~separate_contexts)
 3149         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
 3150       else
 3151         state = -1;
 3152       if (separate_contexts & possible_contexts & CTX_NEWLINE)
 3153         state_newline = state_index (d, &group, CTX_NEWLINE);
 3154       else
 3155         state_newline = state;
 3156       if (separate_contexts & possible_contexts & CTX_LETTER)
 3157         state_letter = state_index (d, &group, CTX_LETTER);
 3158       else
 3159         state_letter = state;
 3160 
 3161       /* Reallocate now, to reallocate any newline transition properly.  */
 3162       realloc_trans_if_necessary (d);
 3163     }
 3164 
 3165   /* If we are a searching matcher, the default transition is to a state
 3166      containing the positions of state 0, otherwise the default transition
 3167      is to fail miserably.  */
 3168   else if (d->searchflag)
 3169     {
 3170       state_newline = 0;
 3171       state_letter = d->min_trcount - 1;
 3172       state = d->initstate_notbol;
 3173     }
 3174   else
 3175     {
 3176       state_newline = -1;
 3177       state_letter = -1;
 3178       state = -1;
 3179     }
 3180 
 3181   /* Set the transitions for each character in the label.  */
 3182   for (int i = 0; i < NOTCHAR; i++)
 3183     if (tstbit (i, &label))
 3184       switch (d->syntax.sbit[i])
 3185         {
 3186         case CTX_NEWLINE:
 3187           trans[i] = state_newline;
 3188           break;
 3189         case CTX_LETTER:
 3190           trans[i] = state_letter;
 3191           break;
 3192         default:
 3193           trans[i] = state;
 3194           break;
 3195         }
 3196 
 3197 #ifdef DEBUG
 3198   fprintf (stderr, "trans table %td", s);
 3199   for (int i = 0; i < NOTCHAR; ++i)
 3200     {
 3201       if (!(i & 0xf))
 3202         fprintf (stderr, "\n");
 3203       fprintf (stderr, " %2td", trans[i]);
 3204     }
 3205   fprintf (stderr, "\n");
 3206 #endif
 3207 
 3208   free (group.elems);
 3209   free (follows.elems);
 3210   free (tmp.elems);
 3211 
 3212   /* Keep the newline transition in a special place so we can use it as
 3213      a sentinel.  */
 3214   if (tstbit (d->syntax.eolbyte, &label))
 3215     {
 3216       d->newlines[s] = trans[d->syntax.eolbyte];
 3217       trans[d->syntax.eolbyte] = -1;
 3218     }
 3219 
 3220   return trans[uc];
 3221 }
 3222 
 3223 /* Multibyte character handling sub-routines for dfaexec.  */
 3224 
 3225 /* Consume a single byte and transit state from 's' to '*next_state'.
 3226    This function is almost same as the state transition routin in dfaexec.
 3227    But state transition is done just once, otherwise matching succeed or
 3228    reach the end of the buffer.  */
 3229 static state_num
 3230 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
 3231 {
 3232   state_num *t;
 3233 
 3234   if (d->trans[s])
 3235     t = d->trans[s];
 3236   else if (d->fails[s])
 3237     t = d->fails[s];
 3238   else
 3239     {
 3240       build_state (s, d, **pp);
 3241       if (d->trans[s])
 3242         t = d->trans[s];
 3243       else
 3244         {
 3245           t = d->fails[s];
 3246           assert (t);
 3247         }
 3248     }
 3249 
 3250   if (t[**pp] == -2)
 3251     build_state (s, d, **pp);
 3252 
 3253   return t[*(*pp)++];
 3254 }
 3255 
 3256 /* Transit state from s, then return new state and update the pointer of
 3257    the buffer.  This function is for a period operator which can match a
 3258    multi-byte character.  */
 3259 static state_num
 3260 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
 3261                unsigned char const *end)
 3262 {
 3263   wint_t wc;
 3264 
 3265   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
 3266 
 3267   /* This state has some operators which can match a multibyte character.  */
 3268   d->mb_follows.nelem = 0;
 3269 
 3270   /* Calculate the state which can be reached from the state 's' by
 3271      consuming 'mbclen' single bytes from the buffer.  */
 3272   state_num s1 = s;
 3273   int mbci;
 3274   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
 3275     s = transit_state_singlebyte (d, s, pp);
 3276   *pp += mbclen - mbci;
 3277 
 3278   if (wc == WEOF)
 3279     {
 3280       /* It is an invalid character, so ANYCHAR is not accepted.  */
 3281       return s;
 3282     }
 3283 
 3284   /* If all positions which have ANYCHAR do not depend on the context
 3285      of the next character, calculate the next state with
 3286      pre-calculated follows and cache the result.  */
 3287   if (d->states[s1].mb_trindex < 0)
 3288     {
 3289       if (MAX_TRCOUNT <= d->mb_trcount)
 3290         {
 3291           state_num s3;
 3292           for (s3 = -1; s3 < d->tralloc; s3++)
 3293             {
 3294               free (d->mb_trans[s3]);
 3295               d->mb_trans[s3] = NULL;
 3296             }
 3297 
 3298           for (state_num i = 0; i < d->sindex; i++)
 3299             d->states[i].mb_trindex = -1;
 3300           d->mb_trcount = 0;
 3301         }
 3302       d->states[s1].mb_trindex = d->mb_trcount++;
 3303     }
 3304 
 3305   if (! d->mb_trans[s])
 3306     {
 3307       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
 3308       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
 3309       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
 3310       for (int i = 0; i < MAX_TRCOUNT; i++)
 3311         d->mb_trans[s][i] = -1;
 3312     }
 3313   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
 3314     return d->mb_trans[s][d->states[s1].mb_trindex];
 3315 
 3316   if (s == -1)
 3317     copy (&d->states[s1].mbps, &d->mb_follows);
 3318   else
 3319     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
 3320 
 3321   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
 3322   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
 3323   realloc_trans_if_necessary (d);
 3324 
 3325   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
 3326 
 3327   return s2;
 3328 }
 3329 
 3330 /* The initial state may encounter a byte which is not a single byte character
 3331    nor the first byte of a multibyte character.  But it is incorrect for the
 3332    initial state to accept such a byte.  For example, in Shift JIS the regular
 3333    expression "\\" accepts the codepoint 0x5c, but should not accept the second
 3334    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
 3335    that are not a single byte character nor the first byte of a multibyte
 3336    character.
 3337 
 3338    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
 3339    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
 3340    the result is greater than P, set *WCP to the final wide character
 3341    processed, or to WEOF if no wide character is processed.  Otherwise,
 3342    if WCP is non-NULL, *WCP may or may not be updated.
 3343 
 3344    Both P and MBP must be no larger than END.  */
 3345 static unsigned char const *
 3346 skip_remains_mb (struct dfa *d, unsigned char const *p,
 3347                  unsigned char const *mbp, char const *end)
 3348 {
 3349   if (d->syntax.never_trail[*p])
 3350     return p;
 3351   while (mbp < p)
 3352     {
 3353       wint_t wc;
 3354       mbp += mbs_to_wchar (&wc, (char const *) mbp,
 3355                            end - (char const *) mbp, d);
 3356     }
 3357   return mbp;
 3358 }
 3359 
 3360 /* Search through a buffer looking for a match to the struct dfa *D.
 3361    Find the first occurrence of a string matching the regexp in the
 3362    buffer, and the shortest possible version thereof.  Return a pointer to
 3363    the first character after the match, or NULL if none is found.  BEGIN
 3364    points to the beginning of the buffer, and END points to the first byte
 3365    after its end.  Note however that we store a sentinel byte (usually
 3366    newline) in *END, so the actual buffer must be one byte longer.
 3367    When ALLOW_NL, newlines may appear in the matching string.
 3368    If COUNT is non-NULL, increment *COUNT once for each newline processed.
 3369    If MULTIBYTE, the input consists of multibyte characters and/or
 3370    encoding-error bytes.  Otherwise, it consists of single-byte characters.
 3371    Here is the list of features that make this DFA matcher punt:
 3372     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
 3373     - [^...] in non-simple locale
 3374     - [[=foo=]] or [[.foo.]]
 3375     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
 3376     - back-reference: (.)\1
 3377     - word-delimiter in multibyte locale: \<, \>, \b, \B
 3378    See struct localeinfo.simple for the definition of "simple locale".  */
 3379 
 3380 static inline char *
 3381 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
 3382               ptrdiff_t *count, bool multibyte)
 3383 {
 3384   if (MAX_TRCOUNT <= d->sindex)
 3385     {
 3386       for (state_num s = d->min_trcount; s < d->sindex; s++)
 3387         {
 3388           free (d->states[s].elems.elems);
 3389           free (d->states[s].mbps.elems);
 3390         }
 3391       d->sindex = d->min_trcount;
 3392 
 3393       if (d->trans)
 3394         {
 3395           for (state_num s = 0; s < d->tralloc; s++)
 3396             {
 3397               free (d->trans[s]);
 3398               free (d->fails[s]);
 3399               d->trans[s] = d->fails[s] = NULL;
 3400             }
 3401           d->trcount = 0;
 3402         }
 3403 
 3404       if (d->localeinfo.multibyte && d->mb_trans)
 3405         {
 3406           for (state_num s = -1; s < d->tralloc; s++)
 3407             {
 3408               free (d->mb_trans[s]);
 3409               d->mb_trans[s] = NULL;
 3410             }
 3411           for (state_num s = 0; s < d->min_trcount; s++)
 3412             d->states[s].mb_trindex = -1;
 3413           d->mb_trcount = 0;
 3414         }
 3415     }
 3416 
 3417   if (!d->tralloc)
 3418     realloc_trans_if_necessary (d);
 3419 
 3420   /* Current state.  */
 3421   state_num s = 0, s1 = 0;
 3422 
 3423   /* Current input character.  */
 3424   unsigned char const *p = (unsigned char const *) begin;
 3425   unsigned char const *mbp = p;
 3426 
 3427   /* Copy of d->trans so it can be optimized into a register.  */
 3428   state_num **trans = d->trans;
 3429   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
 3430   unsigned char saved_end = *(unsigned char *) end;
 3431   *end = eol;
 3432 
 3433   if (multibyte)
 3434     {
 3435       memset (&d->mbs, 0, sizeof d->mbs);
 3436       if (d->mb_follows.alloc == 0)
 3437         alloc_position_set (&d->mb_follows, d->nleaves);
 3438     }
 3439 
 3440   idx_t nlcount = 0;
 3441   for (;;)
 3442     {
 3443       state_num *t;
 3444       while ((t = trans[s]) != NULL)
 3445         {
 3446           if (s < d->min_trcount)
 3447             {
 3448               if (!multibyte || d->states[s].mbps.nelem == 0)
 3449                 {
 3450                   while (t[*p] == s)
 3451                     p++;
 3452                 }
 3453               if (multibyte)
 3454                 p = mbp = skip_remains_mb (d, p, mbp, end);
 3455             }
 3456 
 3457           if (multibyte)
 3458             {
 3459               s1 = s;
 3460 
 3461               if (d->states[s].mbps.nelem == 0
 3462                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
 3463                 {
 3464                   /* If an input character does not match ANYCHAR, do it
 3465                      like a single-byte character.  */
 3466                   s = t[*p++];
 3467                 }
 3468               else
 3469                 {
 3470                   s = transit_state (d, s, &p, (unsigned char *) end);
 3471                   mbp = p;
 3472                   trans = d->trans;
 3473                 }
 3474             }
 3475           else
 3476             {
 3477               s1 = t[*p++];
 3478               t = trans[s1];
 3479               if (! t)
 3480                 {
 3481                   state_num tmp = s;
 3482                   s = s1;
 3483                   s1 = tmp;     /* swap */
 3484                   break;
 3485                 }
 3486               if (s < d->min_trcount)
 3487                 {
 3488                   while (t[*p] == s1)
 3489                     p++;
 3490                 }
 3491               s = t[*p++];
 3492             }
 3493         }
 3494 
 3495       if (s < 0)
 3496         {
 3497           if (s == -2)
 3498             {
 3499               s = build_state (s1, d, p[-1]);
 3500               trans = d->trans;
 3501             }
 3502           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
 3503             {
 3504               /* The previous character was a newline.  Count it, and skip
 3505                  checking of multibyte character boundary until here.  */
 3506               nlcount++;
 3507               mbp = p;
 3508 
 3509               s = (allow_nl ? d->newlines[s1]
 3510                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
 3511                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
 3512                    : d->initstate_notbol);
 3513             }
 3514           else
 3515             {
 3516               p = NULL;
 3517               goto done;
 3518             }
 3519         }
 3520       else if (d->fails[s])
 3521         {
 3522           if ((d->success[s] & d->syntax.sbit[*p])
 3523               || ((char *) p == end
 3524                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
 3525                                          d)))
 3526             goto done;
 3527 
 3528           if (multibyte && s < d->min_trcount)
 3529             p = mbp = skip_remains_mb (d, p, mbp, end);
 3530 
 3531           s1 = s;
 3532           if (!multibyte || d->states[s].mbps.nelem == 0
 3533               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
 3534             {
 3535               /* If a input character does not match ANYCHAR, do it
 3536                  like a single-byte character.  */
 3537               s = d->fails[s][*p++];
 3538             }
 3539           else
 3540             {
 3541               s = transit_state (d, s, &p, (unsigned char *) end);
 3542               mbp = p;
 3543               trans = d->trans;
 3544             }
 3545         }
 3546       else
 3547         {
 3548           build_state (s, d, p[0]);
 3549           trans = d->trans;
 3550         }
 3551     }
 3552 
 3553  done:
 3554   if (count)
 3555     *count += nlcount;
 3556   *end = saved_end;
 3557   return (char *) p;
 3558 }
 3559 
 3560 /* Specialized versions of dfaexec for multibyte and single-byte cases.
 3561    This is for performance, as dfaexec_main is an inline function.  */
 3562 
 3563 static char *
 3564 dfaexec_mb (struct dfa *d, char const *begin, char *end,
 3565             bool allow_nl, ptrdiff_t *count, bool *backref)
 3566 {
 3567   return dfaexec_main (d, begin, end, allow_nl, count, true);
 3568 }
 3569 
 3570 static char *
 3571 dfaexec_sb (struct dfa *d, char const *begin, char *end,
 3572             bool allow_nl, ptrdiff_t *count, bool *backref)
 3573 {
 3574   return dfaexec_main (d, begin, end, allow_nl, count, false);
 3575 }
 3576 
 3577 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
 3578    any regexp that uses a construct not supported by this code.  */
 3579 static char *
 3580 dfaexec_noop (struct dfa *d, char const *begin, char *end,
 3581               bool allow_nl, ptrdiff_t *count, bool *backref)
 3582 {
 3583   *backref = true;
 3584   return (char *) begin;
 3585 }
 3586 
 3587 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
 3588    but faster and set *BACKREF if the DFA code does not support this
 3589    regexp usage.  */
 3590 
 3591 char *
 3592 dfaexec (struct dfa *d, char const *begin, char *end,
 3593          bool allow_nl, ptrdiff_t *count, bool *backref)
 3594 {
 3595   return d->dfaexec (d, begin, end, allow_nl, count, backref);
 3596 }
 3597 
 3598 struct dfa *
 3599 dfasuperset (struct dfa const *d)
 3600 {
 3601   return d->superset;
 3602 }
 3603 
 3604 bool
 3605 dfaisfast (struct dfa const *d)
 3606 {
 3607   return d->fast;
 3608 }
 3609 
 3610 static void
 3611 free_mbdata (struct dfa *d)
 3612 {
 3613   free (d->multibyte_prop);
 3614   free (d->lex.brack.chars);
 3615   free (d->mb_follows.elems);
 3616 
 3617   if (d->mb_trans)
 3618     {
 3619       state_num s;
 3620       for (s = -1; s < d->tralloc; s++)
 3621         free (d->mb_trans[s]);
 3622       free (d->mb_trans - 2);
 3623     }
 3624 }
 3625 
 3626 /* Return true if every construct in D is supported by this DFA matcher.  */
 3627 static bool _GL_ATTRIBUTE_PURE
 3628 dfa_supported (struct dfa const *d)
 3629 {
 3630   for (idx_t i = 0; i < d->tindex; i++)
 3631     {
 3632       switch (d->tokens[i])
 3633         {
 3634         case BEGWORD:
 3635         case ENDWORD:
 3636         case LIMWORD:
 3637         case NOTLIMWORD:
 3638           if (!d->localeinfo.multibyte)
 3639             continue;
 3640           FALLTHROUGH;
 3641         case BACKREF:
 3642         case MBCSET:
 3643           return false;
 3644         }
 3645     }
 3646   return true;
 3647 }
 3648 
 3649 /* Disable use of the superset DFA if it is not likely to help
 3650    performance.  */
 3651 static void
 3652 maybe_disable_superset_dfa (struct dfa *d)
 3653 {
 3654   if (!d->localeinfo.using_utf8)
 3655     return;
 3656 
 3657   bool have_backref = false;
 3658   for (idx_t i = 0; i < d->tindex; i++)
 3659     {
 3660       switch (d->tokens[i])
 3661         {
 3662         case ANYCHAR:
 3663           /* Lowered.  */
 3664           abort ();
 3665         case BACKREF:
 3666           have_backref = true;
 3667           break;
 3668         case MBCSET:
 3669           /* Requires multi-byte algorithm.  */
 3670           return;
 3671         default:
 3672           break;
 3673         }
 3674     }
 3675 
 3676   if (!have_backref && d->superset)
 3677     {
 3678       /* The superset DFA is not likely to be much faster, so remove it.  */
 3679       dfafree (d->superset);
 3680       free (d->superset);
 3681       d->superset = NULL;
 3682     }
 3683 
 3684   free_mbdata (d);
 3685   d->localeinfo.multibyte = false;
 3686   d->dfaexec = dfaexec_sb;
 3687   d->fast = true;
 3688 }
 3689 
 3690 static void
 3691 dfassbuild (struct dfa *d)
 3692 {
 3693   struct dfa *sup = dfaalloc ();
 3694 
 3695   *sup = *d;
 3696   sup->localeinfo.multibyte = false;
 3697   sup->dfaexec = dfaexec_sb;
 3698   sup->multibyte_prop = NULL;
 3699   sup->superset = NULL;
 3700   sup->states = NULL;
 3701   sup->sindex = 0;
 3702   sup->constraints = NULL;
 3703   sup->separates = NULL;
 3704   sup->follows = NULL;
 3705   sup->tralloc = 0;
 3706   sup->trans = NULL;
 3707   sup->fails = NULL;
 3708   sup->success = NULL;
 3709   sup->newlines = NULL;
 3710 
 3711   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
 3712   if (d->cindex)
 3713     {
 3714       memcpy (sup->charclasses, d->charclasses,
 3715               d->cindex * sizeof *sup->charclasses);
 3716     }
 3717 
 3718   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
 3719   sup->talloc = d->tindex * 2;
 3720 
 3721   bool have_achar = false;
 3722   bool have_nchar = false;
 3723   idx_t j;
 3724   for (idx_t i = j = 0; i < d->tindex; i++)
 3725     {
 3726       switch (d->tokens[i])
 3727         {
 3728         case ANYCHAR:
 3729         case MBCSET:
 3730         case BACKREF:
 3731           {
 3732             charclass ccl;
 3733             fillset (&ccl);
 3734             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
 3735             sup->tokens[j++] = STAR;
 3736             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
 3737                 || d->tokens[i + 1] == PLUS)
 3738               i++;
 3739             have_achar = true;
 3740           }
 3741           break;
 3742         case BEGWORD:
 3743         case ENDWORD:
 3744         case LIMWORD:
 3745         case NOTLIMWORD:
 3746           if (d->localeinfo.multibyte)
 3747             {
 3748               /* These constraints aren't supported in a multibyte locale.
 3749                  Ignore them in the superset DFA.  */
 3750               sup->tokens[j++] = EMPTY;
 3751               break;
 3752             }
 3753           FALLTHROUGH;
 3754         default:
 3755           sup->tokens[j++] = d->tokens[i];
 3756           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
 3757               || d->tokens[i] >= CSET)
 3758             have_nchar = true;
 3759           break;
 3760         }
 3761     }
 3762   sup->tindex = j;
 3763 
 3764   if (have_nchar && (have_achar || d->localeinfo.multibyte))
 3765     d->superset = sup;
 3766   else
 3767     {
 3768       dfafree (sup);
 3769       free (sup);
 3770     }
 3771 }
 3772 
 3773 /* Parse a string S of length LEN into D (but skip this step if S is null).
 3774    Then analyze D and build a matcher for it.
 3775    SEARCHFLAG says whether to build a searching or an exact matcher.  */
 3776 void
 3777 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
 3778 {
 3779   if (s != NULL)
 3780     dfaparse (s, len, d);
 3781 
 3782   dfassbuild (d);
 3783 
 3784   if (dfa_supported (d))
 3785     {
 3786       maybe_disable_superset_dfa (d);
 3787       dfaanalyze (d, searchflag);
 3788     }
 3789   else
 3790     {
 3791       d->dfaexec = dfaexec_noop;
 3792     }
 3793 
 3794   if (d->superset)
 3795     {
 3796       d->fast = true;
 3797       dfaanalyze (d->superset, searchflag);
 3798     }
 3799 }
 3800 
 3801 /* Free the storage held by the components of a dfa.  */
 3802 void
 3803 dfafree (struct dfa *d)
 3804 {
 3805   free (d->charclasses);
 3806   free (d->tokens);
 3807 
 3808   if (d->localeinfo.multibyte)
 3809     free_mbdata (d);
 3810 
 3811   free (d->constraints);
 3812   free (d->separates);
 3813 
 3814   for (idx_t i = 0; i < d->sindex; i++)
 3815     {
 3816       free (d->states[i].elems.elems);
 3817       free (d->states[i].mbps.elems);
 3818     }
 3819   free (d->states);
 3820 
 3821   if (d->follows)
 3822     {
 3823       for (idx_t i = 0; i < d->tindex; i++)
 3824         free (d->follows[i].elems);
 3825       free (d->follows);
 3826     }
 3827 
 3828   if (d->trans)
 3829     {
 3830       for (idx_t i = 0; i < d->tralloc; i++)
 3831         {
 3832           free (d->trans[i]);
 3833           free (d->fails[i]);
 3834         }
 3835 
 3836       free (d->trans - 2);
 3837       free (d->fails);
 3838       free (d->newlines);
 3839       free (d->success);
 3840     }
 3841 
 3842   if (d->superset)
 3843     {
 3844       dfafree (d->superset);
 3845       free (d->superset);
 3846     }
 3847 }
 3848 
 3849 /* Having found the postfix representation of the regular expression,
 3850    try to find a long sequence of characters that must appear in any line
 3851    containing the r.e.
 3852    Finding a "longest" sequence is beyond the scope here;
 3853    we take an easy way out and hope for the best.
 3854    (Take "(ab|a)b"--please.)
 3855 
 3856    We do a bottom-up calculation of sequences of characters that must appear
 3857    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
 3858    representation:
 3859         sequences that must appear at the left of the match ("left")
 3860         sequences that must appear at the right of the match ("right")
 3861         lists of sequences that must appear somewhere in the match ("in")
 3862         sequences that must constitute the match ("is")
 3863 
 3864    When we get to the root of the tree, we use one of the longest of its
 3865    calculated "in" sequences as our answer.
 3866 
 3867    The sequences calculated for the various types of node (in pseudo ANSI c)
 3868    are shown below.  "p" is the operand of unary operators (and the left-hand
 3869    operand of binary operators); "q" is the right-hand operand of binary
 3870    operators.
 3871 
 3872    "ZERO" means "a zero-length sequence" below.
 3873 
 3874         Type    left        right       is      in
 3875         ----    ----        -----       --      --
 3876         char c  # c     # c     # c     # c
 3877 
 3878         ANYCHAR ZERO        ZERO        ZERO        ZERO
 3879 
 3880         MBCSET  ZERO        ZERO        ZERO        ZERO
 3881 
 3882         CSET    ZERO        ZERO        ZERO        ZERO
 3883 
 3884         STAR    ZERO        ZERO        ZERO        ZERO
 3885 
 3886         QMARK   ZERO        ZERO        ZERO        ZERO
 3887 
 3888         PLUS    p->left     p->right    ZERO        p->in
 3889 
 3890         CAT (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
 3891                 p->left :   q->right :  q->is!=ZERO) ?  q->in plus
 3892                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
 3893                                                 ZERO
 3894 
 3895         OR  longest common  longest common  (do p->is and substrings common
 3896                 leading     trailing    to q->is have same p->in and
 3897                 (sub)sequence   (sub)sequence   q->in length and content) ?
 3898                 of p->left  of p->right
 3899                 and q->left and q->right    p->is : NULL
 3900 
 3901    If there's anything else we recognize in the tree, all four sequences get set
 3902    to zero-length sequences.  If there's something we don't recognize in the
 3903    tree, we just return a zero-length sequence.
 3904 
 3905    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
 3906    'aaa')?
 3907 
 3908    And ... is it here or someplace that we might ponder "optimizations" such as
 3909         egrep 'psi|epsilon' ->  egrep 'psi'
 3910         egrep 'pepsi|epsilon'   ->  egrep 'epsi'
 3911                                         (Yes, we now find "epsi" as a "string
 3912                                         that must occur", but we might also
 3913                                         simplify the *entire* r.e. being sought)
 3914         grep '[c]'      ->  grep 'c'
 3915         grep '(ab|a)b'      ->  grep 'ab'
 3916         grep 'ab*'      ->  grep 'a'
 3917         grep 'a*b'      ->  grep 'b'
 3918 
 3919    There are several issues:
 3920 
 3921    Is optimization easy (enough)?
 3922 
 3923    Does optimization actually accomplish anything,
 3924    or is the automaton you get from "psi|epsilon" (for example)
 3925    the same as the one you get from "psi" (for example)?
 3926 
 3927    Are optimizable r.e.'s likely to be used in real-life situations
 3928    (something like 'ab*' is probably unlikely; something like is
 3929    'psi|epsilon' is likelier)?  */
 3930 
 3931 static char *
 3932 icatalloc (char *old, char const *new)
 3933 {
 3934   idx_t newsize = strlen (new);
 3935   if (newsize == 0)
 3936     return old;
 3937   idx_t oldsize = strlen (old);
 3938   char *result = xrealloc (old, oldsize + newsize + 1);
 3939   memcpy (result + oldsize, new, newsize + 1);
 3940   return result;
 3941 }
 3942 
 3943 static void
 3944 freelist (char **cpp)
 3945 {
 3946   while (*cpp)
 3947     free (*cpp++);
 3948 }
 3949 
 3950 static char **
 3951 enlist (char **cpp, char *new, idx_t len)
 3952 {
 3953   new = memcpy (xmalloc (len + 1), new, len);
 3954   new[len] = '\0';
 3955   /* Is there already something in the list that's new (or longer)?  */
 3956   idx_t i;
 3957   for (i = 0; cpp[i] != NULL; i++)
 3958     if (strstr (cpp[i], new) != NULL)
 3959       {
 3960         free (new);
 3961         return cpp;
 3962       }
 3963   /* Eliminate any obsoleted strings.  */
 3964   for (idx_t j = 0; cpp[j] != NULL; )
 3965     if (strstr (new, cpp[j]) == NULL)
 3966       ++j;
 3967     else
 3968       {
 3969         free (cpp[j]);
 3970         if (--i == j)
 3971           break;
 3972         cpp[j] = cpp[i];
 3973         cpp[i] = NULL;
 3974       }
 3975   /* Add the new string.  */
 3976   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
 3977   cpp[i] = new;
 3978   cpp[i + 1] = NULL;
 3979   return cpp;
 3980 }
 3981 
 3982 /* Given pointers to two strings, return a pointer to an allocated
 3983    list of their distinct common substrings.  */
 3984 static char **
 3985 comsubs (char *left, char const *right)
 3986 {
 3987   char **cpp = xzalloc (sizeof *cpp);
 3988 
 3989   for (char *lcp = left; *lcp != '\0'; lcp++)
 3990     {
 3991       idx_t len = 0;
 3992       char *rcp = strchr (right, *lcp);
 3993       while (rcp != NULL)
 3994         {
 3995           idx_t i;
 3996           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
 3997             continue;
 3998           if (i > len)
 3999             len = i;
 4000           rcp = strchr (rcp + 1, *lcp);
 4001         }
 4002       if (len != 0)
 4003         cpp = enlist (cpp, lcp, len);
 4004     }
 4005   return cpp;
 4006 }
 4007 
 4008 static char **
 4009 addlists (char **old, char **new)
 4010 {
 4011   for (; *new; new++)
 4012     old = enlist (old, *new, strlen (*new));
 4013   return old;
 4014 }
 4015 
 4016 /* Given two lists of substrings, return a new list giving substrings
 4017    common to both.  */
 4018 static char **
 4019 inboth (char **left, char **right)
 4020 {
 4021   char **both = xzalloc (sizeof *both);
 4022 
 4023   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
 4024     {
 4025       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
 4026         {
 4027           char **temp = comsubs (left[lnum], right[rnum]);
 4028           both = addlists (both, temp);
 4029           freelist (temp);
 4030           free (temp);
 4031         }
 4032     }
 4033   return both;
 4034 }
 4035 
 4036 typedef struct must must;
 4037 
 4038 struct must
 4039 {
 4040   char **in;
 4041   char *left;
 4042   char *right;
 4043   char *is;
 4044   bool begline;
 4045   bool endline;
 4046   must *prev;
 4047 };
 4048 
 4049 static must *
 4050 allocmust (must *mp, idx_t size)
 4051 {
 4052   must *new_mp = xmalloc (sizeof *new_mp);
 4053   new_mp->in = xzalloc (sizeof *new_mp->in);
 4054   new_mp->left = xzalloc (size);
 4055   new_mp->right = xzalloc (size);
 4056   new_mp->is = xzalloc (size);
 4057   new_mp->begline = false;
 4058   new_mp->endline = false;
 4059   new_mp->prev = mp;
 4060   return new_mp;
 4061 }
 4062 
 4063 static void
 4064 resetmust (must *mp)
 4065 {
 4066   freelist (mp->in);
 4067   mp->in[0] = NULL;
 4068   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
 4069   mp->begline = false;
 4070   mp->endline = false;
 4071 }
 4072 
 4073 static void
 4074 freemust (must *mp)
 4075 {
 4076   freelist (mp->in);
 4077   free (mp->in);
 4078   free (mp->left);
 4079   free (mp->right);
 4080   free (mp->is);
 4081   free (mp);
 4082 }
 4083 
 4084 struct dfamust *
 4085 dfamust (struct dfa const *d)
 4086 {
 4087   must *mp = NULL;
 4088   char const *result = "";
 4089   bool exact = false;
 4090   bool begline = false;
 4091   bool endline = false;
 4092   bool need_begline = false;
 4093   bool need_endline = false;
 4094   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
 4095 
 4096   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
 4097     {
 4098       token t = d->tokens[ri];
 4099       switch (t)
 4100         {
 4101         case BEGLINE:
 4102           mp = allocmust (mp, 2);
 4103           mp->begline = true;
 4104           need_begline = true;
 4105           break;
 4106         case ENDLINE:
 4107           mp = allocmust (mp, 2);
 4108           mp->endline = true;
 4109           need_endline = true;
 4110           break;
 4111         case LPAREN:
 4112         case RPAREN:
 4113           assert (!"neither LPAREN nor RPAREN may appear here");
 4114 
 4115         case EMPTY:
 4116         case BEGWORD:
 4117         case ENDWORD:
 4118         case LIMWORD:
 4119         case NOTLIMWORD:
 4120         case BACKREF:
 4121         case ANYCHAR:
 4122         case MBCSET:
 4123           mp = allocmust (mp, 2);
 4124           break;
 4125 
 4126         case STAR:
 4127         case QMARK:
 4128           resetmust (mp);
 4129           break;
 4130 
 4131         case OR:
 4132           {
 4133             char **new;
 4134             must *rmp = mp;
 4135             must *lmp = mp = mp->prev;
 4136             idx_t j, ln, rn, n;
 4137 
 4138             /* Guaranteed to be.  Unlikely, but ...  */
 4139             if (streq (lmp->is, rmp->is))
 4140               {
 4141                 lmp->begline &= rmp->begline;
 4142                 lmp->endline &= rmp->endline;
 4143               }
 4144             else
 4145               {
 4146                 lmp->is[0] = '\0';
 4147                 lmp->begline = false;
 4148                 lmp->endline = false;
 4149               }
 4150             /* Left side--easy */
 4151             idx_t i = 0;
 4152             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
 4153               ++i;
 4154             lmp->left[i] = '\0';
 4155             /* Right side */
 4156             ln = strlen (lmp->right);
 4157             rn = strlen (rmp->right);
 4158             n = ln;
 4159             if (n > rn)
 4160               n = rn;
 4161             for (i = 0; i < n; ++i)
 4162               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
 4163                 break;
 4164             for (j = 0; j < i; ++j)
 4165               lmp->right[j] = lmp->right[(ln - i) + j];
 4166             lmp->right[j] = '\0';
 4167             new = inboth (lmp->in, rmp->in);
 4168             freelist (lmp->in);
 4169             free (lmp->in);
 4170             lmp->in = new;
 4171             freemust (rmp);
 4172           }
 4173           break;
 4174 
 4175         case PLUS:
 4176           mp->is[0] = '\0';
 4177           break;
 4178 
 4179         case END:
 4180           assert (!mp->prev);
 4181           for (idx_t i = 0; mp->in[i] != NULL; i++)
 4182             if (strlen (mp->in[i]) > strlen (result))
 4183               result = mp->in[i];
 4184           if (streq (result, mp->is))
 4185             {
 4186               if ((!need_begline || mp->begline) && (!need_endline
 4187                                                      || mp->endline))
 4188                 exact = true;
 4189               begline = mp->begline;
 4190               endline = mp->endline;
 4191             }
 4192           goto done;
 4193 
 4194         case CAT:
 4195           {
 4196             must *rmp = mp;
 4197             must *lmp = mp = mp->prev;
 4198 
 4199             /* In.  Everything in left, plus everything in
 4200                right, plus concatenation of
 4201                left's right and right's left.  */
 4202             lmp->in = addlists (lmp->in, rmp->in);
 4203             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
 4204               {
 4205                 idx_t lrlen = strlen (lmp->right);
 4206                 idx_t rllen = strlen (rmp->left);
 4207                 char *tp = xmalloc (lrlen + rllen);
 4208                 memcpy (tp, lmp->right, lrlen);
 4209                 memcpy (tp + lrlen, rmp->left, rllen);
 4210                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
 4211                 free (tp);
 4212               }
 4213             /* Left-hand */
 4214             if (lmp->is[0] != '\0')
 4215               lmp->left = icatalloc (lmp->left, rmp->left);
 4216             /* Right-hand */
 4217             if (rmp->is[0] == '\0')
 4218               lmp->right[0] = '\0';
 4219             lmp->right = icatalloc (lmp->right, rmp->right);
 4220             /* Guaranteed to be */
 4221             if ((lmp->is[0] != '\0' || lmp->begline)
 4222                 && (rmp->is[0] != '\0' || rmp->endline))
 4223               {
 4224                 lmp->is = icatalloc (lmp->is, rmp->is);
 4225                 lmp->endline = rmp->endline;
 4226               }
 4227             else
 4228               {
 4229                 lmp->is[0] = '\0';
 4230                 lmp->begline = false;
 4231                 lmp->endline = false;
 4232               }
 4233             freemust (rmp);
 4234           }
 4235           break;
 4236 
 4237         case '\0':
 4238           /* Not on *my* shift.  */
 4239           goto done;
 4240 
 4241         default:
 4242           if (CSET <= t)
 4243             {
 4244               /* If T is a singleton, or if case-folding in a unibyte
 4245                  locale and T's members all case-fold to the same char,
 4246                  convert T to one of its members.  Otherwise, do
 4247                  nothing further with T.  */
 4248               charclass *ccl = &d->charclasses[t - CSET];
 4249               int j;
 4250               for (j = 0; j < NOTCHAR; j++)
 4251                 if (tstbit (j, ccl))
 4252                   break;
 4253               if (! (j < NOTCHAR))
 4254                 {
 4255                   mp = allocmust (mp, 2);
 4256                   break;
 4257                 }
 4258               t = j;
 4259               while (++j < NOTCHAR)
 4260                 if (tstbit (j, ccl)
 4261                     && ! (case_fold_unibyte
 4262                           && toupper (j) == toupper (t)))
 4263                   break;
 4264               if (j < NOTCHAR)
 4265                 {
 4266                   mp = allocmust (mp, 2);
 4267                   break;
 4268                 }
 4269             }
 4270 
 4271           idx_t rj = ri + 2;
 4272           if (d->tokens[ri + 1] == CAT)
 4273             {
 4274               for (; rj < d->tindex - 1; rj += 2)
 4275                 {
 4276                   if ((rj != ri && (d->tokens[rj] <= 0
 4277                                     || NOTCHAR <= d->tokens[rj]))
 4278                       || d->tokens[rj + 1] != CAT)
 4279                     break;
 4280                 }
 4281             }
 4282           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
 4283           mp->is[0] = mp->left[0] = mp->right[0]
 4284             = case_fold_unibyte ? toupper (t) : t;
 4285 
 4286           idx_t i;
 4287           for (i = 1; ri + 2 < rj; i++)
 4288             {
 4289               ri += 2;
 4290               t = d->tokens[ri];
 4291               mp->is[i] = mp->left[i] = mp->right[i]
 4292                 = case_fold_unibyte ? toupper (t) : t;
 4293             }
 4294           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
 4295           mp->in = enlist (mp->in, mp->is, i);
 4296           break;
 4297         }
 4298     }
 4299  done:;
 4300 
 4301   struct dfamust *dm = NULL;
 4302   if (*result)
 4303     {
 4304       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
 4305       dm->exact = exact;
 4306       dm->begline = begline;
 4307       dm->endline = endline;
 4308       strcpy (dm->must, result);
 4309     }
 4310 
 4311   while (mp)
 4312     {
 4313       must *prev = mp->prev;
 4314       freemust (mp);
 4315       mp = prev;
 4316     }
 4317 
 4318   return dm;
 4319 }
 4320 
 4321 void
 4322 dfamustfree (struct dfamust *dm)
 4323 {
 4324   free (dm);
 4325 }
 4326 
 4327 struct dfa *
 4328 dfaalloc (void)
 4329 {
 4330   return xmalloc (sizeof (struct dfa));
 4331 }
 4332 
 4333 /* Initialize DFA.  */
 4334 void
 4335 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
 4336            reg_syntax_t bits, int dfaopts)
 4337 {
 4338   memset (dfa, 0, offsetof (struct dfa, dfaexec));
 4339   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
 4340   dfa->localeinfo = *linfo;
 4341 
 4342   dfa->fast = !dfa->localeinfo.multibyte;
 4343 
 4344   dfa->canychar = -1;
 4345   dfa->syntax.syntax_bits_set = true;
 4346   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
 4347   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
 4348   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
 4349   dfa->syntax.syntax_bits = bits;
 4350 
 4351   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
 4352     {
 4353       unsigned char uc = i;
 4354 
 4355       dfa->syntax.sbit[uc] = char_context (dfa, uc);
 4356       switch (dfa->syntax.sbit[uc])
 4357         {
 4358         case CTX_LETTER:
 4359           setbit (uc, &dfa->syntax.letters);
 4360           break;
 4361         case CTX_NEWLINE:
 4362           setbit (uc, &dfa->syntax.newline);
 4363           break;
 4364         }
 4365 
 4366       /* POSIX requires that the five bytes in "\n\r./" (including the
 4367          terminating NUL) cannot occur inside a multibyte character.  */
 4368       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
 4369                                      ? (uc & 0xc0) != 0x80
 4370                                      : strchr ("\n\r./", uc) != NULL);
 4371     }
 4372 }
 4373 
 4374 /* Initialize TO by copying FROM's syntax settings.  */
 4375 void
 4376 dfacopysyntax (struct dfa *to, struct dfa const *from)
 4377 {
 4378   memset (to, 0, offsetof (struct dfa, syntax));
 4379   to->canychar = -1;
 4380   to->fast = from->fast;
 4381   to->syntax = from->syntax;
 4382   to->dfaexec = from->dfaexec;
 4383   to->localeinfo = from->localeinfo;
 4384 }
 4385 
 4386 /* vim:set shiftwidth=2: */