"Fossies" - the Fresh Open Source Software Archive

Member "gawk-5.1.0/support/regcomp.c" (6 Feb 2020, 114517 Bytes) of package /linux/misc/gawk-5.1.0.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "regcomp.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /* Extended regular expression matching and search library.
    2    Copyright (C) 2002-2019 Free Software Foundation, Inc.
    3    This file is part of the GNU C Library.
    4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
    5 
    6    The GNU C Library is free software; you can redistribute it and/or
    7    modify it under the terms of the GNU Lesser General Public
    8    License as published by the Free Software Foundation; either
    9    version 2.1 of the License, or (at your option) any later version.
   10 
   11    The GNU C Library is distributed in the hope that it will be useful,
   12    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14    Lesser General Public License for more details.
   15 
   16    You should have received a copy of the GNU Lesser General Public
   17    License along with the GNU C Library; if not, see
   18    <https://www.gnu.org/licenses/>.  */
   19 
   20 #ifdef _LIBC
   21 # include <locale/weight.h>
   22 #endif
   23 
   24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
   25                       size_t length, reg_syntax_t syntax);
   26 static void re_compile_fastmap_iter (regex_t *bufp,
   27                      const re_dfastate_t *init_state,
   28                      char *fastmap);
   29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
   30 #ifdef RE_ENABLE_I18N
   31 static void free_charset (re_charset_t *cset);
   32 #endif /* RE_ENABLE_I18N */
   33 static void free_workarea_compile (regex_t *preg);
   34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
   35 #ifdef RE_ENABLE_I18N
   36 static void optimize_utf8 (re_dfa_t *dfa);
   37 #endif
   38 static reg_errcode_t analyze (regex_t *preg);
   39 static reg_errcode_t preorder (bin_tree_t *root,
   40                    reg_errcode_t (fn (void *, bin_tree_t *)),
   41                    void *extra);
   42 static reg_errcode_t postorder (bin_tree_t *root,
   43                 reg_errcode_t (fn (void *, bin_tree_t *)),
   44                 void *extra);
   45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
   46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
   47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
   48                  bin_tree_t *node);
   49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
   50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
   51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
   52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
   53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
   54                    unsigned int constraint);
   55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
   56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
   57                      Idx node, bool root);
   58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
   59 static Idx fetch_number (re_string_t *input, re_token_t *token,
   60              reg_syntax_t syntax);
   61 static int peek_token (re_token_t *token, re_string_t *input,
   62             reg_syntax_t syntax);
   63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
   64               reg_syntax_t syntax, reg_errcode_t *err);
   65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
   66                   re_token_t *token, reg_syntax_t syntax,
   67                   Idx nest, reg_errcode_t *err);
   68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
   69                  re_token_t *token, reg_syntax_t syntax,
   70                  Idx nest, reg_errcode_t *err);
   71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
   72                      re_token_t *token, reg_syntax_t syntax,
   73                      Idx nest, reg_errcode_t *err);
   74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
   75                   re_token_t *token, reg_syntax_t syntax,
   76                   Idx nest, reg_errcode_t *err);
   77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
   78                  re_dfa_t *dfa, re_token_t *token,
   79                  reg_syntax_t syntax, reg_errcode_t *err);
   80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
   81                       re_token_t *token, reg_syntax_t syntax,
   82                       reg_errcode_t *err);
   83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
   84                         re_string_t *regexp,
   85                         re_token_t *token, int token_len,
   86                         re_dfa_t *dfa,
   87                         reg_syntax_t syntax,
   88                         bool accept_hyphen);
   89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
   90                       re_string_t *regexp,
   91                       re_token_t *token);
   92 #ifdef RE_ENABLE_I18N
   93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
   94                     re_charset_t *mbcset,
   95                     Idx *equiv_class_alloc,
   96                     const unsigned char *name);
   97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
   98                       bitset_t sbcset,
   99                       re_charset_t *mbcset,
  100                       Idx *char_class_alloc,
  101                       const char *class_name,
  102                       reg_syntax_t syntax);
  103 #else  /* not RE_ENABLE_I18N */
  104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  105                     const unsigned char *name);
  106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  107                       bitset_t sbcset,
  108                       const char *class_name,
  109                       reg_syntax_t syntax);
  110 #endif /* not RE_ENABLE_I18N */
  111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
  112                        RE_TRANSLATE_TYPE trans,
  113                        const char *class_name,
  114                        const char *extra,
  115                        bool non_match, reg_errcode_t *err);
  116 static bin_tree_t *create_tree (re_dfa_t *dfa,
  117                 bin_tree_t *left, bin_tree_t *right,
  118                 re_token_type_t type);
  119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
  120                       bin_tree_t *left, bin_tree_t *right,
  121                       const re_token_t *token);
  122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
  123 static void free_token (re_token_t *node);
  124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
  125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
  126 
  127 /* This table gives an error message for each of the error codes listed
  128    in regex.h.  Obviously the order here has to be same as there.
  129    POSIX doesn't require that we do anything for REG_NOERROR,
  130    but why not be nice?  */
  131 
  132 static const char __re_error_msgid[] =
  133   {
  134 #define REG_NOERROR_IDX 0
  135     gettext_noop ("Success")    /* REG_NOERROR */
  136     "\0"
  137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
  138     gettext_noop ("No match")   /* REG_NOMATCH */
  139     "\0"
  140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
  141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
  142     "\0"
  143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
  144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
  145     "\0"
  146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
  147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
  148     "\0"
  149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
  150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
  151     "\0"
  152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
  153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
  154     "\0"
  155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
  156     gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
  157     "\0"
  158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
  159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
  160     "\0"
  161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
  162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
  163     "\0"
  164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
  165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
  166     "\0"
  167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
  168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
  169     "\0"
  170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
  171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
  172     "\0"
  173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
  174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
  175     "\0"
  176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
  177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
  178     "\0"
  179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
  180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
  181     "\0"
  182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
  183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  184   };
  185 
  186 static const size_t __re_error_msgid_idx[] =
  187   {
  188     REG_NOERROR_IDX,
  189     REG_NOMATCH_IDX,
  190     REG_BADPAT_IDX,
  191     REG_ECOLLATE_IDX,
  192     REG_ECTYPE_IDX,
  193     REG_EESCAPE_IDX,
  194     REG_ESUBREG_IDX,
  195     REG_EBRACK_IDX,
  196     REG_EPAREN_IDX,
  197     REG_EBRACE_IDX,
  198     REG_BADBR_IDX,
  199     REG_ERANGE_IDX,
  200     REG_ESPACE_IDX,
  201     REG_BADRPT_IDX,
  202     REG_EEND_IDX,
  203     REG_ESIZE_IDX,
  204     REG_ERPAREN_IDX
  205   };
  206 
  207 /* Entry points for GNU code.  */
  208 
  209 /* re_compile_pattern is the GNU regular expression compiler: it
  210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
  211    Returns 0 if the pattern was valid, otherwise an error string.
  212 
  213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
  214    are set in BUFP on entry.  */
  215 
  216 const char *
  217 re_compile_pattern (const char *pattern, size_t length,
  218             struct re_pattern_buffer *bufp)
  219 {
  220   reg_errcode_t ret;
  221 
  222   /* And GNU code determines whether or not to get register information
  223      by passing null for the REGS argument to re_match, etc., not by
  224      setting no_sub, unless RE_NO_SUB is set.  */
  225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
  226 
  227   /* Match anchors at newline.  */
  228   bufp->newline_anchor = 1;
  229 
  230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
  231 
  232   if (!ret)
  233     return NULL;
  234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
  235 }
  236 weak_alias (__re_compile_pattern, re_compile_pattern)
  237 
  238 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
  239    also be assigned to arbitrarily: each pattern buffer stores its own
  240    syntax, so it can be changed between regex compilations.  */
  241 /* This has no initializer because initialized variables in Emacs
  242    become read-only after dumping.  */
  243 reg_syntax_t re_syntax_options;
  244 
  245 
  246 /* Specify the precise syntax of regexps for compilation.  This provides
  247    for compatibility for various utilities which historically have
  248    different, incompatible syntaxes.
  249 
  250    The argument SYNTAX is a bit mask comprised of the various bits
  251    defined in regex.h.  We return the old syntax.  */
  252 
  253 reg_syntax_t
  254 re_set_syntax (reg_syntax_t syntax)
  255 {
  256   reg_syntax_t ret = re_syntax_options;
  257 
  258   re_syntax_options = syntax;
  259   return ret;
  260 }
  261 weak_alias (__re_set_syntax, re_set_syntax)
  262 
  263 int
  264 re_compile_fastmap (struct re_pattern_buffer *bufp)
  265 {
  266   re_dfa_t *dfa = bufp->buffer;
  267   char *fastmap = bufp->fastmap;
  268 
  269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
  270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
  271   if (dfa->init_state != dfa->init_state_word)
  272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
  273   if (dfa->init_state != dfa->init_state_nl)
  274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
  275   if (dfa->init_state != dfa->init_state_begbuf)
  276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
  277   bufp->fastmap_accurate = 1;
  278   return 0;
  279 }
  280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
  281 
  282 static inline void
  283 __attribute__ ((always_inline))
  284 re_set_fastmap (char *fastmap, bool icase, int ch)
  285 {
  286   fastmap[ch] = 1;
  287   if (icase)
  288     fastmap[tolower (ch)] = 1;
  289 }
  290 
  291 /* Helper function for re_compile_fastmap.
  292    Compile fastmap for the initial_state INIT_STATE.  */
  293 
  294 static void
  295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
  296              char *fastmap)
  297 {
  298   re_dfa_t *dfa = bufp->buffer;
  299   Idx node_cnt;
  300   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
  301   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
  302     {
  303       Idx node = init_state->nodes.elems[node_cnt];
  304       re_token_type_t type = dfa->nodes[node].type;
  305 
  306       if (type == CHARACTER)
  307     {
  308       re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
  309 #ifdef RE_ENABLE_I18N
  310       if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
  311         {
  312           unsigned char buf[MB_LEN_MAX];
  313           unsigned char *p;
  314           wchar_t wc;
  315           mbstate_t state;
  316 
  317           p = buf;
  318           *p++ = dfa->nodes[node].opr.c;
  319           while (++node < dfa->nodes_len
  320              && dfa->nodes[node].type == CHARACTER
  321              && dfa->nodes[node].mb_partial)
  322         *p++ = dfa->nodes[node].opr.c;
  323           memset (&state, '\0', sizeof (state));
  324           if (__mbrtowc (&wc, (const char *) buf, p - buf,
  325                  &state) == p - buf
  326           && (__wcrtomb ((char *) buf, __towlower (wc), &state)
  327               != (size_t) -1))
  328         re_set_fastmap (fastmap, false, buf[0]);
  329         }
  330 #endif
  331     }
  332       else if (type == SIMPLE_BRACKET)
  333     {
  334       int i, ch;
  335       for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
  336         {
  337           int j;
  338           bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
  339           for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
  340         if (w & ((bitset_word_t) 1 << j))
  341           re_set_fastmap (fastmap, icase, ch);
  342         }
  343     }
  344 #ifdef RE_ENABLE_I18N
  345       else if (type == COMPLEX_BRACKET)
  346     {
  347       re_charset_t *cset = dfa->nodes[node].opr.mbcset;
  348       Idx i;
  349 
  350 # ifdef _LIBC
  351       /* See if we have to try all bytes which start multiple collation
  352          elements.
  353          e.g. In da_DK, we want to catch 'a' since "aa" is a valid
  354           collation element, and don't catch 'b' since 'b' is
  355           the only collation element which starts from 'b' (and
  356           it is caught by SIMPLE_BRACKET).  */
  357           if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
  358           && (cset->ncoll_syms || cset->nranges))
  359         {
  360           const int32_t *table = (const int32_t *)
  361             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
  362           for (i = 0; i < SBC_MAX; ++i)
  363             if (table[i] < 0)
  364               re_set_fastmap (fastmap, icase, i);
  365         }
  366 # endif /* _LIBC */
  367 
  368       /* See if we have to start the match at all multibyte characters,
  369          i.e. where we would not find an invalid sequence.  This only
  370          applies to multibyte character sets; for single byte character
  371          sets, the SIMPLE_BRACKET again suffices.  */
  372       if (dfa->mb_cur_max > 1
  373           && (cset->nchar_classes || cset->non_match || cset->nranges
  374 # ifdef _LIBC
  375           || cset->nequiv_classes
  376 # endif /* _LIBC */
  377          ))
  378         {
  379           unsigned char c = 0;
  380           do
  381         {
  382           mbstate_t mbs;
  383           memset (&mbs, 0, sizeof (mbs));
  384           if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
  385             re_set_fastmap (fastmap, false, (int) c);
  386         }
  387           while (++c != 0);
  388         }
  389 
  390       else
  391         {
  392           /* ... Else catch all bytes which can start the mbchars.  */
  393           for (i = 0; i < cset->nmbchars; ++i)
  394         {
  395           char buf[256];
  396           mbstate_t state;
  397           memset (&state, '\0', sizeof (state));
  398           if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
  399             re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
  400           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
  401             {
  402               if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
  403               != (size_t) -1)
  404             re_set_fastmap (fastmap, false, *(unsigned char *) buf);
  405             }
  406         }
  407         }
  408     }
  409 #endif /* RE_ENABLE_I18N */
  410       else if (type == OP_PERIOD
  411 #ifdef RE_ENABLE_I18N
  412            || type == OP_UTF8_PERIOD
  413 #endif /* RE_ENABLE_I18N */
  414            || type == END_OF_RE)
  415     {
  416       memset (fastmap, '\1', sizeof (char) * SBC_MAX);
  417       if (type == END_OF_RE)
  418         bufp->can_be_null = 1;
  419       return;
  420     }
  421     }
  422 }
  423 
  424 /* Entry point for POSIX code.  */
  425 /* regcomp takes a regular expression as a string and compiles it.
  426 
  427    PREG is a regex_t *.  We do not expect any fields to be initialized,
  428    since POSIX says we shouldn't.  Thus, we set
  429 
  430      'buffer' to the compiled pattern;
  431      'used' to the length of the compiled pattern;
  432      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
  433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
  434        RE_SYNTAX_POSIX_BASIC;
  435      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
  436      'fastmap' to an allocated space for the fastmap;
  437      'fastmap_accurate' to zero;
  438      're_nsub' to the number of subexpressions in PATTERN.
  439 
  440    PATTERN is the address of the pattern string.
  441 
  442    CFLAGS is a series of bits which affect compilation.
  443 
  444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
  445      use POSIX basic syntax.
  446 
  447      If REG_NEWLINE is set, then . and [^...] don't match newline.
  448      Also, regexec will try a match beginning after every newline.
  449 
  450      If REG_ICASE is set, then we considers upper- and lowercase
  451      versions of letters to be equivalent when matching.
  452 
  453      If REG_NOSUB is set, then when PREG is passed to regexec, that
  454      routine will report only success or failure, and nothing about the
  455      registers.
  456 
  457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
  458    the return codes and their meanings.)  */
  459 
  460 int
  461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
  462 {
  463   reg_errcode_t ret;
  464   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
  465              : RE_SYNTAX_POSIX_BASIC);
  466 
  467   preg->buffer = NULL;
  468   preg->allocated = 0;
  469   preg->used = 0;
  470 
  471   /* Try to allocate space for the fastmap.  */
  472   preg->fastmap = re_malloc (char, SBC_MAX);
  473   if (__glibc_unlikely (preg->fastmap == NULL))
  474     return REG_ESPACE;
  475 
  476   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
  477 
  478   /* If REG_NEWLINE is set, newlines are treated differently.  */
  479   if (cflags & REG_NEWLINE)
  480     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
  481       syntax &= ~RE_DOT_NEWLINE;
  482       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
  483       /* It also changes the matching behavior.  */
  484       preg->newline_anchor = 1;
  485     }
  486   else
  487     preg->newline_anchor = 0;
  488   preg->no_sub = !!(cflags & REG_NOSUB);
  489   preg->translate = NULL;
  490 
  491   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
  492 
  493   /* POSIX doesn't distinguish between an unmatched open-group and an
  494      unmatched close-group: both are REG_EPAREN.  */
  495   if (ret == REG_ERPAREN)
  496     ret = REG_EPAREN;
  497 
  498   /* We have already checked preg->fastmap != NULL.  */
  499   if (__glibc_likely (ret == REG_NOERROR))
  500     /* Compute the fastmap now, since regexec cannot modify the pattern
  501        buffer.  This function never fails in this implementation.  */
  502     (void) re_compile_fastmap (preg);
  503   else
  504     {
  505       /* Some error occurred while compiling the expression.  */
  506       re_free (preg->fastmap);
  507       preg->fastmap = NULL;
  508     }
  509 
  510   return (int) ret;
  511 }
  512 libc_hidden_def (__regcomp)
  513 weak_alias (__regcomp, regcomp)
  514 
  515 /* Returns a message corresponding to an error code, ERRCODE, returned
  516    from either regcomp or regexec.   We don't use PREG here.  */
  517 
  518 size_t
  519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
  520       size_t errbuf_size)
  521 {
  522   const char *msg;
  523   size_t msg_size;
  524   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
  525 
  526   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
  527     /* Only error codes returned by the rest of the code should be passed
  528        to this routine.  If we are given anything else, or if other regex
  529        code generates an invalid error code, then the program has a bug.
  530        Dump core so we can fix it.  */
  531     abort ();
  532 
  533   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
  534 
  535   msg_size = strlen (msg) + 1; /* Includes the null.  */
  536 
  537   if (__glibc_likely (errbuf_size != 0))
  538     {
  539       size_t cpy_size = msg_size;
  540       if (__glibc_unlikely (msg_size > errbuf_size))
  541     {
  542       cpy_size = errbuf_size - 1;
  543       errbuf[cpy_size] = '\0';
  544     }
  545       memcpy (errbuf, msg, cpy_size);
  546     }
  547 
  548   return msg_size;
  549 }
  550 weak_alias (__regerror, regerror)
  551 
  552 
  553 #ifdef RE_ENABLE_I18N
  554 /* This static array is used for the map to single-byte characters when
  555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
  556    it the same all the time.  UTF-8 is the preferred encoding so this is
  557    a worthwhile optimization.  */
  558 static const bitset_t utf8_sb_map =
  559 {
  560   /* Set the first 128 bits.  */
  561 # if defined __GNUC__ && !defined __STRICT_ANSI__
  562   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
  563 # else
  564 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
  565 #   error "bitset_word_t is narrower than 32 bits"
  566 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
  567   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
  568 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
  569   BITSET_WORD_MAX, BITSET_WORD_MAX,
  570 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
  571   BITSET_WORD_MAX,
  572 #  endif
  573   (BITSET_WORD_MAX
  574    >> (SBC_MAX % BITSET_WORD_BITS == 0
  575        ? 0
  576        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
  577 # endif
  578 };
  579 #endif
  580 
  581 
  582 static void
  583 free_dfa_content (re_dfa_t *dfa)
  584 {
  585   Idx i, j;
  586 
  587   if (dfa->nodes)
  588     for (i = 0; i < dfa->nodes_len; ++i)
  589       free_token (dfa->nodes + i);
  590   re_free (dfa->nexts);
  591   for (i = 0; i < dfa->nodes_len; ++i)
  592     {
  593       if (dfa->eclosures != NULL)
  594     re_node_set_free (dfa->eclosures + i);
  595       if (dfa->inveclosures != NULL)
  596     re_node_set_free (dfa->inveclosures + i);
  597       if (dfa->edests != NULL)
  598     re_node_set_free (dfa->edests + i);
  599     }
  600   re_free (dfa->edests);
  601   re_free (dfa->eclosures);
  602   re_free (dfa->inveclosures);
  603   re_free (dfa->nodes);
  604 
  605   if (dfa->state_table)
  606     for (i = 0; i <= dfa->state_hash_mask; ++i)
  607       {
  608     struct re_state_table_entry *entry = dfa->state_table + i;
  609     for (j = 0; j < entry->num; ++j)
  610       {
  611         re_dfastate_t *state = entry->array[j];
  612         free_state (state);
  613       }
  614     re_free (entry->array);
  615       }
  616   re_free (dfa->state_table);
  617 #ifdef RE_ENABLE_I18N
  618   if (dfa->sb_char != utf8_sb_map)
  619     re_free (dfa->sb_char);
  620 #endif
  621   re_free (dfa->subexp_map);
  622 #ifdef DEBUG
  623   re_free (dfa->re_str);
  624 #endif
  625 
  626   re_free (dfa);
  627 }
  628 
  629 
  630 /* Free dynamically allocated space used by PREG.  */
  631 
  632 void
  633 regfree (regex_t *preg)
  634 {
  635   re_dfa_t *dfa = preg->buffer;
  636   if (__glibc_likely (dfa != NULL))
  637     {
  638       lock_fini (dfa->lock);
  639       free_dfa_content (dfa);
  640     }
  641   preg->buffer = NULL;
  642   preg->allocated = 0;
  643 
  644   re_free (preg->fastmap);
  645   preg->fastmap = NULL;
  646 
  647   re_free (preg->translate);
  648   preg->translate = NULL;
  649 }
  650 libc_hidden_def (__regfree)
  651 weak_alias (__regfree, regfree)
  652 
  653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
  654    them unless specifically requested.  */
  655 
  656 #if defined _REGEX_RE_COMP || defined _LIBC
  657 
  658 /* BSD has one and only one pattern buffer.  */
  659 static struct re_pattern_buffer re_comp_buf;
  660 
  661 char *
  662 # ifdef _LIBC
  663 /* Make these definitions weak in libc, so POSIX programs can redefine
  664    these names if they don't use our functions, and still use
  665    regcomp/regexec above without link errors.  */
  666 weak_function
  667 # endif
  668 re_comp (const char *s)
  669 {
  670   reg_errcode_t ret;
  671   char *fastmap;
  672 
  673   if (!s)
  674     {
  675       if (!re_comp_buf.buffer)
  676     return gettext ("No previous regular expression");
  677       return 0;
  678     }
  679 
  680   if (re_comp_buf.buffer)
  681     {
  682       fastmap = re_comp_buf.fastmap;
  683       re_comp_buf.fastmap = NULL;
  684       __regfree (&re_comp_buf);
  685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
  686       re_comp_buf.fastmap = fastmap;
  687     }
  688 
  689   if (re_comp_buf.fastmap == NULL)
  690     {
  691       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
  692       if (re_comp_buf.fastmap == NULL)
  693     return (char *) gettext (__re_error_msgid
  694                  + __re_error_msgid_idx[(int) REG_ESPACE]);
  695     }
  696 
  697   /* Since 're_exec' always passes NULL for the 'regs' argument, we
  698      don't need to initialize the pattern buffer fields which affect it.  */
  699 
  700   /* Match anchors at newlines.  */
  701   re_comp_buf.newline_anchor = 1;
  702 
  703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
  704 
  705   if (!ret)
  706     return NULL;
  707 
  708   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
  709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
  710 }
  711 
  712 #ifdef _LIBC
  713 libc_freeres_fn (free_mem)
  714 {
  715   __regfree (&re_comp_buf);
  716 }
  717 #endif
  718 
  719 #endif /* _REGEX_RE_COMP */
  720 
  721 /* Internal entry point.
  722    Compile the regular expression PATTERN, whose length is LENGTH.
  723    SYNTAX indicate regular expression's syntax.  */
  724 
  725 static reg_errcode_t
  726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
  727              reg_syntax_t syntax)
  728 {
  729   reg_errcode_t err = REG_NOERROR;
  730   re_dfa_t *dfa;
  731   re_string_t regexp;
  732 
  733   /* Initialize the pattern buffer.  */
  734   preg->fastmap_accurate = 0;
  735   preg->syntax = syntax;
  736   preg->not_bol = preg->not_eol = 0;
  737   preg->used = 0;
  738   preg->re_nsub = 0;
  739   preg->can_be_null = 0;
  740   preg->regs_allocated = REGS_UNALLOCATED;
  741 
  742   /* Initialize the dfa.  */
  743   dfa = preg->buffer;
  744   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
  745     {
  746       /* If zero allocated, but buffer is non-null, try to realloc
  747      enough space.  This loses if buffer's address is bogus, but
  748      that is the user's responsibility.  If ->buffer is NULL this
  749      is a simple allocation.  */
  750       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
  751       if (dfa == NULL)
  752     return REG_ESPACE;
  753       preg->allocated = sizeof (re_dfa_t);
  754       preg->buffer = dfa;
  755     }
  756   preg->used = sizeof (re_dfa_t);
  757 
  758   err = init_dfa (dfa, length);
  759   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
  760     err = REG_ESPACE;
  761   if (__glibc_unlikely (err != REG_NOERROR))
  762     {
  763       free_dfa_content (dfa);
  764       preg->buffer = NULL;
  765       preg->allocated = 0;
  766       return err;
  767     }
  768 #ifdef DEBUG
  769   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
  770   dfa->re_str = re_malloc (char, length + 1);
  771   strncpy (dfa->re_str, pattern, length + 1);
  772 #endif
  773 
  774   err = re_string_construct (&regexp, pattern, length, preg->translate,
  775                  (syntax & RE_ICASE) != 0, dfa);
  776   if (__glibc_unlikely (err != REG_NOERROR))
  777     {
  778     re_compile_internal_free_return:
  779       free_workarea_compile (preg);
  780       re_string_destruct (&regexp);
  781       lock_fini (dfa->lock);
  782       free_dfa_content (dfa);
  783       preg->buffer = NULL;
  784       preg->allocated = 0;
  785       return err;
  786     }
  787 
  788   /* Parse the regular expression, and build a structure tree.  */
  789   preg->re_nsub = 0;
  790   dfa->str_tree = parse (&regexp, preg, syntax, &err);
  791   if (__glibc_unlikely (dfa->str_tree == NULL))
  792     goto re_compile_internal_free_return;
  793 
  794   /* Analyze the tree and create the nfa.  */
  795   err = analyze (preg);
  796   if (__glibc_unlikely (err != REG_NOERROR))
  797     goto re_compile_internal_free_return;
  798 
  799 #ifdef RE_ENABLE_I18N
  800   /* If possible, do searching in single byte encoding to speed things up.  */
  801   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
  802     optimize_utf8 (dfa);
  803 #endif
  804 
  805   /* Then create the initial state of the dfa.  */
  806   err = create_initial_state (dfa);
  807 
  808   /* Release work areas.  */
  809   free_workarea_compile (preg);
  810   re_string_destruct (&regexp);
  811 
  812   if (__glibc_unlikely (err != REG_NOERROR))
  813     {
  814       lock_fini (dfa->lock);
  815       free_dfa_content (dfa);
  816       preg->buffer = NULL;
  817       preg->allocated = 0;
  818     }
  819 
  820   return err;
  821 }
  822 
  823 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
  824    as the initial length of some arrays.  */
  825 
  826 static reg_errcode_t
  827 init_dfa (re_dfa_t *dfa, size_t pat_len)
  828 {
  829   __re_size_t table_size;
  830 #ifndef _LIBC
  831   const char *codeset_name;
  832 #endif
  833 #ifdef RE_ENABLE_I18N
  834   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
  835 #else
  836   size_t max_i18n_object_size = 0;
  837 #endif
  838   size_t max_object_size =
  839     MAX (sizeof (struct re_state_table_entry),
  840      MAX (sizeof (re_token_t),
  841           MAX (sizeof (re_node_set),
  842            MAX (sizeof (regmatch_t),
  843             max_i18n_object_size))));
  844 
  845   memset (dfa, '\0', sizeof (re_dfa_t));
  846 
  847   /* Force allocation of str_tree_storage the first time.  */
  848   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
  849 
  850   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
  851      calculation below, and for similar doubling calculations
  852      elsewhere.  And it's <= rather than <, because some of the
  853      doubling calculations add 1 afterwards.  */
  854   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
  855             <= pat_len))
  856     return REG_ESPACE;
  857 
  858   dfa->nodes_alloc = pat_len + 1;
  859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
  860 
  861   /*  table_size = 2 ^ ceil(log pat_len) */
  862   for (table_size = 1; ; table_size <<= 1)
  863     if (table_size > pat_len)
  864       break;
  865 
  866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
  867   dfa->state_hash_mask = table_size - 1;
  868 
  869   dfa->mb_cur_max = MB_CUR_MAX;
  870 #ifdef _LIBC
  871   if (dfa->mb_cur_max == 6
  872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
  873     dfa->is_utf8 = 1;
  874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
  875                != 0);
  876 #else
  877   codeset_name = nl_langinfo (CODESET);
  878   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
  879       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
  880       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
  881       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
  882     dfa->is_utf8 = 1;
  883 
  884   /* We check exhaustively in the loop below if this charset is a
  885      superset of ASCII.  */
  886   dfa->map_notascii = 0;
  887 #endif
  888 
  889 #ifdef RE_ENABLE_I18N
  890   if (dfa->mb_cur_max > 1)
  891     {
  892       if (dfa->is_utf8)
  893     dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
  894       else
  895     {
  896       int i, j, ch;
  897 
  898       dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
  899       if (__glibc_unlikely (dfa->sb_char == NULL))
  900         return REG_ESPACE;
  901 
  902       /* Set the bits corresponding to single byte chars.  */
  903       for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
  904         for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
  905           {
  906         wint_t wch = __btowc (ch);
  907         if (wch != WEOF)
  908           dfa->sb_char[i] |= (bitset_word_t) 1 << j;
  909 # ifndef _LIBC
  910         if (isascii (ch) && wch != ch)
  911           dfa->map_notascii = 1;
  912 # endif
  913           }
  914     }
  915     }
  916 #endif
  917 
  918   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
  919     return REG_ESPACE;
  920   return REG_NOERROR;
  921 }
  922 
  923 /* Initialize WORD_CHAR table, which indicate which character is
  924    "word".  In this case "word" means that it is the word construction
  925    character used by some operators like "\<", "\>", etc.  */
  926 
  927 static void
  928 init_word_char (re_dfa_t *dfa)
  929 {
  930   int i = 0;
  931   int j;
  932   int ch = 0;
  933   dfa->word_ops_used = 1;
  934   if (__glibc_likely (dfa->map_notascii == 0))
  935     {
  936       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
  937      them, an issue when this code is used in Gnulib.  */
  938       bitset_word_t bits0 = 0x00000000;
  939       bitset_word_t bits1 = 0x03ff0000;
  940       bitset_word_t bits2 = 0x87fffffe;
  941       bitset_word_t bits3 = 0x07fffffe;
  942       if (BITSET_WORD_BITS == 64)
  943     {
  944       /* Pacify gcc -Woverflow on 32-bit platformns.  */
  945       dfa->word_char[0] = bits1 << 31 << 1 | bits0;
  946       dfa->word_char[1] = bits3 << 31 << 1 | bits2;
  947       i = 2;
  948     }
  949       else if (BITSET_WORD_BITS == 32)
  950     {
  951       dfa->word_char[0] = bits0;
  952       dfa->word_char[1] = bits1;
  953       dfa->word_char[2] = bits2;
  954       dfa->word_char[3] = bits3;
  955       i = 4;
  956     }
  957       else
  958         goto general_case;
  959       ch = 128;
  960 
  961       if (__glibc_likely (dfa->is_utf8))
  962     {
  963       memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
  964       return;
  965     }
  966     }
  967 
  968  general_case:
  969   for (; i < BITSET_WORDS; ++i)
  970     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
  971       if (isalnum (ch) || ch == '_')
  972     dfa->word_char[i] |= (bitset_word_t) 1 << j;
  973 }
  974 
  975 /* Free the work area which are only used while compiling.  */
  976 
  977 static void
  978 free_workarea_compile (regex_t *preg)
  979 {
  980   re_dfa_t *dfa = preg->buffer;
  981   bin_tree_storage_t *storage, *next;
  982   for (storage = dfa->str_tree_storage; storage; storage = next)
  983     {
  984       next = storage->next;
  985       re_free (storage);
  986     }
  987   dfa->str_tree_storage = NULL;
  988   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
  989   dfa->str_tree = NULL;
  990   re_free (dfa->org_indices);
  991   dfa->org_indices = NULL;
  992 }
  993 
  994 /* Create initial states for all contexts.  */
  995 
  996 static reg_errcode_t
  997 create_initial_state (re_dfa_t *dfa)
  998 {
  999   Idx first, i;
 1000   reg_errcode_t err;
 1001   re_node_set init_nodes;
 1002 
 1003   /* Initial states have the epsilon closure of the node which is
 1004      the first node of the regular expression.  */
 1005   first = dfa->str_tree->first->node_idx;
 1006   dfa->init_node = first;
 1007   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 1008   if (__glibc_unlikely (err != REG_NOERROR))
 1009     return err;
 1010 
 1011   /* The back-references which are in initial states can epsilon transit,
 1012      since in this case all of the subexpressions can be null.
 1013      Then we add epsilon closures of the nodes which are the next nodes of
 1014      the back-references.  */
 1015   if (dfa->nbackref > 0)
 1016     for (i = 0; i < init_nodes.nelem; ++i)
 1017       {
 1018     Idx node_idx = init_nodes.elems[i];
 1019     re_token_type_t type = dfa->nodes[node_idx].type;
 1020 
 1021     Idx clexp_idx;
 1022     if (type != OP_BACK_REF)
 1023       continue;
 1024     for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 1025       {
 1026         re_token_t *clexp_node;
 1027         clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 1028         if (clexp_node->type == OP_CLOSE_SUBEXP
 1029         && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 1030           break;
 1031       }
 1032     if (clexp_idx == init_nodes.nelem)
 1033       continue;
 1034 
 1035     if (type == OP_BACK_REF)
 1036       {
 1037         Idx dest_idx = dfa->edests[node_idx].elems[0];
 1038         if (!re_node_set_contains (&init_nodes, dest_idx))
 1039           {
 1040         reg_errcode_t merge_err
 1041                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 1042         if (merge_err != REG_NOERROR)
 1043           return merge_err;
 1044         i = 0;
 1045           }
 1046       }
 1047       }
 1048 
 1049   /* It must be the first time to invoke acquire_state.  */
 1050   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 1051   /* We don't check ERR here, since the initial state must not be NULL.  */
 1052   if (__glibc_unlikely (dfa->init_state == NULL))
 1053     return err;
 1054   if (dfa->init_state->has_constraint)
 1055     {
 1056       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 1057                                CONTEXT_WORD);
 1058       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 1059                              CONTEXT_NEWLINE);
 1060       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 1061                              &init_nodes,
 1062                              CONTEXT_NEWLINE
 1063                              | CONTEXT_BEGBUF);
 1064       if (__glibc_unlikely (dfa->init_state_word == NULL
 1065                 || dfa->init_state_nl == NULL
 1066                 || dfa->init_state_begbuf == NULL))
 1067     return err;
 1068     }
 1069   else
 1070     dfa->init_state_word = dfa->init_state_nl
 1071       = dfa->init_state_begbuf = dfa->init_state;
 1072 
 1073   re_node_set_free (&init_nodes);
 1074   return REG_NOERROR;
 1075 }
 1076 
 1077 #ifdef RE_ENABLE_I18N
 1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
 1079    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
 1080    DFA nodes where needed.  */
 1081 
 1082 static void
 1083 optimize_utf8 (re_dfa_t *dfa)
 1084 {
 1085   Idx node;
 1086   int i;
 1087   bool mb_chars = false;
 1088   bool has_period = false;
 1089 
 1090   for (node = 0; node < dfa->nodes_len; ++node)
 1091     switch (dfa->nodes[node].type)
 1092       {
 1093       case CHARACTER:
 1094     if (dfa->nodes[node].opr.c >= ASCII_CHARS)
 1095       mb_chars = true;
 1096     break;
 1097       case ANCHOR:
 1098     switch (dfa->nodes[node].opr.ctx_type)
 1099       {
 1100       case LINE_FIRST:
 1101       case LINE_LAST:
 1102       case BUF_FIRST:
 1103       case BUF_LAST:
 1104         break;
 1105       default:
 1106         /* Word anchors etc. cannot be handled.  It's okay to test
 1107            opr.ctx_type since constraints (for all DFA nodes) are
 1108            created by ORing one or more opr.ctx_type values.  */
 1109         return;
 1110       }
 1111     break;
 1112       case OP_PERIOD:
 1113     has_period = true;
 1114     break;
 1115       case OP_BACK_REF:
 1116       case OP_ALT:
 1117       case END_OF_RE:
 1118       case OP_DUP_ASTERISK:
 1119       case OP_OPEN_SUBEXP:
 1120       case OP_CLOSE_SUBEXP:
 1121     break;
 1122       case COMPLEX_BRACKET:
 1123     return;
 1124       case SIMPLE_BRACKET:
 1125     /* Just double check.  */
 1126     {
 1127       int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
 1128             ? 0
 1129             : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
 1130       for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
 1131         {
 1132           if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
 1133         return;
 1134           rshift = 0;
 1135         }
 1136     }
 1137     break;
 1138       default:
 1139     abort ();
 1140       }
 1141 
 1142   if (mb_chars || has_period)
 1143     for (node = 0; node < dfa->nodes_len; ++node)
 1144       {
 1145     if (dfa->nodes[node].type == CHARACTER
 1146         && dfa->nodes[node].opr.c >= ASCII_CHARS)
 1147       dfa->nodes[node].mb_partial = 0;
 1148     else if (dfa->nodes[node].type == OP_PERIOD)
 1149       dfa->nodes[node].type = OP_UTF8_PERIOD;
 1150       }
 1151 
 1152   /* The search can be in single byte locale.  */
 1153   dfa->mb_cur_max = 1;
 1154   dfa->is_utf8 = 0;
 1155   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
 1156 }
 1157 #endif
 1158 
 1159 /* Analyze the structure tree, and calculate "first", "next", "edest",
 1160    "eclosure", and "inveclosure".  */
 1161 
 1162 static reg_errcode_t
 1163 analyze (regex_t *preg)
 1164 {
 1165   re_dfa_t *dfa = preg->buffer;
 1166   reg_errcode_t ret;
 1167 
 1168   /* Allocate arrays.  */
 1169   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
 1170   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
 1171   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 1172   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 1173   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
 1174             || dfa->edests == NULL || dfa->eclosures == NULL))
 1175     return REG_ESPACE;
 1176 
 1177   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
 1178   if (dfa->subexp_map != NULL)
 1179     {
 1180       Idx i;
 1181       for (i = 0; i < preg->re_nsub; i++)
 1182     dfa->subexp_map[i] = i;
 1183       preorder (dfa->str_tree, optimize_subexps, dfa);
 1184       for (i = 0; i < preg->re_nsub; i++)
 1185     if (dfa->subexp_map[i] != i)
 1186       break;
 1187       if (i == preg->re_nsub)
 1188     {
 1189       re_free (dfa->subexp_map);
 1190       dfa->subexp_map = NULL;
 1191     }
 1192     }
 1193 
 1194   ret = postorder (dfa->str_tree, lower_subexps, preg);
 1195   if (__glibc_unlikely (ret != REG_NOERROR))
 1196     return ret;
 1197   ret = postorder (dfa->str_tree, calc_first, dfa);
 1198   if (__glibc_unlikely (ret != REG_NOERROR))
 1199     return ret;
 1200   preorder (dfa->str_tree, calc_next, dfa);
 1201   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
 1202   if (__glibc_unlikely (ret != REG_NOERROR))
 1203     return ret;
 1204   ret = calc_eclosure (dfa);
 1205   if (__glibc_unlikely (ret != REG_NOERROR))
 1206     return ret;
 1207 
 1208   /* We only need this during the prune_impossible_nodes pass in regexec.c;
 1209      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
 1210   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
 1211       || dfa->nbackref)
 1212     {
 1213       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
 1214       if (__glibc_unlikely (dfa->inveclosures == NULL))
 1215     return REG_ESPACE;
 1216       ret = calc_inveclosure (dfa);
 1217     }
 1218 
 1219   return ret;
 1220 }
 1221 
 1222 /* Our parse trees are very unbalanced, so we cannot use a stack to
 1223    implement parse tree visits.  Instead, we use parent pointers and
 1224    some hairy code in these two functions.  */
 1225 static reg_errcode_t
 1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
 1227        void *extra)
 1228 {
 1229   bin_tree_t *node, *prev;
 1230 
 1231   for (node = root; ; )
 1232     {
 1233       /* Descend down the tree, preferably to the left (or to the right
 1234      if that's the only child).  */
 1235       while (node->left || node->right)
 1236     if (node->left)
 1237       node = node->left;
 1238     else
 1239       node = node->right;
 1240 
 1241       do
 1242     {
 1243       reg_errcode_t err = fn (extra, node);
 1244       if (__glibc_unlikely (err != REG_NOERROR))
 1245         return err;
 1246       if (node->parent == NULL)
 1247         return REG_NOERROR;
 1248       prev = node;
 1249       node = node->parent;
 1250     }
 1251       /* Go up while we have a node that is reached from the right.  */
 1252       while (node->right == prev || node->right == NULL);
 1253       node = node->right;
 1254     }
 1255 }
 1256 
 1257 static reg_errcode_t
 1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
 1259       void *extra)
 1260 {
 1261   bin_tree_t *node;
 1262 
 1263   for (node = root; ; )
 1264     {
 1265       reg_errcode_t err = fn (extra, node);
 1266       if (__glibc_unlikely (err != REG_NOERROR))
 1267     return err;
 1268 
 1269       /* Go to the left node, or up and to the right.  */
 1270       if (node->left)
 1271     node = node->left;
 1272       else
 1273     {
 1274       bin_tree_t *prev = NULL;
 1275       while (node->right == prev || node->right == NULL)
 1276         {
 1277           prev = node;
 1278           node = node->parent;
 1279           if (!node)
 1280         return REG_NOERROR;
 1281         }
 1282       node = node->right;
 1283     }
 1284     }
 1285 }
 1286 
 1287 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
 1288    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
 1289    backreferences as well.  Requires a preorder visit.  */
 1290 static reg_errcode_t
 1291 optimize_subexps (void *extra, bin_tree_t *node)
 1292 {
 1293   re_dfa_t *dfa = (re_dfa_t *) extra;
 1294 
 1295   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
 1296     {
 1297       int idx = node->token.opr.idx;
 1298       node->token.opr.idx = dfa->subexp_map[idx];
 1299       dfa->used_bkref_map |= 1 << node->token.opr.idx;
 1300     }
 1301 
 1302   else if (node->token.type == SUBEXP
 1303        && node->left && node->left->token.type == SUBEXP)
 1304     {
 1305       Idx other_idx = node->left->token.opr.idx;
 1306 
 1307       node->left = node->left->left;
 1308       if (node->left)
 1309     node->left->parent = node;
 1310 
 1311       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
 1312       if (other_idx < BITSET_WORD_BITS)
 1313     dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
 1314     }
 1315 
 1316   return REG_NOERROR;
 1317 }
 1318 
 1319 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
 1320    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
 1321 static reg_errcode_t
 1322 lower_subexps (void *extra, bin_tree_t *node)
 1323 {
 1324   regex_t *preg = (regex_t *) extra;
 1325   reg_errcode_t err = REG_NOERROR;
 1326 
 1327   if (node->left && node->left->token.type == SUBEXP)
 1328     {
 1329       node->left = lower_subexp (&err, preg, node->left);
 1330       if (node->left)
 1331     node->left->parent = node;
 1332     }
 1333   if (node->right && node->right->token.type == SUBEXP)
 1334     {
 1335       node->right = lower_subexp (&err, preg, node->right);
 1336       if (node->right)
 1337     node->right->parent = node;
 1338     }
 1339 
 1340   return err;
 1341 }
 1342 
 1343 static bin_tree_t *
 1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
 1345 {
 1346   re_dfa_t *dfa = preg->buffer;
 1347   bin_tree_t *body = node->left;
 1348   bin_tree_t *op, *cls, *tree1, *tree;
 1349 
 1350   if (preg->no_sub
 1351       /* We do not optimize empty subexpressions, because otherwise we may
 1352      have bad CONCAT nodes with NULL children.  This is obviously not
 1353      very common, so we do not lose much.  An example that triggers
 1354      this case is the sed "script" /\(\)/x.  */
 1355       && node->left != NULL
 1356       && (node->token.opr.idx >= BITSET_WORD_BITS
 1357       || !(dfa->used_bkref_map
 1358            & ((bitset_word_t) 1 << node->token.opr.idx))))
 1359     return node->left;
 1360 
 1361   /* Convert the SUBEXP node to the concatenation of an
 1362      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
 1363   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
 1364   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
 1365   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
 1366   tree = create_tree (dfa, op, tree1, CONCAT);
 1367   if (__glibc_unlikely (tree == NULL || tree1 == NULL
 1368             || op == NULL || cls == NULL))
 1369     {
 1370       *err = REG_ESPACE;
 1371       return NULL;
 1372     }
 1373 
 1374   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
 1375   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
 1376   return tree;
 1377 }
 1378 
 1379 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
 1380    nodes.  Requires a postorder visit.  */
 1381 static reg_errcode_t
 1382 calc_first (void *extra, bin_tree_t *node)
 1383 {
 1384   re_dfa_t *dfa = (re_dfa_t *) extra;
 1385   if (node->token.type == CONCAT)
 1386     {
 1387       node->first = node->left->first;
 1388       node->node_idx = node->left->node_idx;
 1389     }
 1390   else
 1391     {
 1392       node->first = node;
 1393       node->node_idx = re_dfa_add_node (dfa, node->token);
 1394       if (__glibc_unlikely (node->node_idx == -1))
 1395     return REG_ESPACE;
 1396       if (node->token.type == ANCHOR)
 1397     dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
 1398     }
 1399   return REG_NOERROR;
 1400 }
 1401 
 1402 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
 1403 static reg_errcode_t
 1404 calc_next (void *extra, bin_tree_t *node)
 1405 {
 1406   switch (node->token.type)
 1407     {
 1408     case OP_DUP_ASTERISK:
 1409       node->left->next = node;
 1410       break;
 1411     case CONCAT:
 1412       node->left->next = node->right->first;
 1413       node->right->next = node->next;
 1414       break;
 1415     default:
 1416       if (node->left)
 1417     node->left->next = node->next;
 1418       if (node->right)
 1419     node->right->next = node->next;
 1420       break;
 1421     }
 1422   return REG_NOERROR;
 1423 }
 1424 
 1425 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
 1426 static reg_errcode_t
 1427 link_nfa_nodes (void *extra, bin_tree_t *node)
 1428 {
 1429   re_dfa_t *dfa = (re_dfa_t *) extra;
 1430   Idx idx = node->node_idx;
 1431   reg_errcode_t err = REG_NOERROR;
 1432 
 1433   switch (node->token.type)
 1434     {
 1435     case CONCAT:
 1436       break;
 1437 
 1438     case END_OF_RE:
 1439       DEBUG_ASSERT (node->next == NULL);
 1440       break;
 1441 
 1442     case OP_DUP_ASTERISK:
 1443     case OP_ALT:
 1444       {
 1445     Idx left, right;
 1446     dfa->has_plural_match = 1;
 1447     if (node->left != NULL)
 1448       left = node->left->first->node_idx;
 1449     else
 1450       left = node->next->node_idx;
 1451     if (node->right != NULL)
 1452       right = node->right->first->node_idx;
 1453     else
 1454       right = node->next->node_idx;
 1455     DEBUG_ASSERT (left > -1);
 1456     DEBUG_ASSERT (right > -1);
 1457     err = re_node_set_init_2 (dfa->edests + idx, left, right);
 1458       }
 1459       break;
 1460 
 1461     case ANCHOR:
 1462     case OP_OPEN_SUBEXP:
 1463     case OP_CLOSE_SUBEXP:
 1464       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
 1465       break;
 1466 
 1467     case OP_BACK_REF:
 1468       dfa->nexts[idx] = node->next->node_idx;
 1469       if (node->token.type == OP_BACK_REF)
 1470     err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
 1471       break;
 1472 
 1473     default:
 1474       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
 1475       dfa->nexts[idx] = node->next->node_idx;
 1476       break;
 1477     }
 1478 
 1479   return err;
 1480 }
 1481 
 1482 /* Duplicate the epsilon closure of the node ROOT_NODE.
 1483    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
 1484    to their own constraint.  */
 1485 
 1486 static reg_errcode_t
 1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
 1488             Idx root_node, unsigned int init_constraint)
 1489 {
 1490   Idx org_node, clone_node;
 1491   bool ok;
 1492   unsigned int constraint = init_constraint;
 1493   for (org_node = top_org_node, clone_node = top_clone_node;;)
 1494     {
 1495       Idx org_dest, clone_dest;
 1496       if (dfa->nodes[org_node].type == OP_BACK_REF)
 1497     {
 1498       /* If the back reference epsilon-transit, its destination must
 1499          also have the constraint.  Then duplicate the epsilon closure
 1500          of the destination of the back reference, and store it in
 1501          edests of the back reference.  */
 1502       org_dest = dfa->nexts[org_node];
 1503       re_node_set_empty (dfa->edests + clone_node);
 1504       clone_dest = duplicate_node (dfa, org_dest, constraint);
 1505       if (__glibc_unlikely (clone_dest == -1))
 1506         return REG_ESPACE;
 1507       dfa->nexts[clone_node] = dfa->nexts[org_node];
 1508       ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1509       if (__glibc_unlikely (! ok))
 1510         return REG_ESPACE;
 1511     }
 1512       else if (dfa->edests[org_node].nelem == 0)
 1513     {
 1514       /* In case of the node can't epsilon-transit, don't duplicate the
 1515          destination and store the original destination as the
 1516          destination of the node.  */
 1517       dfa->nexts[clone_node] = dfa->nexts[org_node];
 1518       break;
 1519     }
 1520       else if (dfa->edests[org_node].nelem == 1)
 1521     {
 1522       /* In case of the node can epsilon-transit, and it has only one
 1523          destination.  */
 1524       org_dest = dfa->edests[org_node].elems[0];
 1525       re_node_set_empty (dfa->edests + clone_node);
 1526       /* If the node is root_node itself, it means the epsilon closure
 1527          has a loop.  Then tie it to the destination of the root_node.  */
 1528       if (org_node == root_node && clone_node != org_node)
 1529         {
 1530           ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
 1531           if (__glibc_unlikely (! ok))
 1532             return REG_ESPACE;
 1533           break;
 1534         }
 1535       /* In case the node has another constraint, append it.  */
 1536       constraint |= dfa->nodes[org_node].constraint;
 1537       clone_dest = duplicate_node (dfa, org_dest, constraint);
 1538       if (__glibc_unlikely (clone_dest == -1))
 1539         return REG_ESPACE;
 1540       ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1541       if (__glibc_unlikely (! ok))
 1542         return REG_ESPACE;
 1543     }
 1544       else /* dfa->edests[org_node].nelem == 2 */
 1545     {
 1546       /* In case of the node can epsilon-transit, and it has two
 1547          destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
 1548       org_dest = dfa->edests[org_node].elems[0];
 1549       re_node_set_empty (dfa->edests + clone_node);
 1550       /* Search for a duplicated node which satisfies the constraint.  */
 1551       clone_dest = search_duplicated_node (dfa, org_dest, constraint);
 1552       if (clone_dest == -1)
 1553         {
 1554           /* There is no such duplicated node, create a new one.  */
 1555           reg_errcode_t err;
 1556           clone_dest = duplicate_node (dfa, org_dest, constraint);
 1557           if (__glibc_unlikely (clone_dest == -1))
 1558         return REG_ESPACE;
 1559           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1560           if (__glibc_unlikely (! ok))
 1561         return REG_ESPACE;
 1562           err = duplicate_node_closure (dfa, org_dest, clone_dest,
 1563                         root_node, constraint);
 1564           if (__glibc_unlikely (err != REG_NOERROR))
 1565         return err;
 1566         }
 1567       else
 1568         {
 1569           /* There is a duplicated node which satisfies the constraint,
 1570          use it to avoid infinite loop.  */
 1571           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1572           if (__glibc_unlikely (! ok))
 1573         return REG_ESPACE;
 1574         }
 1575 
 1576       org_dest = dfa->edests[org_node].elems[1];
 1577       clone_dest = duplicate_node (dfa, org_dest, constraint);
 1578       if (__glibc_unlikely (clone_dest == -1))
 1579         return REG_ESPACE;
 1580       ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1581       if (__glibc_unlikely (! ok))
 1582         return REG_ESPACE;
 1583     }
 1584       org_node = org_dest;
 1585       clone_node = clone_dest;
 1586     }
 1587   return REG_NOERROR;
 1588 }
 1589 
 1590 /* Search for a node which is duplicated from the node ORG_NODE, and
 1591    satisfies the constraint CONSTRAINT.  */
 1592 
 1593 static Idx
 1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
 1595             unsigned int constraint)
 1596 {
 1597   Idx idx;
 1598   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
 1599     {
 1600       if (org_node == dfa->org_indices[idx]
 1601       && constraint == dfa->nodes[idx].constraint)
 1602     return idx; /* Found.  */
 1603     }
 1604   return -1; /* Not found.  */
 1605 }
 1606 
 1607 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
 1608    Return the index of the new node, or -1 if insufficient storage is
 1609    available.  */
 1610 
 1611 static Idx
 1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
 1613 {
 1614   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
 1615   if (__glibc_likely (dup_idx != -1))
 1616     {
 1617       dfa->nodes[dup_idx].constraint = constraint;
 1618       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
 1619       dfa->nodes[dup_idx].duplicated = 1;
 1620 
 1621       /* Store the index of the original node.  */
 1622       dfa->org_indices[dup_idx] = org_idx;
 1623     }
 1624   return dup_idx;
 1625 }
 1626 
 1627 static reg_errcode_t
 1628 calc_inveclosure (re_dfa_t *dfa)
 1629 {
 1630   Idx src, idx;
 1631   bool ok;
 1632   for (idx = 0; idx < dfa->nodes_len; ++idx)
 1633     re_node_set_init_empty (dfa->inveclosures + idx);
 1634 
 1635   for (src = 0; src < dfa->nodes_len; ++src)
 1636     {
 1637       Idx *elems = dfa->eclosures[src].elems;
 1638       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
 1639     {
 1640       ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
 1641       if (__glibc_unlikely (! ok))
 1642         return REG_ESPACE;
 1643     }
 1644     }
 1645 
 1646   return REG_NOERROR;
 1647 }
 1648 
 1649 /* Calculate "eclosure" for all the node in DFA.  */
 1650 
 1651 static reg_errcode_t
 1652 calc_eclosure (re_dfa_t *dfa)
 1653 {
 1654   Idx node_idx;
 1655   bool incomplete;
 1656   DEBUG_ASSERT (dfa->nodes_len > 0);
 1657   incomplete = false;
 1658   /* For each nodes, calculate epsilon closure.  */
 1659   for (node_idx = 0; ; ++node_idx)
 1660     {
 1661       reg_errcode_t err;
 1662       re_node_set eclosure_elem;
 1663       if (node_idx == dfa->nodes_len)
 1664     {
 1665       if (!incomplete)
 1666         break;
 1667       incomplete = false;
 1668       node_idx = 0;
 1669     }
 1670 
 1671       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
 1672 
 1673       /* If we have already calculated, skip it.  */
 1674       if (dfa->eclosures[node_idx].nelem != 0)
 1675     continue;
 1676       /* Calculate epsilon closure of 'node_idx'.  */
 1677       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
 1678       if (__glibc_unlikely (err != REG_NOERROR))
 1679     return err;
 1680 
 1681       if (dfa->eclosures[node_idx].nelem == 0)
 1682     {
 1683       incomplete = true;
 1684       re_node_set_free (&eclosure_elem);
 1685     }
 1686     }
 1687   return REG_NOERROR;
 1688 }
 1689 
 1690 /* Calculate epsilon closure of NODE.  */
 1691 
 1692 static reg_errcode_t
 1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
 1694 {
 1695   reg_errcode_t err;
 1696   Idx i;
 1697   re_node_set eclosure;
 1698   bool ok;
 1699   bool incomplete = false;
 1700   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
 1701   if (__glibc_unlikely (err != REG_NOERROR))
 1702     return err;
 1703 
 1704   /* This indicates that we are calculating this node now.
 1705      We reference this value to avoid infinite loop.  */
 1706   dfa->eclosures[node].nelem = -1;
 1707 
 1708   /* If the current node has constraints, duplicate all nodes
 1709      since they must inherit the constraints.  */
 1710   if (dfa->nodes[node].constraint
 1711       && dfa->edests[node].nelem
 1712       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
 1713     {
 1714       err = duplicate_node_closure (dfa, node, node, node,
 1715                     dfa->nodes[node].constraint);
 1716       if (__glibc_unlikely (err != REG_NOERROR))
 1717     return err;
 1718     }
 1719 
 1720   /* Expand each epsilon destination nodes.  */
 1721   if (IS_EPSILON_NODE(dfa->nodes[node].type))
 1722     for (i = 0; i < dfa->edests[node].nelem; ++i)
 1723       {
 1724     re_node_set eclosure_elem;
 1725     Idx edest = dfa->edests[node].elems[i];
 1726     /* If calculating the epsilon closure of 'edest' is in progress,
 1727        return intermediate result.  */
 1728     if (dfa->eclosures[edest].nelem == -1)
 1729       {
 1730         incomplete = true;
 1731         continue;
 1732       }
 1733     /* If we haven't calculated the epsilon closure of 'edest' yet,
 1734        calculate now. Otherwise use calculated epsilon closure.  */
 1735     if (dfa->eclosures[edest].nelem == 0)
 1736       {
 1737         err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
 1738         if (__glibc_unlikely (err != REG_NOERROR))
 1739           return err;
 1740       }
 1741     else
 1742       eclosure_elem = dfa->eclosures[edest];
 1743     /* Merge the epsilon closure of 'edest'.  */
 1744     err = re_node_set_merge (&eclosure, &eclosure_elem);
 1745     if (__glibc_unlikely (err != REG_NOERROR))
 1746       return err;
 1747     /* If the epsilon closure of 'edest' is incomplete,
 1748        the epsilon closure of this node is also incomplete.  */
 1749     if (dfa->eclosures[edest].nelem == 0)
 1750       {
 1751         incomplete = true;
 1752         re_node_set_free (&eclosure_elem);
 1753       }
 1754       }
 1755 
 1756   /* An epsilon closure includes itself.  */
 1757   ok = re_node_set_insert (&eclosure, node);
 1758   if (__glibc_unlikely (! ok))
 1759     return REG_ESPACE;
 1760   if (incomplete && !root)
 1761     dfa->eclosures[node].nelem = 0;
 1762   else
 1763     dfa->eclosures[node] = eclosure;
 1764   *new_set = eclosure;
 1765   return REG_NOERROR;
 1766 }
 1767 
 1768 /* Functions for token which are used in the parser.  */
 1769 
 1770 /* Fetch a token from INPUT.
 1771    We must not use this function inside bracket expressions.  */
 1772 
 1773 static void
 1774 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
 1775 {
 1776   re_string_skip_bytes (input, peek_token (result, input, syntax));
 1777 }
 1778 
 1779 /* Peek a token from INPUT, and return the length of the token.
 1780    We must not use this function inside bracket expressions.  */
 1781 
 1782 static int
 1783 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
 1784 {
 1785   unsigned char c;
 1786 
 1787   if (re_string_eoi (input))
 1788     {
 1789       token->type = END_OF_RE;
 1790       return 0;
 1791     }
 1792 
 1793   c = re_string_peek_byte (input, 0);
 1794   token->opr.c = c;
 1795 
 1796   token->word_char = 0;
 1797 #ifdef RE_ENABLE_I18N
 1798   token->mb_partial = 0;
 1799   if (input->mb_cur_max > 1
 1800       && !re_string_first_byte (input, re_string_cur_idx (input)))
 1801     {
 1802       token->type = CHARACTER;
 1803       token->mb_partial = 1;
 1804       return 1;
 1805     }
 1806 #endif
 1807   if (c == '\\')
 1808     {
 1809       unsigned char c2;
 1810       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
 1811     {
 1812       token->type = BACK_SLASH;
 1813       return 1;
 1814     }
 1815 
 1816       c2 = re_string_peek_byte_case (input, 1);
 1817       token->opr.c = c2;
 1818       token->type = CHARACTER;
 1819 #ifdef RE_ENABLE_I18N
 1820       if (input->mb_cur_max > 1)
 1821     {
 1822       wint_t wc = re_string_wchar_at (input,
 1823                       re_string_cur_idx (input) + 1);
 1824       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
 1825     }
 1826       else
 1827 #endif
 1828     token->word_char = IS_WORD_CHAR (c2) != 0;
 1829 
 1830       switch (c2)
 1831     {
 1832     case '|':
 1833       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
 1834         token->type = OP_ALT;
 1835       break;
 1836     case '1': case '2': case '3': case '4': case '5':
 1837     case '6': case '7': case '8': case '9':
 1838       if (!(syntax & RE_NO_BK_REFS))
 1839         {
 1840           token->type = OP_BACK_REF;
 1841           token->opr.idx = c2 - '1';
 1842         }
 1843       break;
 1844     case '<':
 1845       if (!(syntax & RE_NO_GNU_OPS))
 1846         {
 1847           token->type = ANCHOR;
 1848           token->opr.ctx_type = WORD_FIRST;
 1849         }
 1850       break;
 1851     case '>':
 1852       if (!(syntax & RE_NO_GNU_OPS))
 1853         {
 1854           token->type = ANCHOR;
 1855           token->opr.ctx_type = WORD_LAST;
 1856         }
 1857       break;
 1858     case 'b':
 1859       if (!(syntax & RE_NO_GNU_OPS))
 1860         {
 1861           token->type = ANCHOR;
 1862           token->opr.ctx_type = WORD_DELIM;
 1863         }
 1864       break;
 1865     case 'B':
 1866       if (!(syntax & RE_NO_GNU_OPS))
 1867         {
 1868           token->type = ANCHOR;
 1869           token->opr.ctx_type = NOT_WORD_DELIM;
 1870         }
 1871       break;
 1872     case 'w':
 1873       if (!(syntax & RE_NO_GNU_OPS))
 1874         token->type = OP_WORD;
 1875       break;
 1876     case 'W':
 1877       if (!(syntax & RE_NO_GNU_OPS))
 1878         token->type = OP_NOTWORD;
 1879       break;
 1880     case 's':
 1881       if (!(syntax & RE_NO_GNU_OPS))
 1882         token->type = OP_SPACE;
 1883       break;
 1884     case 'S':
 1885       if (!(syntax & RE_NO_GNU_OPS))
 1886         token->type = OP_NOTSPACE;
 1887       break;
 1888     case '`':
 1889       if (!(syntax & RE_NO_GNU_OPS))
 1890         {
 1891           token->type = ANCHOR;
 1892           token->opr.ctx_type = BUF_FIRST;
 1893         }
 1894       break;
 1895     case '\'':
 1896       if (!(syntax & RE_NO_GNU_OPS))
 1897         {
 1898           token->type = ANCHOR;
 1899           token->opr.ctx_type = BUF_LAST;
 1900         }
 1901       break;
 1902     case '(':
 1903       if (!(syntax & RE_NO_BK_PARENS))
 1904         token->type = OP_OPEN_SUBEXP;
 1905       break;
 1906     case ')':
 1907       if (!(syntax & RE_NO_BK_PARENS))
 1908         token->type = OP_CLOSE_SUBEXP;
 1909       break;
 1910     case '+':
 1911       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
 1912         token->type = OP_DUP_PLUS;
 1913       break;
 1914     case '?':
 1915       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
 1916         token->type = OP_DUP_QUESTION;
 1917       break;
 1918     case '{':
 1919       if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
 1920         token->type = OP_OPEN_DUP_NUM;
 1921       break;
 1922     case '}':
 1923       if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
 1924         token->type = OP_CLOSE_DUP_NUM;
 1925       break;
 1926     default:
 1927       break;
 1928     }
 1929       return 2;
 1930     }
 1931 
 1932   token->type = CHARACTER;
 1933 #ifdef RE_ENABLE_I18N
 1934   if (input->mb_cur_max > 1)
 1935     {
 1936       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
 1937       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
 1938     }
 1939   else
 1940 #endif
 1941     token->word_char = IS_WORD_CHAR (token->opr.c);
 1942 
 1943   switch (c)
 1944     {
 1945     case '\n':
 1946       if (syntax & RE_NEWLINE_ALT)
 1947     token->type = OP_ALT;
 1948       break;
 1949     case '|':
 1950       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
 1951     token->type = OP_ALT;
 1952       break;
 1953     case '*':
 1954       token->type = OP_DUP_ASTERISK;
 1955       break;
 1956     case '+':
 1957       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
 1958     token->type = OP_DUP_PLUS;
 1959       break;
 1960     case '?':
 1961       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
 1962     token->type = OP_DUP_QUESTION;
 1963       break;
 1964     case '{':
 1965       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
 1966     token->type = OP_OPEN_DUP_NUM;
 1967       break;
 1968     case '}':
 1969       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
 1970     token->type = OP_CLOSE_DUP_NUM;
 1971       break;
 1972     case '(':
 1973       if (syntax & RE_NO_BK_PARENS)
 1974     token->type = OP_OPEN_SUBEXP;
 1975       break;
 1976     case ')':
 1977       if (syntax & RE_NO_BK_PARENS)
 1978     token->type = OP_CLOSE_SUBEXP;
 1979       break;
 1980     case '[':
 1981       token->type = OP_OPEN_BRACKET;
 1982       break;
 1983     case '.':
 1984       token->type = OP_PERIOD;
 1985       break;
 1986     case '^':
 1987       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
 1988       && re_string_cur_idx (input) != 0)
 1989     {
 1990       char prev = re_string_peek_byte (input, -1);
 1991       if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
 1992         break;
 1993     }
 1994       token->type = ANCHOR;
 1995       token->opr.ctx_type = LINE_FIRST;
 1996       break;
 1997     case '$':
 1998       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
 1999       && re_string_cur_idx (input) + 1 != re_string_length (input))
 2000     {
 2001       re_token_t next;
 2002       re_string_skip_bytes (input, 1);
 2003       peek_token (&next, input, syntax);
 2004       re_string_skip_bytes (input, -1);
 2005       if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
 2006         break;
 2007     }
 2008       token->type = ANCHOR;
 2009       token->opr.ctx_type = LINE_LAST;
 2010       break;
 2011     default:
 2012       break;
 2013     }
 2014   return 1;
 2015 }
 2016 
 2017 /* Peek a token from INPUT, and return the length of the token.
 2018    We must not use this function out of bracket expressions.  */
 2019 
 2020 static int
 2021 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
 2022 {
 2023   unsigned char c;
 2024   if (re_string_eoi (input))
 2025     {
 2026       token->type = END_OF_RE;
 2027       return 0;
 2028     }
 2029   c = re_string_peek_byte (input, 0);
 2030   token->opr.c = c;
 2031 
 2032 #ifdef RE_ENABLE_I18N
 2033   if (input->mb_cur_max > 1
 2034       && !re_string_first_byte (input, re_string_cur_idx (input)))
 2035     {
 2036       token->type = CHARACTER;
 2037       return 1;
 2038     }
 2039 #endif /* RE_ENABLE_I18N */
 2040 
 2041   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
 2042       && re_string_cur_idx (input) + 1 < re_string_length (input))
 2043     {
 2044       /* In this case, '\' escape a character.  */
 2045       unsigned char c2;
 2046       re_string_skip_bytes (input, 1);
 2047       c2 = re_string_peek_byte (input, 0);
 2048       token->opr.c = c2;
 2049       token->type = CHARACTER;
 2050       return 1;
 2051     }
 2052   if (c == '[') /* '[' is a special char in a bracket exps.  */
 2053     {
 2054       unsigned char c2;
 2055       int token_len;
 2056       if (re_string_cur_idx (input) + 1 < re_string_length (input))
 2057     c2 = re_string_peek_byte (input, 1);
 2058       else
 2059     c2 = 0;
 2060       token->opr.c = c2;
 2061       token_len = 2;
 2062       switch (c2)
 2063     {
 2064     case '.':
 2065       token->type = OP_OPEN_COLL_ELEM;
 2066       break;
 2067 
 2068     case '=':
 2069       token->type = OP_OPEN_EQUIV_CLASS;
 2070       break;
 2071 
 2072     case ':':
 2073       if (syntax & RE_CHAR_CLASSES)
 2074         {
 2075           token->type = OP_OPEN_CHAR_CLASS;
 2076           break;
 2077         }
 2078       FALLTHROUGH;
 2079     default:
 2080       token->type = CHARACTER;
 2081       token->opr.c = c;
 2082       token_len = 1;
 2083       break;
 2084     }
 2085       return token_len;
 2086     }
 2087   switch (c)
 2088     {
 2089     case '-':
 2090       token->type = OP_CHARSET_RANGE;
 2091       break;
 2092     case ']':
 2093       token->type = OP_CLOSE_BRACKET;
 2094       break;
 2095     case '^':
 2096       token->type = OP_NON_MATCH_LIST;
 2097       break;
 2098     default:
 2099       token->type = CHARACTER;
 2100     }
 2101   return 1;
 2102 }
 2103 
 2104 /* Functions for parser.  */
 2105 
 2106 /* Entry point of the parser.
 2107    Parse the regular expression REGEXP and return the structure tree.
 2108    If an error occurs, ERR is set by error code, and return NULL.
 2109    This function build the following tree, from regular expression <reg_exp>:
 2110        CAT
 2111        / \
 2112       /   \
 2113    <reg_exp>  EOR
 2114 
 2115    CAT means concatenation.
 2116    EOR means end of regular expression.  */
 2117 
 2118 static bin_tree_t *
 2119 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
 2120        reg_errcode_t *err)
 2121 {
 2122   re_dfa_t *dfa = preg->buffer;
 2123   bin_tree_t *tree, *eor, *root;
 2124   re_token_t current_token;
 2125   dfa->syntax = syntax;
 2126   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
 2127   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
 2128   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2129     return NULL;
 2130   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
 2131   if (tree != NULL)
 2132     root = create_tree (dfa, tree, eor, CONCAT);
 2133   else
 2134     root = eor;
 2135   if (__glibc_unlikely (eor == NULL || root == NULL))
 2136     {
 2137       *err = REG_ESPACE;
 2138       return NULL;
 2139     }
 2140   return root;
 2141 }
 2142 
 2143 /* This function build the following tree, from regular expression
 2144    <branch1>|<branch2>:
 2145        ALT
 2146        / \
 2147       /   \
 2148    <branch1> <branch2>
 2149 
 2150    ALT means alternative, which represents the operator '|'.  */
 2151 
 2152 static bin_tree_t *
 2153 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
 2154            reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 2155 {
 2156   re_dfa_t *dfa = preg->buffer;
 2157   bin_tree_t *tree, *branch = NULL;
 2158   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
 2159   tree = parse_branch (regexp, preg, token, syntax, nest, err);
 2160   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2161     return NULL;
 2162 
 2163   while (token->type == OP_ALT)
 2164     {
 2165       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
 2166       if (token->type != OP_ALT && token->type != END_OF_RE
 2167       && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
 2168     {
 2169       bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
 2170       dfa->completed_bkref_map = initial_bkref_map;
 2171       branch = parse_branch (regexp, preg, token, syntax, nest, err);
 2172       if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
 2173         {
 2174           if (tree != NULL)
 2175         postorder (tree, free_tree, NULL);
 2176           return NULL;
 2177         }
 2178       dfa->completed_bkref_map |= accumulated_bkref_map;
 2179     }
 2180       else
 2181     branch = NULL;
 2182       tree = create_tree (dfa, tree, branch, OP_ALT);
 2183       if (__glibc_unlikely (tree == NULL))
 2184     {
 2185       *err = REG_ESPACE;
 2186       return NULL;
 2187     }
 2188     }
 2189   return tree;
 2190 }
 2191 
 2192 /* This function build the following tree, from regular expression
 2193    <exp1><exp2>:
 2194     CAT
 2195     / \
 2196        /   \
 2197    <exp1> <exp2>
 2198 
 2199    CAT means concatenation.  */
 2200 
 2201 static bin_tree_t *
 2202 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
 2203           reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 2204 {
 2205   bin_tree_t *tree, *expr;
 2206   re_dfa_t *dfa = preg->buffer;
 2207   tree = parse_expression (regexp, preg, token, syntax, nest, err);
 2208   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2209     return NULL;
 2210 
 2211   while (token->type != OP_ALT && token->type != END_OF_RE
 2212      && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
 2213     {
 2214       expr = parse_expression (regexp, preg, token, syntax, nest, err);
 2215       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
 2216     {
 2217       if (tree != NULL)
 2218         postorder (tree, free_tree, NULL);
 2219       return NULL;
 2220     }
 2221       if (tree != NULL && expr != NULL)
 2222     {
 2223       bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
 2224       if (newtree == NULL)
 2225         {
 2226           postorder (expr, free_tree, NULL);
 2227           postorder (tree, free_tree, NULL);
 2228           *err = REG_ESPACE;
 2229           return NULL;
 2230         }
 2231       tree = newtree;
 2232     }
 2233       else if (tree == NULL)
 2234     tree = expr;
 2235       /* Otherwise expr == NULL, we don't need to create new tree.  */
 2236     }
 2237   return tree;
 2238 }
 2239 
 2240 /* This function build the following tree, from regular expression a*:
 2241      *
 2242      |
 2243      a
 2244 */
 2245 
 2246 static bin_tree_t *
 2247 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
 2248           reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 2249 {
 2250   re_dfa_t *dfa = preg->buffer;
 2251   bin_tree_t *tree;
 2252   switch (token->type)
 2253     {
 2254     case CHARACTER:
 2255       tree = create_token_tree (dfa, NULL, NULL, token);
 2256       if (__glibc_unlikely (tree == NULL))
 2257     {
 2258       *err = REG_ESPACE;
 2259       return NULL;
 2260     }
 2261 #ifdef RE_ENABLE_I18N
 2262       if (dfa->mb_cur_max > 1)
 2263     {
 2264       while (!re_string_eoi (regexp)
 2265          && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
 2266         {
 2267           bin_tree_t *mbc_remain;
 2268           fetch_token (token, regexp, syntax);
 2269           mbc_remain = create_token_tree (dfa, NULL, NULL, token);
 2270           tree = create_tree (dfa, tree, mbc_remain, CONCAT);
 2271           if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
 2272         {
 2273           *err = REG_ESPACE;
 2274           return NULL;
 2275         }
 2276         }
 2277     }
 2278 #endif
 2279       break;
 2280 
 2281     case OP_OPEN_SUBEXP:
 2282       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
 2283       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2284     return NULL;
 2285       break;
 2286 
 2287     case OP_OPEN_BRACKET:
 2288       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
 2289       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2290     return NULL;
 2291       break;
 2292 
 2293     case OP_BACK_REF:
 2294       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
 2295     {
 2296       *err = REG_ESUBREG;
 2297       return NULL;
 2298     }
 2299       dfa->used_bkref_map |= 1 << token->opr.idx;
 2300       tree = create_token_tree (dfa, NULL, NULL, token);
 2301       if (__glibc_unlikely (tree == NULL))
 2302     {
 2303       *err = REG_ESPACE;
 2304       return NULL;
 2305     }
 2306       ++dfa->nbackref;
 2307       dfa->has_mb_node = 1;
 2308       break;
 2309 
 2310     case OP_OPEN_DUP_NUM:
 2311       if (syntax & RE_CONTEXT_INVALID_DUP)
 2312     {
 2313       *err = REG_BADRPT;
 2314       return NULL;
 2315     }
 2316       FALLTHROUGH;
 2317     case OP_DUP_ASTERISK:
 2318     case OP_DUP_PLUS:
 2319     case OP_DUP_QUESTION:
 2320       if (syntax & RE_CONTEXT_INVALID_OPS)
 2321     {
 2322       *err = REG_BADRPT;
 2323       return NULL;
 2324     }
 2325       else if (syntax & RE_CONTEXT_INDEP_OPS)
 2326     {
 2327       fetch_token (token, regexp, syntax);
 2328       return parse_expression (regexp, preg, token, syntax, nest, err);
 2329     }
 2330       FALLTHROUGH;
 2331     case OP_CLOSE_SUBEXP:
 2332       if ((token->type == OP_CLOSE_SUBEXP)
 2333       && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
 2334     {
 2335       *err = REG_ERPAREN;
 2336       return NULL;
 2337     }
 2338       FALLTHROUGH;
 2339     case OP_CLOSE_DUP_NUM:
 2340       /* We treat it as a normal character.  */
 2341 
 2342       /* Then we can these characters as normal characters.  */
 2343       token->type = CHARACTER;
 2344       /* mb_partial and word_char bits should be initialized already
 2345      by peek_token.  */
 2346       tree = create_token_tree (dfa, NULL, NULL, token);
 2347       if (__glibc_unlikely (tree == NULL))
 2348     {
 2349       *err = REG_ESPACE;
 2350       return NULL;
 2351     }
 2352       break;
 2353 
 2354     case ANCHOR:
 2355       if ((token->opr.ctx_type
 2356        & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
 2357       && dfa->word_ops_used == 0)
 2358     init_word_char (dfa);
 2359       if (token->opr.ctx_type == WORD_DELIM
 2360       || token->opr.ctx_type == NOT_WORD_DELIM)
 2361     {
 2362       bin_tree_t *tree_first, *tree_last;
 2363       if (token->opr.ctx_type == WORD_DELIM)
 2364         {
 2365           token->opr.ctx_type = WORD_FIRST;
 2366           tree_first = create_token_tree (dfa, NULL, NULL, token);
 2367           token->opr.ctx_type = WORD_LAST;
 2368         }
 2369       else
 2370         {
 2371           token->opr.ctx_type = INSIDE_WORD;
 2372           tree_first = create_token_tree (dfa, NULL, NULL, token);
 2373           token->opr.ctx_type = INSIDE_NOTWORD;
 2374         }
 2375       tree_last = create_token_tree (dfa, NULL, NULL, token);
 2376       tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
 2377       if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
 2378                 || tree == NULL))
 2379         {
 2380           *err = REG_ESPACE;
 2381           return NULL;
 2382         }
 2383     }
 2384       else
 2385     {
 2386       tree = create_token_tree (dfa, NULL, NULL, token);
 2387       if (__glibc_unlikely (tree == NULL))
 2388         {
 2389           *err = REG_ESPACE;
 2390           return NULL;
 2391         }
 2392     }
 2393       /* We must return here, since ANCHORs can't be followed
 2394      by repetition operators.
 2395      eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
 2396          it must not be "<ANCHOR(^)><REPEAT(*)>".  */
 2397       fetch_token (token, regexp, syntax);
 2398       return tree;
 2399 
 2400     case OP_PERIOD:
 2401       tree = create_token_tree (dfa, NULL, NULL, token);
 2402       if (__glibc_unlikely (tree == NULL))
 2403     {
 2404       *err = REG_ESPACE;
 2405       return NULL;
 2406     }
 2407       if (dfa->mb_cur_max > 1)
 2408     dfa->has_mb_node = 1;
 2409       break;
 2410 
 2411     case OP_WORD:
 2412     case OP_NOTWORD:
 2413       tree = build_charclass_op (dfa, regexp->trans,
 2414                  "alnum",
 2415                  "_",
 2416                  token->type == OP_NOTWORD, err);
 2417       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2418     return NULL;
 2419       break;
 2420 
 2421     case OP_SPACE:
 2422     case OP_NOTSPACE:
 2423       tree = build_charclass_op (dfa, regexp->trans,
 2424                  "space",
 2425                  "",
 2426                  token->type == OP_NOTSPACE, err);
 2427       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
 2428     return NULL;
 2429       break;
 2430 
 2431     case OP_ALT:
 2432     case END_OF_RE:
 2433       return NULL;
 2434 
 2435     case BACK_SLASH:
 2436       *err = REG_EESCAPE;
 2437       return NULL;
 2438 
 2439     default:
 2440       /* Must not happen?  */
 2441       DEBUG_ASSERT (false);
 2442       return NULL;
 2443     }
 2444   fetch_token (token, regexp, syntax);
 2445 
 2446   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
 2447      || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
 2448     {
 2449       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
 2450                        syntax, err);
 2451       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
 2452     {
 2453       if (tree != NULL)
 2454         postorder (tree, free_tree, NULL);
 2455       return NULL;
 2456     }
 2457       tree = dup_tree;
 2458       /* In BRE consecutive duplications are not allowed.  */
 2459       if ((syntax & RE_CONTEXT_INVALID_DUP)
 2460       && (token->type == OP_DUP_ASTERISK
 2461           || token->type == OP_OPEN_DUP_NUM))
 2462     {
 2463       if (tree != NULL)
 2464         postorder (tree, free_tree, NULL);
 2465       *err = REG_BADRPT;
 2466       return NULL;
 2467     }
 2468     }
 2469 
 2470   return tree;
 2471 }
 2472 
 2473 /* This function build the following tree, from regular expression
 2474    (<reg_exp>):
 2475      SUBEXP
 2476         |
 2477     <reg_exp>
 2478 */
 2479 
 2480 static bin_tree_t *
 2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
 2482            reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 2483 {
 2484   re_dfa_t *dfa = preg->buffer;
 2485   bin_tree_t *tree;
 2486   size_t cur_nsub;
 2487   cur_nsub = preg->re_nsub++;
 2488 
 2489   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
 2490 
 2491   /* The subexpression may be a null string.  */
 2492   if (token->type == OP_CLOSE_SUBEXP)
 2493     tree = NULL;
 2494   else
 2495     {
 2496       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
 2497       if (__glibc_unlikely (*err == REG_NOERROR
 2498                 && token->type != OP_CLOSE_SUBEXP))
 2499     {
 2500       if (tree != NULL)
 2501         postorder (tree, free_tree, NULL);
 2502       *err = REG_EPAREN;
 2503     }
 2504       if (__glibc_unlikely (*err != REG_NOERROR))
 2505     return NULL;
 2506     }
 2507 
 2508   if (cur_nsub <= '9' - '1')
 2509     dfa->completed_bkref_map |= 1 << cur_nsub;
 2510 
 2511   tree = create_tree (dfa, tree, NULL, SUBEXP);
 2512   if (__glibc_unlikely (tree == NULL))
 2513     {
 2514       *err = REG_ESPACE;
 2515       return NULL;
 2516     }
 2517   tree->token.opr.idx = cur_nsub;
 2518   return tree;
 2519 }
 2520 
 2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
 2522 
 2523 static bin_tree_t *
 2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
 2525           re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
 2526 {
 2527   bin_tree_t *tree = NULL, *old_tree = NULL;
 2528   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
 2529   re_token_t start_token = *token;
 2530 
 2531   if (token->type == OP_OPEN_DUP_NUM)
 2532     {
 2533       end = 0;
 2534       start = fetch_number (regexp, token, syntax);
 2535       if (start == -1)
 2536     {
 2537       if (token->type == CHARACTER && token->opr.c == ',')
 2538         start = 0; /* We treat "{,m}" as "{0,m}".  */
 2539       else
 2540         {
 2541           *err = REG_BADBR; /* <re>{} is invalid.  */
 2542           return NULL;
 2543         }
 2544     }
 2545       if (__glibc_likely (start != -2))
 2546     {
 2547       /* We treat "{n}" as "{n,n}".  */
 2548       end = ((token->type == OP_CLOSE_DUP_NUM) ? start
 2549          : ((token->type == CHARACTER && token->opr.c == ',')
 2550             ? fetch_number (regexp, token, syntax) : -2));
 2551     }
 2552       if (__glibc_unlikely (start == -2 || end == -2))
 2553     {
 2554       /* Invalid sequence.  */
 2555       if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
 2556         {
 2557           if (token->type == END_OF_RE)
 2558         *err = REG_EBRACE;
 2559           else
 2560         *err = REG_BADBR;
 2561 
 2562           return NULL;
 2563         }
 2564 
 2565       /* If the syntax bit is set, rollback.  */
 2566       re_string_set_index (regexp, start_idx);
 2567       *token = start_token;
 2568       token->type = CHARACTER;
 2569       /* mb_partial and word_char bits should be already initialized by
 2570          peek_token.  */
 2571       return elem;
 2572     }
 2573 
 2574       if (__glibc_unlikely ((end != -1 && start > end)
 2575                 || token->type != OP_CLOSE_DUP_NUM))
 2576     {
 2577       /* First number greater than second.  */
 2578       *err = REG_BADBR;
 2579       return NULL;
 2580     }
 2581 
 2582       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
 2583     {
 2584       *err = REG_ESIZE;
 2585       return NULL;
 2586     }
 2587     }
 2588   else
 2589     {
 2590       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
 2591       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
 2592     }
 2593 
 2594   fetch_token (token, regexp, syntax);
 2595 
 2596   if (__glibc_unlikely (elem == NULL))
 2597     return NULL;
 2598   if (__glibc_unlikely (start == 0 && end == 0))
 2599     {
 2600       postorder (elem, free_tree, NULL);
 2601       return NULL;
 2602     }
 2603 
 2604   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
 2605   if (__glibc_unlikely (start > 0))
 2606     {
 2607       tree = elem;
 2608       for (i = 2; i <= start; ++i)
 2609     {
 2610       elem = duplicate_tree (elem, dfa);
 2611       tree = create_tree (dfa, tree, elem, CONCAT);
 2612       if (__glibc_unlikely (elem == NULL || tree == NULL))
 2613         goto parse_dup_op_espace;
 2614     }
 2615 
 2616       if (start == end)
 2617     return tree;
 2618 
 2619       /* Duplicate ELEM before it is marked optional.  */
 2620       elem = duplicate_tree (elem, dfa);
 2621       if (__glibc_unlikely (elem == NULL))
 2622         goto parse_dup_op_espace;
 2623       old_tree = tree;
 2624     }
 2625   else
 2626     old_tree = NULL;
 2627 
 2628   if (elem->token.type == SUBEXP)
 2629     {
 2630       uintptr_t subidx = elem->token.opr.idx;
 2631       postorder (elem, mark_opt_subexp, (void *) subidx);
 2632     }
 2633 
 2634   tree = create_tree (dfa, elem, NULL,
 2635               (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
 2636   if (__glibc_unlikely (tree == NULL))
 2637     goto parse_dup_op_espace;
 2638 
 2639   /* This loop is actually executed only when end != -1,
 2640      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
 2641      already created the start+1-th copy.  */
 2642   if (TYPE_SIGNED (Idx) || end != -1)
 2643     for (i = start + 2; i <= end; ++i)
 2644       {
 2645     elem = duplicate_tree (elem, dfa);
 2646     tree = create_tree (dfa, tree, elem, CONCAT);
 2647     if (__glibc_unlikely (elem == NULL || tree == NULL))
 2648       goto parse_dup_op_espace;
 2649 
 2650     tree = create_tree (dfa, tree, NULL, OP_ALT);
 2651     if (__glibc_unlikely (tree == NULL))
 2652       goto parse_dup_op_espace;
 2653       }
 2654 
 2655   if (old_tree)
 2656     tree = create_tree (dfa, old_tree, tree, CONCAT);
 2657 
 2658   return tree;
 2659 
 2660  parse_dup_op_espace:
 2661   *err = REG_ESPACE;
 2662   return NULL;
 2663 }
 2664 
 2665 /* Size of the names for collating symbol/equivalence_class/character_class.
 2666    I'm not sure, but maybe enough.  */
 2667 #define BRACKET_NAME_BUF_SIZE 32
 2668 
 2669 #ifndef _LIBC
 2670 
 2671 # ifdef RE_ENABLE_I18N
 2672 /* Convert the byte B to the corresponding wide character.  In a
 2673    unibyte locale, treat B as itself.  In a multibyte locale, return
 2674    WEOF if B is an encoding error.  */
 2675 static wint_t
 2676 parse_byte (unsigned char b, re_charset_t *mbcset)
 2677 {
 2678   return mbcset == NULL ? b : __btowc (b);
 2679 }
 2680 # endif
 2681 
 2682   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
 2683      Build the range expression which starts from START_ELEM, and ends
 2684      at END_ELEM.  The result are written to MBCSET and SBCSET.
 2685      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
 2686      mbcset->range_ends, is a pointer argument since we may
 2687      update it.  */
 2688 
 2689 static reg_errcode_t
 2690 # ifdef RE_ENABLE_I18N
 2691 build_range_exp (const reg_syntax_t syntax,
 2692                  bitset_t sbcset,
 2693                  re_charset_t *mbcset,
 2694                  Idx *range_alloc,
 2695                  const bracket_elem_t *start_elem,
 2696                  const bracket_elem_t *end_elem)
 2697 # else /* not RE_ENABLE_I18N */
 2698 build_range_exp (const reg_syntax_t syntax,
 2699                  bitset_t sbcset,
 2700                  const bracket_elem_t *start_elem,
 2701                  const bracket_elem_t *end_elem)
 2702 # endif /* not RE_ENABLE_I18N */
 2703 {
 2704   unsigned int start_ch, end_ch;
 2705   /* Equivalence Classes and Character Classes can't be a range start/end.  */
 2706   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
 2707             || start_elem->type == CHAR_CLASS
 2708             || end_elem->type == EQUIV_CLASS
 2709             || end_elem->type == CHAR_CLASS))
 2710     return REG_ERANGE;
 2711 
 2712   /* We can handle no multi character collating elements without libc
 2713      support.  */
 2714   if (__glibc_unlikely ((start_elem->type == COLL_SYM
 2715              && strlen ((char *) start_elem->opr.name) > 1)
 2716             || (end_elem->type == COLL_SYM
 2717                 && strlen ((char *) end_elem->opr.name) > 1)))
 2718     return REG_ECOLLATE;
 2719 
 2720 # ifdef RE_ENABLE_I18N
 2721   {
 2722     wchar_t wc;
 2723     wint_t start_wc;
 2724     wint_t end_wc;
 2725 
 2726     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
 2727         : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
 2728            : 0));
 2729     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
 2730           : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 2731          : 0));
 2732     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
 2733         ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
 2734     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
 2735           ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
 2736     if (start_wc == WEOF || end_wc == WEOF)
 2737       return REG_ECOLLATE;
 2738     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
 2739                    && start_wc > end_wc))
 2740       return REG_ERANGE;
 2741 
 2742     /* Got valid collation sequence values, add them as a new entry.
 2743        However, for !_LIBC we have no collation elements: if the
 2744        character set is single byte, the single byte character set
 2745        that we build below suffices.  parse_bracket_exp passes
 2746        no MBCSET if dfa->mb_cur_max == 1.  */
 2747     if (mbcset)
 2748       {
 2749     /* Check the space of the arrays.  */
 2750     if (__glibc_unlikely (*range_alloc == mbcset->nranges))
 2751       {
 2752         /* There is not enough space, need realloc.  */
 2753         wchar_t *new_array_start, *new_array_end;
 2754         Idx new_nranges;
 2755 
 2756         /* +1 in case of mbcset->nranges is 0.  */
 2757         new_nranges = 2 * mbcset->nranges + 1;
 2758         /* Use realloc since mbcset->range_starts and mbcset->range_ends
 2759            are NULL if *range_alloc == 0.  */
 2760         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
 2761                       new_nranges);
 2762         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
 2763                     new_nranges);
 2764 
 2765         if (__glibc_unlikely (new_array_start == NULL
 2766                   || new_array_end == NULL))
 2767           {
 2768         re_free (new_array_start);
 2769         re_free (new_array_end);
 2770         return REG_ESPACE;
 2771           }
 2772 
 2773         mbcset->range_starts = new_array_start;
 2774         mbcset->range_ends = new_array_end;
 2775         *range_alloc = new_nranges;
 2776       }
 2777 
 2778     mbcset->range_starts[mbcset->nranges] = start_wc;
 2779     mbcset->range_ends[mbcset->nranges++] = end_wc;
 2780       }
 2781 
 2782     /* Build the table for single byte characters.  */
 2783     for (wc = 0; wc < SBC_MAX; ++wc)
 2784       {
 2785     if (start_wc <= wc && wc <= end_wc)
 2786       bitset_set (sbcset, wc);
 2787       }
 2788   }
 2789 # else /* not RE_ENABLE_I18N */
 2790   {
 2791     unsigned int ch;
 2792     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
 2793         : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
 2794            : 0));
 2795     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
 2796           : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 2797          : 0));
 2798     if (start_ch > end_ch)
 2799       return REG_ERANGE;
 2800     /* Build the table for single byte characters.  */
 2801     for (ch = 0; ch < SBC_MAX; ++ch)
 2802       if (start_ch <= ch  && ch <= end_ch)
 2803     bitset_set (sbcset, ch);
 2804   }
 2805 # endif /* not RE_ENABLE_I18N */
 2806   return REG_NOERROR;
 2807 }
 2808 #endif /* not _LIBC */
 2809 
 2810 #ifndef _LIBC
 2811 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
 2812    Build the collating element which is represented by NAME.
 2813    The result are written to MBCSET and SBCSET.
 2814    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
 2815    pointer argument since we may update it.  */
 2816 
 2817 static reg_errcode_t
 2818 # ifdef RE_ENABLE_I18N
 2819 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
 2820             Idx *coll_sym_alloc, const unsigned char *name)
 2821 # else /* not RE_ENABLE_I18N */
 2822 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
 2823 # endif /* not RE_ENABLE_I18N */
 2824 {
 2825   size_t name_len = strlen ((const char *) name);
 2826   if (__glibc_unlikely (name_len != 1))
 2827     return REG_ECOLLATE;
 2828   else
 2829     {
 2830       bitset_set (sbcset, name[0]);
 2831       return REG_NOERROR;
 2832     }
 2833 }
 2834 #endif /* not _LIBC */
 2835 
 2836 /* This function parse bracket expression like "[abc]", "[a-c]",
 2837    "[[.a-a.]]" etc.  */
 2838 
 2839 static bin_tree_t *
 2840 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 2841            reg_syntax_t syntax, reg_errcode_t *err)
 2842 {
 2843 #ifdef _LIBC
 2844   const unsigned char *collseqmb;
 2845   const char *collseqwc;
 2846   uint32_t nrules;
 2847   int32_t table_size;
 2848   const int32_t *symb_table;
 2849   const unsigned char *extra;
 2850 
 2851   /* Local function for parse_bracket_exp used in _LIBC environment.
 2852      Seek the collating symbol entry corresponding to NAME.
 2853      Return the index of the symbol in the SYMB_TABLE,
 2854      or -1 if not found.  */
 2855 
 2856   auto inline int32_t
 2857   __attribute__ ((always_inline))
 2858   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
 2859     {
 2860       int32_t elem;
 2861 
 2862       for (elem = 0; elem < table_size; elem++)
 2863     if (symb_table[2 * elem] != 0)
 2864       {
 2865         int32_t idx = symb_table[2 * elem + 1];
 2866         /* Skip the name of collating element name.  */
 2867         idx += 1 + extra[idx];
 2868         if (/* Compare the length of the name.  */
 2869         name_len == extra[idx]
 2870         /* Compare the name.  */
 2871         && memcmp (name, &extra[idx + 1], name_len) == 0)
 2872           /* Yep, this is the entry.  */
 2873           return elem;
 2874       }
 2875       return -1;
 2876     }
 2877 
 2878   /* Local function for parse_bracket_exp used in _LIBC environment.
 2879      Look up the collation sequence value of BR_ELEM.
 2880      Return the value if succeeded, UINT_MAX otherwise.  */
 2881 
 2882   auto inline unsigned int
 2883   __attribute__ ((always_inline))
 2884   lookup_collation_sequence_value (bracket_elem_t *br_elem)
 2885     {
 2886       if (br_elem->type == SB_CHAR)
 2887     {
 2888       /*
 2889       if (MB_CUR_MAX == 1)
 2890       */
 2891       if (nrules == 0)
 2892         return collseqmb[br_elem->opr.ch];
 2893       else
 2894         {
 2895           wint_t wc = __btowc (br_elem->opr.ch);
 2896           return __collseq_table_lookup (collseqwc, wc);
 2897         }
 2898     }
 2899       else if (br_elem->type == MB_CHAR)
 2900     {
 2901       if (nrules != 0)
 2902         return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
 2903     }
 2904       else if (br_elem->type == COLL_SYM)
 2905     {
 2906       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
 2907       if (nrules != 0)
 2908         {
 2909           int32_t elem, idx;
 2910           elem = seek_collating_symbol_entry (br_elem->opr.name,
 2911                           sym_name_len);
 2912           if (elem != -1)
 2913         {
 2914           /* We found the entry.  */
 2915           idx = symb_table[2 * elem + 1];
 2916           /* Skip the name of collating element name.  */
 2917           idx += 1 + extra[idx];
 2918           /* Skip the byte sequence of the collating element.  */
 2919           idx += 1 + extra[idx];
 2920           /* Adjust for the alignment.  */
 2921           idx = (idx + 3) & ~3;
 2922           /* Skip the multibyte collation sequence value.  */
 2923           idx += sizeof (unsigned int);
 2924           /* Skip the wide char sequence of the collating element.  */
 2925           idx += sizeof (unsigned int) *
 2926             (1 + *(unsigned int *) (extra + idx));
 2927           /* Return the collation sequence value.  */
 2928           return *(unsigned int *) (extra + idx);
 2929         }
 2930           else if (sym_name_len == 1)
 2931         {
 2932           /* No valid character.  Match it as a single byte
 2933              character.  */
 2934           return collseqmb[br_elem->opr.name[0]];
 2935         }
 2936         }
 2937       else if (sym_name_len == 1)
 2938         return collseqmb[br_elem->opr.name[0]];
 2939     }
 2940       return UINT_MAX;
 2941     }
 2942 
 2943   /* Local function for parse_bracket_exp used in _LIBC environment.
 2944      Build the range expression which starts from START_ELEM, and ends
 2945      at END_ELEM.  The result are written to MBCSET and SBCSET.
 2946      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
 2947      mbcset->range_ends, is a pointer argument since we may
 2948      update it.  */
 2949 
 2950   auto inline reg_errcode_t
 2951   __attribute__ ((always_inline))
 2952   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
 2953            bracket_elem_t *start_elem, bracket_elem_t *end_elem)
 2954     {
 2955       unsigned int ch;
 2956       uint32_t start_collseq;
 2957       uint32_t end_collseq;
 2958 
 2959       /* Equivalence Classes and Character Classes can't be a range
 2960      start/end.  */
 2961       if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
 2962                 || start_elem->type == CHAR_CLASS
 2963                 || end_elem->type == EQUIV_CLASS
 2964                 || end_elem->type == CHAR_CLASS))
 2965     return REG_ERANGE;
 2966 
 2967       /* FIXME: Implement rational ranges here, too.  */
 2968       start_collseq = lookup_collation_sequence_value (start_elem);
 2969       end_collseq = lookup_collation_sequence_value (end_elem);
 2970       /* Check start/end collation sequence values.  */
 2971       if (__glibc_unlikely (start_collseq == UINT_MAX
 2972                 || end_collseq == UINT_MAX))
 2973     return REG_ECOLLATE;
 2974       if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
 2975                 && start_collseq > end_collseq))
 2976     return REG_ERANGE;
 2977 
 2978       /* Got valid collation sequence values, add them as a new entry.
 2979      However, if we have no collation elements, and the character set
 2980      is single byte, the single byte character set that we
 2981      build below suffices. */
 2982       if (nrules > 0 || dfa->mb_cur_max > 1)
 2983     {
 2984       /* Check the space of the arrays.  */
 2985       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
 2986         {
 2987           /* There is not enough space, need realloc.  */
 2988           uint32_t *new_array_start;
 2989           uint32_t *new_array_end;
 2990           Idx new_nranges;
 2991 
 2992           /* +1 in case of mbcset->nranges is 0.  */
 2993           new_nranges = 2 * mbcset->nranges + 1;
 2994           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
 2995                         new_nranges);
 2996           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
 2997                       new_nranges);
 2998 
 2999           if (__glibc_unlikely (new_array_start == NULL
 3000                     || new_array_end == NULL))
 3001         return REG_ESPACE;
 3002 
 3003           mbcset->range_starts = new_array_start;
 3004           mbcset->range_ends = new_array_end;
 3005           *range_alloc = new_nranges;
 3006         }
 3007 
 3008       mbcset->range_starts[mbcset->nranges] = start_collseq;
 3009       mbcset->range_ends[mbcset->nranges++] = end_collseq;
 3010     }
 3011 
 3012       /* Build the table for single byte characters.  */
 3013       for (ch = 0; ch < SBC_MAX; ch++)
 3014     {
 3015       uint32_t ch_collseq;
 3016       /*
 3017       if (MB_CUR_MAX == 1)
 3018       */
 3019       if (nrules == 0)
 3020         ch_collseq = collseqmb[ch];
 3021       else
 3022         ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
 3023       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
 3024         bitset_set (sbcset, ch);
 3025     }
 3026       return REG_NOERROR;
 3027     }
 3028 
 3029   /* Local function for parse_bracket_exp used in _LIBC environment.
 3030      Build the collating element which is represented by NAME.
 3031      The result are written to MBCSET and SBCSET.
 3032      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
 3033      pointer argument since we may update it.  */
 3034 
 3035   auto inline reg_errcode_t
 3036   __attribute__ ((always_inline))
 3037   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
 3038               Idx *coll_sym_alloc, const unsigned char *name)
 3039     {
 3040       int32_t elem, idx;
 3041       size_t name_len = strlen ((const char *) name);
 3042       if (nrules != 0)
 3043     {
 3044       elem = seek_collating_symbol_entry (name, name_len);
 3045       if (elem != -1)
 3046         {
 3047           /* We found the entry.  */
 3048           idx = symb_table[2 * elem + 1];
 3049           /* Skip the name of collating element name.  */
 3050           idx += 1 + extra[idx];
 3051         }
 3052       else if (name_len == 1)
 3053         {
 3054           /* No valid character, treat it as a normal
 3055          character.  */
 3056           bitset_set (sbcset, name[0]);
 3057           return REG_NOERROR;
 3058         }
 3059       else
 3060         return REG_ECOLLATE;
 3061 
 3062       /* Got valid collation sequence, add it as a new entry.  */
 3063       /* Check the space of the arrays.  */
 3064       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
 3065         {
 3066           /* Not enough, realloc it.  */
 3067           /* +1 in case of mbcset->ncoll_syms is 0.  */
 3068           Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
 3069           /* Use realloc since mbcset->coll_syms is NULL
 3070          if *alloc == 0.  */
 3071           int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
 3072                            new_coll_sym_alloc);
 3073           if (__glibc_unlikely (new_coll_syms == NULL))
 3074         return REG_ESPACE;
 3075           mbcset->coll_syms = new_coll_syms;
 3076           *coll_sym_alloc = new_coll_sym_alloc;
 3077         }
 3078       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
 3079       return REG_NOERROR;
 3080     }
 3081       else
 3082     {
 3083       if (__glibc_unlikely (name_len != 1))
 3084         return REG_ECOLLATE;
 3085       else
 3086         {
 3087           bitset_set (sbcset, name[0]);
 3088           return REG_NOERROR;
 3089         }
 3090     }
 3091     }
 3092 #endif
 3093 
 3094   re_token_t br_token;
 3095   re_bitset_ptr_t sbcset;
 3096 #ifdef RE_ENABLE_I18N
 3097   re_charset_t *mbcset;
 3098   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
 3099   Idx equiv_class_alloc = 0, char_class_alloc = 0;
 3100 #endif /* not RE_ENABLE_I18N */
 3101   bool non_match = false;
 3102   bin_tree_t *work_tree;
 3103   int token_len;
 3104   bool first_round = true;
 3105 #ifdef _LIBC
 3106   collseqmb = (const unsigned char *)
 3107     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
 3108   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 3109   if (nrules)
 3110     {
 3111       /*
 3112       if (MB_CUR_MAX > 1)
 3113       */
 3114       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
 3115       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
 3116       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 3117                           _NL_COLLATE_SYMB_TABLEMB);
 3118       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 3119                            _NL_COLLATE_SYMB_EXTRAMB);
 3120     }
 3121 #endif
 3122   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 3123 #ifdef RE_ENABLE_I18N
 3124   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
 3125 #endif /* RE_ENABLE_I18N */
 3126 #ifdef RE_ENABLE_I18N
 3127   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
 3128 #else
 3129   if (__glibc_unlikely (sbcset == NULL))
 3130 #endif /* RE_ENABLE_I18N */
 3131     {
 3132       re_free (sbcset);
 3133 #ifdef RE_ENABLE_I18N
 3134       re_free (mbcset);
 3135 #endif
 3136       *err = REG_ESPACE;
 3137       return NULL;
 3138     }
 3139 
 3140   token_len = peek_token_bracket (token, regexp, syntax);
 3141   if (__glibc_unlikely (token->type == END_OF_RE))
 3142     {
 3143       *err = REG_BADPAT;
 3144       goto parse_bracket_exp_free_return;
 3145     }
 3146   if (token->type == OP_NON_MATCH_LIST)
 3147     {
 3148 #ifdef RE_ENABLE_I18N
 3149       mbcset->non_match = 1;
 3150 #endif /* not RE_ENABLE_I18N */
 3151       non_match = true;
 3152       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
 3153     bitset_set (sbcset, '\n');
 3154       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 3155       token_len = peek_token_bracket (token, regexp, syntax);
 3156       if (__glibc_unlikely (token->type == END_OF_RE))
 3157     {
 3158       *err = REG_BADPAT;
 3159       goto parse_bracket_exp_free_return;
 3160     }
 3161     }
 3162 
 3163   /* We treat the first ']' as a normal character.  */
 3164   if (token->type == OP_CLOSE_BRACKET)
 3165     token->type = CHARACTER;
 3166 
 3167   while (1)
 3168     {
 3169       bracket_elem_t start_elem, end_elem;
 3170       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
 3171       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
 3172       reg_errcode_t ret;
 3173       int token_len2 = 0;
 3174       bool is_range_exp = false;
 3175       re_token_t token2;
 3176 
 3177       start_elem.opr.name = start_name_buf;
 3178       start_elem.type = COLL_SYM;
 3179       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
 3180                    syntax, first_round);
 3181       if (__glibc_unlikely (ret != REG_NOERROR))
 3182     {
 3183       *err = ret;
 3184       goto parse_bracket_exp_free_return;
 3185     }
 3186       first_round = false;
 3187 
 3188       /* Get information about the next token.  We need it in any case.  */
 3189       token_len = peek_token_bracket (token, regexp, syntax);
 3190 
 3191       /* Do not check for ranges if we know they are not allowed.  */
 3192       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
 3193     {
 3194       if (__glibc_unlikely (token->type == END_OF_RE))
 3195         {
 3196           *err = REG_EBRACK;
 3197           goto parse_bracket_exp_free_return;
 3198         }
 3199       if (token->type == OP_CHARSET_RANGE)
 3200         {
 3201           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
 3202           token_len2 = peek_token_bracket (&token2, regexp, syntax);
 3203           if (__glibc_unlikely (token2.type == END_OF_RE))
 3204         {
 3205           *err = REG_EBRACK;
 3206           goto parse_bracket_exp_free_return;
 3207         }
 3208           if (token2.type == OP_CLOSE_BRACKET)
 3209         {
 3210           /* We treat the last '-' as a normal character.  */
 3211           re_string_skip_bytes (regexp, -token_len);
 3212           token->type = CHARACTER;
 3213         }
 3214           else
 3215         is_range_exp = true;
 3216         }
 3217     }
 3218 
 3219       if (is_range_exp == true)
 3220     {
 3221       end_elem.opr.name = end_name_buf;
 3222       end_elem.type = COLL_SYM;
 3223       ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
 3224                        dfa, syntax, true);
 3225       if (__glibc_unlikely (ret != REG_NOERROR))
 3226         {
 3227           *err = ret;
 3228           goto parse_bracket_exp_free_return;
 3229         }
 3230 
 3231       token_len = peek_token_bracket (token, regexp, syntax);
 3232 
 3233 #ifdef _LIBC
 3234       *err = build_range_exp (sbcset, mbcset, &range_alloc,
 3235                   &start_elem, &end_elem);
 3236 #else
 3237 # ifdef RE_ENABLE_I18N
 3238       *err = build_range_exp (syntax, sbcset,
 3239                   dfa->mb_cur_max > 1 ? mbcset : NULL,
 3240                   &range_alloc, &start_elem, &end_elem);
 3241 # else
 3242       *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
 3243 # endif
 3244 #endif /* RE_ENABLE_I18N */
 3245       if (__glibc_unlikely (*err != REG_NOERROR))
 3246         goto parse_bracket_exp_free_return;
 3247     }
 3248       else
 3249     {
 3250       switch (start_elem.type)
 3251         {
 3252         case SB_CHAR:
 3253           bitset_set (sbcset, start_elem.opr.ch);
 3254           break;
 3255 #ifdef RE_ENABLE_I18N
 3256         case MB_CHAR:
 3257           /* Check whether the array has enough space.  */
 3258           if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
 3259         {
 3260           wchar_t *new_mbchars;
 3261           /* Not enough, realloc it.  */
 3262           /* +1 in case of mbcset->nmbchars is 0.  */
 3263           mbchar_alloc = 2 * mbcset->nmbchars + 1;
 3264           /* Use realloc since array is NULL if *alloc == 0.  */
 3265           new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
 3266                         mbchar_alloc);
 3267           if (__glibc_unlikely (new_mbchars == NULL))
 3268             goto parse_bracket_exp_espace;
 3269           mbcset->mbchars = new_mbchars;
 3270         }
 3271           mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
 3272           break;
 3273 #endif /* RE_ENABLE_I18N */
 3274         case EQUIV_CLASS:
 3275           *err = build_equiv_class (sbcset,
 3276 #ifdef RE_ENABLE_I18N
 3277                     mbcset, &equiv_class_alloc,
 3278 #endif /* RE_ENABLE_I18N */
 3279                     start_elem.opr.name);
 3280           if (__glibc_unlikely (*err != REG_NOERROR))
 3281         goto parse_bracket_exp_free_return;
 3282           break;
 3283         case COLL_SYM:
 3284           *err = build_collating_symbol (sbcset,
 3285 #ifdef RE_ENABLE_I18N
 3286                          mbcset, &coll_sym_alloc,
 3287 #endif /* RE_ENABLE_I18N */
 3288                          start_elem.opr.name);
 3289           if (__glibc_unlikely (*err != REG_NOERROR))
 3290         goto parse_bracket_exp_free_return;
 3291           break;
 3292         case CHAR_CLASS:
 3293           *err = build_charclass (regexp->trans, sbcset,
 3294 #ifdef RE_ENABLE_I18N
 3295                       mbcset, &char_class_alloc,
 3296 #endif /* RE_ENABLE_I18N */
 3297                       (const char *) start_elem.opr.name,
 3298                       syntax);
 3299           if (__glibc_unlikely (*err != REG_NOERROR))
 3300            goto parse_bracket_exp_free_return;
 3301           break;
 3302         default:
 3303           DEBUG_ASSERT (false);
 3304           break;
 3305         }
 3306     }
 3307       if (__glibc_unlikely (token->type == END_OF_RE))
 3308     {
 3309       *err = REG_EBRACK;
 3310       goto parse_bracket_exp_free_return;
 3311     }
 3312       if (token->type == OP_CLOSE_BRACKET)
 3313     break;
 3314     }
 3315 
 3316   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 3317 
 3318   /* If it is non-matching list.  */
 3319   if (non_match)
 3320     bitset_not (sbcset);
 3321 
 3322 #ifdef RE_ENABLE_I18N
 3323   /* Ensure only single byte characters are set.  */
 3324   if (dfa->mb_cur_max > 1)
 3325     bitset_mask (sbcset, dfa->sb_char);
 3326 
 3327   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
 3328       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
 3329                              || mbcset->non_match)))
 3330     {
 3331       bin_tree_t *mbc_tree;
 3332       int sbc_idx;
 3333       /* Build a tree for complex bracket.  */
 3334       dfa->has_mb_node = 1;
 3335       br_token.type = COMPLEX_BRACKET;
 3336       br_token.opr.mbcset = mbcset;
 3337       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
 3338       if (__glibc_unlikely (mbc_tree == NULL))
 3339     goto parse_bracket_exp_espace;
 3340       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
 3341     if (sbcset[sbc_idx])
 3342       break;
 3343       /* If there are no bits set in sbcset, there is no point
 3344      of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
 3345       if (sbc_idx < BITSET_WORDS)
 3346     {
 3347       /* Build a tree for simple bracket.  */
 3348       br_token.type = SIMPLE_BRACKET;
 3349       br_token.opr.sbcset = sbcset;
 3350       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
 3351       if (__glibc_unlikely (work_tree == NULL))
 3352         goto parse_bracket_exp_espace;
 3353 
 3354       /* Then join them by ALT node.  */
 3355       work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
 3356       if (__glibc_unlikely (work_tree == NULL))
 3357         goto parse_bracket_exp_espace;
 3358     }
 3359       else
 3360     {
 3361       re_free (sbcset);
 3362       work_tree = mbc_tree;
 3363     }
 3364     }
 3365   else
 3366 #endif /* not RE_ENABLE_I18N */
 3367     {
 3368 #ifdef RE_ENABLE_I18N
 3369       free_charset (mbcset);
 3370 #endif
 3371       /* Build a tree for simple bracket.  */
 3372       br_token.type = SIMPLE_BRACKET;
 3373       br_token.opr.sbcset = sbcset;
 3374       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
 3375       if (__glibc_unlikely (work_tree == NULL))
 3376     goto parse_bracket_exp_espace;
 3377     }
 3378   return work_tree;
 3379 
 3380  parse_bracket_exp_espace:
 3381   *err = REG_ESPACE;
 3382  parse_bracket_exp_free_return:
 3383   re_free (sbcset);
 3384 #ifdef RE_ENABLE_I18N
 3385   free_charset (mbcset);
 3386 #endif /* RE_ENABLE_I18N */
 3387   return NULL;
 3388 }
 3389 
 3390 /* Parse an element in the bracket expression.  */
 3391 
 3392 static reg_errcode_t
 3393 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
 3394                re_token_t *token, int token_len, re_dfa_t *dfa,
 3395                reg_syntax_t syntax, bool accept_hyphen)
 3396 {
 3397 #ifdef RE_ENABLE_I18N
 3398   int cur_char_size;
 3399   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
 3400   if (cur_char_size > 1)
 3401     {
 3402       elem->type = MB_CHAR;
 3403       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
 3404       re_string_skip_bytes (regexp, cur_char_size);
 3405       return REG_NOERROR;
 3406     }
 3407 #endif /* RE_ENABLE_I18N */
 3408   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 3409   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
 3410       || token->type == OP_OPEN_EQUIV_CLASS)
 3411     return parse_bracket_symbol (elem, regexp, token);
 3412   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
 3413     {
 3414       /* A '-' must only appear as anything but a range indicator before
 3415      the closing bracket.  Everything else is an error.  */
 3416       re_token_t token2;
 3417       (void) peek_token_bracket (&token2, regexp, syntax);
 3418       if (token2.type != OP_CLOSE_BRACKET)
 3419     /* The actual error value is not standardized since this whole
 3420        case is undefined.  But ERANGE makes good sense.  */
 3421     return REG_ERANGE;
 3422     }
 3423   elem->type = SB_CHAR;
 3424   elem->opr.ch = token->opr.c;
 3425   return REG_NOERROR;
 3426 }
 3427 
 3428 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
 3429    such as [:<character_class>:], [.<collating_element>.], and
 3430    [=<equivalent_class>=].  */
 3431 
 3432 static reg_errcode_t
 3433 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
 3434               re_token_t *token)
 3435 {
 3436   unsigned char ch, delim = token->opr.c;
 3437   int i = 0;
 3438   if (re_string_eoi(regexp))
 3439     return REG_EBRACK;
 3440   for (;; ++i)
 3441     {
 3442       if (i >= BRACKET_NAME_BUF_SIZE)
 3443     return REG_EBRACK;
 3444       if (token->type == OP_OPEN_CHAR_CLASS)
 3445     ch = re_string_fetch_byte_case (regexp);
 3446       else
 3447     ch = re_string_fetch_byte (regexp);
 3448       if (re_string_eoi(regexp))
 3449     return REG_EBRACK;
 3450       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
 3451     break;
 3452       elem->opr.name[i] = ch;
 3453     }
 3454   re_string_skip_bytes (regexp, 1);
 3455   elem->opr.name[i] = '\0';
 3456   switch (token->type)
 3457     {
 3458     case OP_OPEN_COLL_ELEM:
 3459       elem->type = COLL_SYM;
 3460       break;
 3461     case OP_OPEN_EQUIV_CLASS:
 3462       elem->type = EQUIV_CLASS;
 3463       break;
 3464     case OP_OPEN_CHAR_CLASS:
 3465       elem->type = CHAR_CLASS;
 3466       break;
 3467     default:
 3468       break;
 3469     }
 3470   return REG_NOERROR;
 3471 }
 3472 
 3473   /* Helper function for parse_bracket_exp.
 3474      Build the equivalence class which is represented by NAME.
 3475      The result are written to MBCSET and SBCSET.
 3476      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
 3477      is a pointer argument since we may update it.  */
 3478 
 3479 static reg_errcode_t
 3480 #ifdef RE_ENABLE_I18N
 3481 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
 3482            Idx *equiv_class_alloc, const unsigned char *name)
 3483 #else /* not RE_ENABLE_I18N */
 3484 build_equiv_class (bitset_t sbcset, const unsigned char *name)
 3485 #endif /* not RE_ENABLE_I18N */
 3486 {
 3487 #ifdef _LIBC
 3488   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 3489   if (nrules != 0)
 3490     {
 3491       const int32_t *table, *indirect;
 3492       const unsigned char *weights, *extra, *cp;
 3493       unsigned char char_buf[2];
 3494       int32_t idx1, idx2;
 3495       unsigned int ch;
 3496       size_t len;
 3497       /* Calculate the index for equivalence class.  */
 3498       cp = name;
 3499       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 3500       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 3501                            _NL_COLLATE_WEIGHTMB);
 3502       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 3503                            _NL_COLLATE_EXTRAMB);
 3504       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 3505                         _NL_COLLATE_INDIRECTMB);
 3506       idx1 = findidx (table, indirect, extra, &cp, -1);
 3507       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
 3508     /* This isn't a valid character.  */
 3509     return REG_ECOLLATE;
 3510 
 3511       /* Build single byte matching table for this equivalence class.  */
 3512       len = weights[idx1 & 0xffffff];
 3513       for (ch = 0; ch < SBC_MAX; ++ch)
 3514     {
 3515       char_buf[0] = ch;
 3516       cp = char_buf;
 3517       idx2 = findidx (table, indirect, extra, &cp, 1);
 3518 /*
 3519       idx2 = table[ch];
 3520 */
 3521       if (idx2 == 0)
 3522         /* This isn't a valid character.  */
 3523         continue;
 3524       /* Compare only if the length matches and the collation rule
 3525          index is the same.  */
 3526       if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
 3527           && memcmp (weights + (idx1 & 0xffffff) + 1,
 3528              weights + (idx2 & 0xffffff) + 1, len) == 0)
 3529         bitset_set (sbcset, ch);
 3530     }
 3531       /* Check whether the array has enough space.  */
 3532       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
 3533     {
 3534       /* Not enough, realloc it.  */
 3535       /* +1 in case of mbcset->nequiv_classes is 0.  */
 3536       Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
 3537       /* Use realloc since the array is NULL if *alloc == 0.  */
 3538       int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
 3539                            int32_t,
 3540                            new_equiv_class_alloc);
 3541       if (__glibc_unlikely (new_equiv_classes == NULL))
 3542         return REG_ESPACE;
 3543       mbcset->equiv_classes = new_equiv_classes;
 3544       *equiv_class_alloc = new_equiv_class_alloc;
 3545     }
 3546       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
 3547     }
 3548   else
 3549 #endif /* _LIBC */
 3550     {
 3551       if (__glibc_unlikely (strlen ((const char *) name) != 1))
 3552     return REG_ECOLLATE;
 3553       bitset_set (sbcset, *name);
 3554     }
 3555   return REG_NOERROR;
 3556 }
 3557 
 3558   /* Helper function for parse_bracket_exp.
 3559      Build the character class which is represented by NAME.
 3560      The result are written to MBCSET and SBCSET.
 3561      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
 3562      is a pointer argument since we may update it.  */
 3563 
 3564 static reg_errcode_t
 3565 #ifdef RE_ENABLE_I18N
 3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
 3567          re_charset_t *mbcset, Idx *char_class_alloc,
 3568          const char *class_name, reg_syntax_t syntax)
 3569 #else /* not RE_ENABLE_I18N */
 3570 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
 3571          const char *class_name, reg_syntax_t syntax)
 3572 #endif /* not RE_ENABLE_I18N */
 3573 {
 3574   int i;
 3575   const char *name = class_name;
 3576 
 3577   /* In case of REG_ICASE "upper" and "lower" match the both of
 3578      upper and lower cases.  */
 3579   if ((syntax & RE_ICASE)
 3580       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
 3581     name = "alpha";
 3582 
 3583 #ifdef RE_ENABLE_I18N
 3584   /* Check the space of the arrays.  */
 3585   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
 3586     {
 3587       /* Not enough, realloc it.  */
 3588       /* +1 in case of mbcset->nchar_classes is 0.  */
 3589       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
 3590       /* Use realloc since array is NULL if *alloc == 0.  */
 3591       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
 3592                            new_char_class_alloc);
 3593       if (__glibc_unlikely (new_char_classes == NULL))
 3594     return REG_ESPACE;
 3595       mbcset->char_classes = new_char_classes;
 3596       *char_class_alloc = new_char_class_alloc;
 3597     }
 3598   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
 3599 #endif /* RE_ENABLE_I18N */
 3600 
 3601 #define BUILD_CHARCLASS_LOOP(ctype_func)    \
 3602   do {                      \
 3603     if (__glibc_unlikely (trans != NULL))           \
 3604       {                     \
 3605     for (i = 0; i < SBC_MAX; ++i)       \
 3606       if (ctype_func (i))           \
 3607         bitset_set (sbcset, trans[i]);  \
 3608       }                     \
 3609     else                    \
 3610       {                     \
 3611     for (i = 0; i < SBC_MAX; ++i)       \
 3612       if (ctype_func (i))           \
 3613         bitset_set (sbcset, i);     \
 3614       }                     \
 3615   } while (0)
 3616 
 3617   if (strcmp (name, "alnum") == 0)
 3618     BUILD_CHARCLASS_LOOP (isalnum);
 3619   else if (strcmp (name, "cntrl") == 0)
 3620     BUILD_CHARCLASS_LOOP (iscntrl);
 3621   else if (strcmp (name, "lower") == 0)
 3622     BUILD_CHARCLASS_LOOP (islower);
 3623   else if (strcmp (name, "space") == 0)
 3624     BUILD_CHARCLASS_LOOP (isspace);
 3625   else if (strcmp (name, "alpha") == 0)
 3626     BUILD_CHARCLASS_LOOP (isalpha);
 3627   else if (strcmp (name, "digit") == 0)
 3628     BUILD_CHARCLASS_LOOP (isdigit);
 3629   else if (strcmp (name, "print") == 0)
 3630     BUILD_CHARCLASS_LOOP (isprint);
 3631   else if (strcmp (name, "upper") == 0)
 3632     BUILD_CHARCLASS_LOOP (isupper);
 3633   else if (strcmp (name, "blank") == 0)
 3634     BUILD_CHARCLASS_LOOP (isblank);
 3635   else if (strcmp (name, "graph") == 0)
 3636     BUILD_CHARCLASS_LOOP (isgraph);
 3637   else if (strcmp (name, "punct") == 0)
 3638     BUILD_CHARCLASS_LOOP (ispunct);
 3639   else if (strcmp (name, "xdigit") == 0)
 3640     BUILD_CHARCLASS_LOOP (isxdigit);
 3641   else
 3642     return REG_ECTYPE;
 3643 
 3644   return REG_NOERROR;
 3645 }
 3646 
 3647 static bin_tree_t *
 3648 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
 3649             const char *class_name,
 3650             const char *extra, bool non_match,
 3651             reg_errcode_t *err)
 3652 {
 3653   re_bitset_ptr_t sbcset;
 3654 #ifdef RE_ENABLE_I18N
 3655   re_charset_t *mbcset;
 3656   Idx alloc = 0;
 3657 #endif /* not RE_ENABLE_I18N */
 3658   reg_errcode_t ret;
 3659   bin_tree_t *tree;
 3660 
 3661   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 3662   if (__glibc_unlikely (sbcset == NULL))
 3663     {
 3664       *err = REG_ESPACE;
 3665       return NULL;
 3666     }
 3667 #ifdef RE_ENABLE_I18N
 3668   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
 3669   if (__glibc_unlikely (mbcset == NULL))
 3670     {
 3671       re_free (sbcset);
 3672       *err = REG_ESPACE;
 3673       return NULL;
 3674     }
 3675   mbcset->non_match = non_match;
 3676 #endif /* RE_ENABLE_I18N */
 3677 
 3678   /* We don't care the syntax in this case.  */
 3679   ret = build_charclass (trans, sbcset,
 3680 #ifdef RE_ENABLE_I18N
 3681              mbcset, &alloc,
 3682 #endif /* RE_ENABLE_I18N */
 3683              class_name, 0);
 3684 
 3685   if (__glibc_unlikely (ret != REG_NOERROR))
 3686     {
 3687       re_free (sbcset);
 3688 #ifdef RE_ENABLE_I18N
 3689       free_charset (mbcset);
 3690 #endif /* RE_ENABLE_I18N */
 3691       *err = ret;
 3692       return NULL;
 3693     }
 3694   /* \w match '_' also.  */
 3695   for (; *extra; extra++)
 3696     bitset_set (sbcset, *extra);
 3697 
 3698   /* If it is non-matching list.  */
 3699   if (non_match)
 3700     bitset_not (sbcset);
 3701 
 3702 #ifdef RE_ENABLE_I18N
 3703   /* Ensure only single byte characters are set.  */
 3704   if (dfa->mb_cur_max > 1)
 3705     bitset_mask (sbcset, dfa->sb_char);
 3706 #endif
 3707 
 3708   /* Build a tree for simple bracket.  */
 3709   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
 3710   tree = create_token_tree (dfa, NULL, NULL, &br_token);
 3711   if (__glibc_unlikely (tree == NULL))
 3712     goto build_word_op_espace;
 3713 
 3714 #ifdef RE_ENABLE_I18N
 3715   if (dfa->mb_cur_max > 1)
 3716     {
 3717       bin_tree_t *mbc_tree;
 3718       /* Build a tree for complex bracket.  */
 3719       br_token.type = COMPLEX_BRACKET;
 3720       br_token.opr.mbcset = mbcset;
 3721       dfa->has_mb_node = 1;
 3722       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
 3723       if (__glibc_unlikely (mbc_tree == NULL))
 3724     goto build_word_op_espace;
 3725       /* Then join them by ALT node.  */
 3726       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
 3727       if (__glibc_likely (mbc_tree != NULL))
 3728     return tree;
 3729     }
 3730   else
 3731     {
 3732       free_charset (mbcset);
 3733       return tree;
 3734     }
 3735 #else /* not RE_ENABLE_I18N */
 3736   return tree;
 3737 #endif /* not RE_ENABLE_I18N */
 3738 
 3739  build_word_op_espace:
 3740   re_free (sbcset);
 3741 #ifdef RE_ENABLE_I18N
 3742   free_charset (mbcset);
 3743 #endif /* RE_ENABLE_I18N */
 3744   *err = REG_ESPACE;
 3745   return NULL;
 3746 }
 3747 
 3748 /* This is intended for the expressions like "a{1,3}".
 3749    Fetch a number from 'input', and return the number.
 3750    Return -1 if the number field is empty like "{,1}".
 3751    Return RE_DUP_MAX + 1 if the number field is too large.
 3752    Return -2 if an error occurred.  */
 3753 
 3754 static Idx
 3755 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
 3756 {
 3757   Idx num = -1;
 3758   unsigned char c;
 3759   while (1)
 3760     {
 3761       fetch_token (token, input, syntax);
 3762       c = token->opr.c;
 3763       if (__glibc_unlikely (token->type == END_OF_RE))
 3764     return -2;
 3765       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
 3766     break;
 3767       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
 3768          ? -2
 3769          : num == -1
 3770          ? c - '0'
 3771          : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
 3772     }
 3773   return num;
 3774 }
 3775 
 3776 #ifdef RE_ENABLE_I18N
 3777 static void
 3778 free_charset (re_charset_t *cset)
 3779 {
 3780   re_free (cset->mbchars);
 3781 # ifdef _LIBC
 3782   re_free (cset->coll_syms);
 3783   re_free (cset->equiv_classes);
 3784 # endif
 3785   re_free (cset->range_starts);
 3786   re_free (cset->range_ends);
 3787   re_free (cset->char_classes);
 3788   re_free (cset);
 3789 }
 3790 #endif /* RE_ENABLE_I18N */
 3791 
 3792 /* Functions for binary tree operation.  */
 3793 
 3794 /* Create a tree node.  */
 3795 
 3796 static bin_tree_t *
 3797 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
 3798          re_token_type_t type)
 3799 {
 3800   re_token_t t = { .type = type };
 3801   return create_token_tree (dfa, left, right, &t);
 3802 }
 3803 
 3804 static bin_tree_t *
 3805 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
 3806            const re_token_t *token)
 3807 {
 3808   bin_tree_t *tree;
 3809   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
 3810     {
 3811       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
 3812 
 3813       if (storage == NULL)
 3814     return NULL;
 3815       storage->next = dfa->str_tree_storage;
 3816       dfa->str_tree_storage = storage;
 3817       dfa->str_tree_storage_idx = 0;
 3818     }
 3819   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
 3820 
 3821   tree->parent = NULL;
 3822   tree->left = left;
 3823   tree->right = right;
 3824   tree->token = *token;
 3825   tree->token.duplicated = 0;
 3826   tree->token.opt_subexp = 0;
 3827   tree->first = NULL;
 3828   tree->next = NULL;
 3829   tree->node_idx = -1;
 3830 
 3831   if (left != NULL)
 3832     left->parent = tree;
 3833   if (right != NULL)
 3834     right->parent = tree;
 3835   return tree;
 3836 }
 3837 
 3838 /* Mark the tree SRC as an optional subexpression.
 3839    To be called from preorder or postorder.  */
 3840 
 3841 static reg_errcode_t
 3842 mark_opt_subexp (void *extra, bin_tree_t *node)
 3843 {
 3844   Idx idx = (uintptr_t) extra;
 3845   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
 3846     node->token.opt_subexp = 1;
 3847 
 3848   return REG_NOERROR;
 3849 }
 3850 
 3851 /* Free the allocated memory inside NODE. */
 3852 
 3853 static void
 3854 free_token (re_token_t *node)
 3855 {
 3856 #ifdef RE_ENABLE_I18N
 3857   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 3858     free_charset (node->opr.mbcset);
 3859   else
 3860 #endif /* RE_ENABLE_I18N */
 3861     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 3862       re_free (node->opr.sbcset);
 3863 }
 3864 
 3865 /* Worker function for tree walking.  Free the allocated memory inside NODE
 3866    and its children. */
 3867 
 3868 static reg_errcode_t
 3869 free_tree (void *extra, bin_tree_t *node)
 3870 {
 3871   free_token (&node->token);
 3872   return REG_NOERROR;
 3873 }
 3874 
 3875 
 3876 /* Duplicate the node SRC, and return new node.  This is a preorder
 3877    visit similar to the one implemented by the generic visitor, but
 3878    we need more infrastructure to maintain two parallel trees --- so,
 3879    it's easier to duplicate.  */
 3880 
 3881 static bin_tree_t *
 3882 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
 3883 {
 3884   const bin_tree_t *node;
 3885   bin_tree_t *dup_root;
 3886   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
 3887 
 3888   for (node = root; ; )
 3889     {
 3890       /* Create a new tree and link it back to the current parent.  */
 3891       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
 3892       if (*p_new == NULL)
 3893     return NULL;
 3894       (*p_new)->parent = dup_node;
 3895       (*p_new)->token.duplicated = 1;
 3896       dup_node = *p_new;
 3897 
 3898       /* Go to the left node, or up and to the right.  */
 3899       if (node->left)
 3900     {
 3901       node = node->left;
 3902       p_new = &dup_node->left;
 3903     }
 3904       else
 3905     {
 3906       const bin_tree_t *prev = NULL;
 3907       while (node->right == prev || node->right == NULL)
 3908         {
 3909           prev = node;
 3910           node = node->parent;
 3911           dup_node = dup_node->parent;
 3912           if (!node)
 3913         return dup_root;
 3914         }
 3915       node = node->right;
 3916       p_new = &dup_node->right;
 3917     }
 3918     }
 3919 }