"Fossies" - the Fresh Open Source Software Archive

Member "gawk-5.1.0/re.c" (6 Feb 2020, 16596 Bytes) of package /linux/misc/gawk-5.1.0.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "re.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /*
    2  * re.c - compile regular expressions.
    3  */
    4 
    5 /*
    6  * Copyright (C) 1991-2019 the Free Software Foundation, Inc.
    7  *
    8  * This file is part of GAWK, the GNU implementation of the
    9  * AWK Programming Language.
   10  *
   11  * GAWK is free software; you can redistribute it and/or modify
   12  * it under the terms of the GNU General Public License as published by
   13  * the Free Software Foundation; either version 3 of the License, or
   14  * (at your option) any later version.
   15  *
   16  * GAWK is distributed in the hope that it will be useful,
   17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19  * GNU General Public License for more details.
   20  *
   21  * You should have received a copy of the GNU General Public License
   22  * along with this program; if not, write to the Free Software
   23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
   24  */
   25 
   26 #include "awk.h"
   27 
   28 #include "localeinfo.h"
   29 
   30 static reg_syntax_t syn;
   31 static void check_bracket_exp(char *s, size_t len);
   32 const char *regexflags2str(int flags);
   33 
   34 static struct localeinfo localeinfo;
   35 
   36 /* make_regexp --- generate compiled regular expressions */
   37 
   38 Regexp *
   39 make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
   40 {
   41     static char metas[] = ".*+(){}[]|?^$\\";
   42     Regexp *rp;
   43     const char *rerr;
   44     const char *src = s;
   45     static char *buf = NULL;
   46     static size_t buflen;
   47     const char *end = s + len;
   48     char *dest;
   49     int c, c2;
   50     static bool first = true;
   51     static bool no_dfa = false;
   52     int i;
   53     static struct dfa* dfaregs[2] = { NULL, NULL };
   54     static bool nul_warned = false;
   55 
   56     if (do_lint && ! nul_warned && memchr(s, '\0', len) != NULL) {
   57         nul_warned = true;
   58         lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
   59     }
   60 
   61     /*
   62      * The number of bytes in the current multibyte character.
   63      * It is 0, when the current character is a singlebyte character.
   64      */
   65     size_t is_multibyte = 0;
   66     mbstate_t mbs;
   67 
   68     memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
   69 
   70     if (first) {
   71         /* for debugging and testing */
   72         no_dfa = (getenv("GAWK_NO_DFA") != NULL);
   73         /* don't set first to false here, we do it below */
   74     }
   75 
   76     /* always check */
   77     check_bracket_exp((char *) s, len);
   78 
   79     /* Handle escaped characters first. */
   80 
   81     /*
   82      * Build a copy of the string (in buf) with the
   83      * escaped characters translated, and generate the regex
   84      * from that.
   85      */
   86     if (buf == NULL) {
   87         emalloc(buf, char *, len + 1, "make_regexp");
   88         buflen = len;
   89     } else if (len > buflen) {
   90         erealloc(buf, char *, len + 1, "make_regexp");
   91         buflen = len;
   92     }
   93     dest = buf;
   94 
   95     while (src < end) {
   96         if (gawk_mb_cur_max > 1 && ! is_multibyte) {
   97             /* The previous byte is a singlebyte character, or last byte
   98                of a multibyte character.  We check the next character.  */
   99             is_multibyte = mbrlen(src, end - src, &mbs);
  100             if (   is_multibyte == 1
  101                 || is_multibyte == (size_t) -1
  102                 || is_multibyte == (size_t) -2
  103                 || is_multibyte == 0) {
  104                 /* We treat it as a single-byte character.  */
  105                 is_multibyte = 0;
  106             }
  107         }
  108 
  109         const char *ok_to_escape;
  110         if (do_posix)
  111             ok_to_escape = "{}()|*+?.^$\\[]/-";
  112         else if (do_traditional)
  113             ok_to_escape = "()|*+?.^$\\[]/-";
  114         else
  115             ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
  116 
  117         /* We skip multibyte character, since it must not be a special
  118            character.  */
  119         if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
  120             (*src == '\\')) {
  121             c = *++src;
  122             switch (c) {
  123             case '\0':  /* \\ before \0, either dynamic data or real end of string */
  124                 if (src >= s + len)
  125                     *dest++ = '\\'; // at end of string, will fatal below
  126                 else
  127                     fatal(_("invalid NUL byte in dynamic regexp"));
  128                 break;
  129             case 'a':
  130             case 'b':
  131             case 'f':
  132             case 'n':
  133             case 'r':
  134             case 't':
  135             case 'v':
  136             case 'x':
  137             case '0':
  138             case '1':
  139             case '2':
  140             case '3':
  141             case '4':
  142             case '5':
  143             case '6':
  144             case '7':
  145                 c2 = parse_escape(&src);
  146                 if (c2 < 0)
  147                     cant_happen();
  148                 /*
  149                  * Unix awk treats octal (and hex?) chars
  150                  * literally in re's, so escape regexp
  151                  * metacharacters.
  152                  */
  153                 if (do_traditional
  154                     && ! do_posix
  155                     && (isdigit(c) || c == 'x')
  156                     && strchr("()|*+?.^$\\[]", c2) != NULL)
  157                     *dest++ = '\\';
  158                 *dest++ = (char) c2;
  159                 if (do_lint
  160                     && ! nul_warned
  161                     && c2 == '\0') {
  162                     nul_warned = true;
  163                     lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
  164                 }
  165                 break;
  166             case '8':
  167             case '9':   /* a\9b not valid */
  168                 *dest++ = c;
  169                 src++;
  170             {
  171                 static bool warned[2];
  172 
  173                 if (! warned[c - '8']) {
  174                     warning(_("regexp escape sequence `\\%c' treated as plain `%c'"), c, c);
  175                     warned[c - '8'] = true;
  176                 }
  177             }
  178                 break;
  179             case 'y':   /* normally \b */
  180                 /* gnu regex op */
  181                 if (! do_traditional) {
  182                     *dest++ = '\\';
  183                     *dest++ = 'b';
  184                     src++;
  185                     break;
  186                 }
  187                 /* else, fall through */
  188             default:
  189                 if (strchr(ok_to_escape, c) == NULL) {
  190                     static bool warned[256];
  191 
  192                     if (! warned[c & 0xFF]) {
  193                         warning(_("regexp escape sequence `\\%c' is not a known regexp operator"), c);
  194                         warned[c & 0xFF] = true;
  195                     }
  196                 }
  197                 *dest++ = '\\';
  198                 *dest++ = (char) c;
  199                 src++;
  200                 break;
  201             } /* switch */
  202         } else {
  203             c = *src;
  204             *dest++ = *src++;   /* not '\\' */
  205         }
  206         if (gawk_mb_cur_max > 1 && is_multibyte)
  207             is_multibyte--;
  208     } /* while */
  209 
  210     *dest = '\0';
  211     len = dest - buf;
  212 
  213     ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
  214     rp->pat.allocated = 0;  /* regex will allocate the buffer */
  215     emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
  216 
  217     /*
  218      * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
  219      * was going to turn out to be, I wouldn't have bothered with it.
  220      *
  221      * In the case where we have a multibyte character set, we have no
  222      * choice but to use RE_ICASE, since the casetable is for single-byte
  223      * character sets only.
  224      *
  225      * On the other hand, if we do have a single-byte character set,
  226      * using the casetable should give  a performance improvement, since
  227      * it's computed only once, not each time a regex is compiled.  We
  228      * also think it's probably better for portability.  See the
  229      * discussion by the definition of casetable[] in eval.c.
  230      */
  231 
  232     ignorecase = !! ignorecase; /* force to 1 or 0 */
  233     if (ignorecase) {
  234         if (gawk_mb_cur_max > 1) {
  235             syn |= RE_ICASE;
  236             rp->pat.translate = NULL;
  237         } else {
  238             syn &= ~RE_ICASE;
  239             rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
  240         }
  241     } else {
  242         rp->pat.translate = NULL;
  243         syn &= ~RE_ICASE;
  244     }
  245 
  246     /* initialize dfas to hold syntax */
  247     if (first) {
  248         first = false;
  249         dfaregs[0] = dfaalloc();
  250         dfaregs[1] = dfaalloc();
  251         dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
  252         dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
  253     }
  254 
  255     re_set_syntax(syn);
  256 
  257     if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
  258         refree(rp);
  259         if (! canfatal) {
  260             /* rerr already gettextized inside regex routines */
  261             error("%s: /%s/", rerr, buf);
  262             return NULL;
  263         }
  264         fatal("invalid regexp: %s: /%s/", rerr, buf);
  265     }
  266 
  267     /* gack. this must be done *after* re_compile_pattern */
  268     rp->pat.newline_anchor = false; /* don't get \n in middle of string */
  269     if (dfa && ! no_dfa) {
  270         rp->dfareg = dfaalloc();
  271         dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
  272         dfacomp(buf, len, rp->dfareg, true);
  273     } else
  274         rp->dfareg = NULL;
  275 
  276     /* Additional flags that help with RS as regexp. */
  277     for (i = 0; i < len; i++) {
  278         if (strchr(metas, buf[i]) != NULL) {
  279             rp->has_meta = true;
  280             break;
  281         }
  282     }
  283 
  284     for (i = len - 1; i >= 0; i--) {
  285         if (strchr("*+|?", buf[i]) != NULL) {
  286             rp->maybe_long = true;
  287             break;
  288         }
  289     }
  290 
  291     return rp;
  292 }
  293 
  294 /* research --- do a regexp search. use dfa if possible */
  295 
  296 int
  297 research(Regexp *rp, char *str, int start,
  298      size_t len, int flags)
  299 {
  300     const char *ret = str;
  301     bool try_backref = false;
  302     int need_start;
  303     int no_bol;
  304     int res;
  305 
  306     need_start = ((flags & RE_NEED_START) != 0);
  307     no_bol = ((flags & RE_NO_BOL) != 0);
  308 
  309     if (no_bol)
  310         rp->pat.not_bol = 1;
  311 
  312     /*
  313      * Always do dfa search if can; if it fails, then even if
  314      * need_start is true, we won't bother with the regex search.
  315      *
  316      * The dfa matcher doesn't have a no_bol flag, so don't bother
  317      * trying it in that case.
  318      *
  319      * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
  320      * has bugs in certain multibyte cases and it's too difficult
  321      * to try to special case things.
  322      * 7/2017: Apparently there are some cases where DFA gets
  323      * stuck, even in the C locale, so we use dfa only if not need_start.
  324      *
  325      * Should that issue ever get resolved, note this comment:
  326      *
  327      * 7/2016: The dfa matcher can't handle a case where searching
  328      * starts in the middle of a string, so don't bother trying it
  329      * in that case.
  330      *  if (rp->dfa && ! no_bol && start == 0) ...
  331      */
  332     if (rp->dfareg != NULL && ! no_bol && ! need_start) {
  333         struct dfa *superset = dfasuperset(rp->dfareg);
  334         if (superset)
  335             ret = dfaexec(superset, str+start, str+start+len,
  336                             true, NULL, NULL);
  337 
  338         if (ret && (! need_start
  339                 || (! superset && dfaisfast(rp->dfareg))))
  340             ret = dfaexec(rp->dfareg, str+start, str+start+len,
  341                         true, NULL, &try_backref);
  342     }
  343 
  344     if (ret) {
  345         if (   rp->dfareg == NULL
  346             || start != 0
  347             || no_bol
  348             || need_start
  349             || try_backref) {
  350             /*
  351              * Passing NULL as last arg speeds up search for cases
  352              * where we don't need the start/end info.
  353              */
  354             res = re_search(&(rp->pat), str, start+len,
  355                 start, len, need_start ? &(rp->regs) : NULL);
  356         } else
  357             res = 1;
  358     } else
  359         res = -1;
  360 
  361     rp->pat.not_bol = 0;
  362     return res;
  363 }
  364 
  365 /* refree --- free up the dynamic memory used by a compiled regexp */
  366 
  367 void
  368 refree(Regexp *rp)
  369 {
  370     if (rp == NULL)
  371         return;
  372     rp->pat.translate = NULL;
  373     regfree(& rp->pat);
  374     if (rp->regs.start)
  375         free(rp->regs.start);
  376     if (rp->regs.end)
  377         free(rp->regs.end);
  378     if (rp->dfareg != NULL) {
  379         dfafree(rp->dfareg);
  380         free(rp->dfareg);
  381     }
  382     efree(rp);
  383 }
  384 
  385 /* dfaerror --- print an error message for the dfa routines */
  386 
  387 void
  388 dfaerror(const char *s)
  389 {
  390     fatal("%s", s);
  391     exit(EXIT_FATAL);   /* for DJGPP */
  392 }
  393 
  394 /* re_cache_get --- populate regexp cache if empty */
  395 
  396 static inline Regexp *
  397 re_cache_get(NODE *t)
  398 {
  399     if (t->re_reg[IGNORECASE] == NULL)
  400         t->re_reg[IGNORECASE] = make_regexp(t->re_exp->stptr, t->re_exp->stlen, IGNORECASE, t->re_cnt, true);
  401     return t->re_reg[IGNORECASE];
  402 }
  403 
  404 /* re_update --- recompile a dynamic regexp */
  405 
  406 Regexp *
  407 re_update(NODE *t)
  408 {
  409     NODE *t1;
  410 
  411     if (t->type == Node_val && (t->flags & REGEX) != 0)
  412         return re_cache_get(t->typed_re);
  413 
  414     if ((t->re_flags & CONSTANT) != 0) {
  415         /* it's a constant, so just return it as is */
  416         assert(t->type == Node_regex);
  417         return re_cache_get(t);
  418     }
  419     t1 = t->re_exp;
  420     if (t->re_text != NULL) {
  421         /* if contents haven't changed, just return it */
  422         if (cmp_nodes(t->re_text, t1, true) == 0)
  423             return re_cache_get(t);
  424         /* things changed, fall through to recompile */
  425         unref(t->re_text);
  426     }
  427     /* get fresh copy of the text of the regexp */
  428     t->re_text = dupnode(t1);
  429 
  430     /* text changed */
  431 
  432     /* free old */
  433     if (t->re_reg[0] != NULL) {
  434         refree(t->re_reg[0]);
  435         t->re_reg[0] = NULL;
  436     }
  437     if (t->re_reg[1] != NULL) {
  438         refree(t->re_reg[1]);
  439         t->re_reg[1] = NULL;
  440     }
  441     if (t->re_cnt > 0 && ++t->re_cnt > 10)
  442         /*
  443          * The regex appears to update frequently, so disable DFA
  444          * matching (which trades off expensive upfront compilation
  445          * overhead for faster subsequent matching).
  446          */
  447         t->re_cnt = 0;
  448     if (t->re_text == NULL) {
  449         /* reset regexp text if needed */
  450         t1 = t->re_exp;
  451         unref(t->re_text);
  452         t->re_text = dupnode(t1);
  453     }
  454     return re_cache_get(t);
  455 }
  456 
  457 /* resetup --- choose what kind of regexps we match */
  458 
  459 void
  460 resetup()
  461 {
  462     // init localeinfo for dfa
  463     init_localeinfo(& localeinfo);
  464 
  465     /*
  466      * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
  467      * are helpful for recovering from the experience.
  468      *
  469      *  Aharon Robbins <arnold@skeeve.com>
  470      *  Sun, 21 Oct 2007 23:55:33 +0200
  471      */
  472     if (do_posix)
  473         syn = RE_SYNTAX_POSIX_AWK;  /* strict POSIX re's */
  474     else if (do_traditional)
  475         syn = RE_SYNTAX_AWK;        /* traditional Unix awk re's */
  476     else
  477         syn = RE_SYNTAX_GNU_AWK;    /* POSIX re's + GNU ops */
  478 
  479     /*
  480      * Interval expressions are now on by default, as POSIX is
  481      * wide-spread enough that people want it. The do_intervals
  482      * variable remains for use with --traditional.
  483      */
  484     if (do_intervals)
  485         syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
  486 
  487     (void) re_set_syntax(syn);
  488 }
  489 
  490 /* using_utf8 --- are we using utf8 */
  491 
  492 bool
  493 using_utf8(void)
  494 {
  495     return localeinfo.using_utf8;
  496 }
  497 
  498 /* reisstring --- return true if the RE match is a simple string match */
  499 
  500 int
  501 reisstring(const char *text, size_t len, Regexp *re, const char *buf)
  502 {
  503     int res;
  504     const char *matched;
  505 
  506     /* simple checking for meta characters in re */
  507     if (re->has_meta)
  508         return false;   /* give up early, can't be string match */
  509 
  510     /* make accessable to gdb */
  511     matched = &buf[RESTART(re, buf)];
  512 
  513     res = (memcmp(text, matched, len) == 0);
  514 
  515     return res;
  516 }
  517 
  518 /* reflags2str --- make a regex flags value readable */
  519 
  520 const char *
  521 reflags2str(int flagval)
  522 {
  523     static const struct flagtab values[] = {
  524         { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
  525         { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
  526         { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
  527         { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
  528         { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
  529         { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
  530         { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
  531         { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
  532         { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
  533         { RE_INTERVALS, "RE_INTERVALS" },
  534         { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
  535         { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
  536         { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
  537         { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
  538         { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
  539         { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
  540         { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
  541         { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
  542         { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
  543         { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
  544         { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
  545         { RE_ICASE, "RE_ICASE" },
  546         { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
  547         { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
  548         { RE_NO_SUB, "RE_NO_SUB" },
  549         { 0,    NULL },
  550     };
  551 
  552     if (flagval == RE_SYNTAX_EMACS) /* == 0 */
  553         return "RE_SYNTAX_EMACS";
  554 
  555     return genflags2str(flagval, values);
  556 }
  557 
  558 /*
  559  * dfawarn() is called by the dfa routines whenever a regex is compiled
  560  * must supply a dfawarn.
  561  */
  562 
  563 void
  564 dfawarn(const char *dfa_warning)
  565 {
  566     /*
  567      * This routine does nothing, since gawk does its own
  568      * (better) check for bad [[:foo:]] syntax.
  569      */
  570 }
  571 
  572 /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
  573 
  574 static void
  575 check_bracket_exp(char *s, size_t length)
  576 {
  577     static struct reclass {
  578         const char *name;
  579         size_t len;
  580         bool warned;
  581     } classes[] = {
  582         /*
  583          * Ordered by what we hope is frequency,
  584          * since it's linear searched.
  585          */
  586         { "[:alpha:]", 9, false },
  587         { "[:digit:]", 9, false },
  588         { "[:alnum:]", 9, false },
  589         { "[:upper:]", 9, false },
  590         { "[:lower:]", 9, false },
  591         { "[:space:]", 9, false },
  592         { "[:xdigit:]", 10, false },
  593         { "[:punct:]", 9, false },
  594         { "[:print:]", 9, false },
  595         { "[:graph:]", 9, false },
  596         { "[:cntrl:]", 9, false },
  597         { "[:blank:]", 9, false },
  598         { NULL, 0 }
  599     };
  600     int i;
  601     bool found = false;
  602     char save;
  603     char *sp, *sp2, *end;
  604     int len;
  605     int count = 0;
  606 
  607     if (length == 0)
  608         return;
  609 
  610     end = s + length;
  611     save = s[length];
  612     s[length] = '\0';
  613     sp = s;
  614 
  615 again:
  616     sp = sp2 = memchr(sp, '[', (end - sp));
  617     if (sp == NULL)
  618         goto done;
  619 
  620     for (count++, sp++; *sp != '\0'; sp++) {
  621         if (*sp == '[')
  622             count++;
  623         /*
  624          * ] as first char after open [ is skipped
  625          * \] is skipped
  626          * [^]] is skipped
  627          */
  628         if (*sp == ']' && sp > sp2) {
  629              if (sp[-1] != '['
  630                  && sp[-1] != '\\')
  631                  ;
  632              else if ((sp - sp2) >= 2
  633                   && sp[-1] == '^' && sp[-2] == '[')
  634                  ;
  635              else
  636                 count--;
  637         }
  638 
  639         if (count == 0) {
  640             sp++;   /* skip past ']' */
  641             break;
  642         }
  643     }
  644 
  645     if (count > 0) {    /* bad regex, give up */
  646         goto done;
  647     }
  648 
  649     /* sp2 has start */
  650 
  651     for (i = 0; classes[i].name != NULL; i++) {
  652         if (classes[i].warned)
  653             continue;
  654         len = classes[i].len;
  655         if (   len == (sp - sp2)
  656             && memcmp(sp2, classes[i].name, len) == 0) {
  657             found = true;
  658             break;
  659         }
  660     }
  661 
  662     if (found && ! classes[i].warned) {
  663         warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
  664                 len, sp2, len, sp2);
  665         classes[i].warned = true;
  666     }
  667 
  668     if (sp < end) {
  669         found = false;
  670         goto again;
  671     }
  672 done:
  673     s[length] = save;
  674 }