"Fossies" - the Fresh Open Source Software Archive

Member "gawk-5.1.0/node.c" (6 Feb 2020, 25214 Bytes) of package /linux/misc/gawk-5.1.0.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "node.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /*
    2  * node.c -- routines for node management
    3  */
    4 
    5 /*
    6  * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2015, 2017, 2018, 2019,
    7  * the Free Software Foundation, Inc.
    8  *
    9  * This file is part of GAWK, the GNU implementation of the
   10  * AWK Programming Language.
   11  *
   12  * GAWK is free software; you can redistribute it and/or modify
   13  * it under the terms of the GNU General Public License as published by
   14  * the Free Software Foundation; either version 3 of the License, or
   15  * (at your option) any later version.
   16  *
   17  * GAWK is distributed in the hope that it will be useful,
   18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   20  * GNU General Public License for more details.
   21  *
   22  * You should have received a copy of the GNU General Public License
   23  * along with this program; if not, write to the Free Software
   24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
   25  */
   26 
#include "awk.h"
#include "math.h"
#include "floatmagic.h" /* definition of isnan */
#include <stdlib.h>     /* prototype for strtod */
   30 
/* Forward declarations for helpers that are defined later in this file. */
static int is_ieee_magic_val(const char *val);
static NODE *r_make_number(double x);
static AWKNUM get_ieee_magic_val(char *val);
extern NODE **fmt_list;          /* declared in eval.c */

/*
 * Function pointers through which numeric operations are dispatched.
 * They are initialized here to the double-precision routines.
 * NOTE(review): presumably these are re-pointed when a different numeric
 * backend (e.g. MPFR) is selected -- confirm against the code that
 * assigns them.
 */
NODE *(*make_number)(double) = r_make_number;
NODE *(*str2number)(NODE *) = r_force_number;
NODE *(*format_val)(const char *, int, NODE *) = r_format_val;
int (*cmp_numbers)(const NODE *, const NODE *) = cmp_awknums;
   40 
   41 /* is_hex --- return true if a string looks like a hex value */
   42 
   43 static bool
   44 is_hex(const char *str, const char *cpend)
   45 {
   46     /* on entry, we know the string length is >= 1 */
   47     if (*str == '-' || *str == '+')
   48         str++;
   49 
   50     if (str + 1 < cpend && str[0] == '0' && (str[1] == 'x' || str[1] == 'X'))
   51         return true;
   52 
   53     return false;
   54 }
   55 
   56 /* force_number --- force a value to be numeric */
   57 
   58 NODE *
   59 r_force_number(NODE *n)
   60 {
   61     char *cp;
   62     char *cpend;
   63     char save;
   64     char *ptr;
   65     extern double strtod();
   66 
   67     if ((n->flags & NUMCUR) != 0)
   68         return n;
   69 
   70     /*
   71      * We should always set NUMCUR. If USER_INPUT is set and it's a
   72      * numeric string, we clear STRING and enable NUMBER, but if it's not
   73      * numeric, we disable USER_INPUT.
   74      */
   75 
   76     /* All the conditionals are an attempt to avoid the expensive strtod */
   77 
   78     n->flags |= NUMCUR;
   79     n->numbr = 0.0;
   80 
   81     /* Trim leading white space, bailing out if there's nothing else */
   82     for (cp = n->stptr, cpend = cp + n->stlen;
   83          cp < cpend && isspace((unsigned char) *cp); cp++)
   84         continue;
   85 
   86     if (cp == cpend)
   87         goto badnum;
   88 
   89     /* At this point, we know the string is not entirely white space */
   90     /* Trim trailing white space */
   91     while (isspace((unsigned char) cpend[-1]))
   92         cpend--;
   93 
   94     /*
   95      * 2/2007:
   96      * POSIX, by way of severe language lawyering, seems to
   97      * allow things like "inf" and "nan" to mean something.
   98      * So if do_posix, the user gets what he deserves.
   99      * This also allows hexadecimal floating point. Ugh.
  100      */
  101     if (! do_posix) {
  102         if (is_alpha((unsigned char) *cp))
  103             goto badnum;
  104         else if (cpend == cp+4 && is_ieee_magic_val(cp)) {
  105             n->numbr = get_ieee_magic_val(cp);
  106             goto goodnum;
  107         }
  108         /* else
  109             fall through */
  110     }
  111     /* else POSIX, so
  112         fall through */
  113 
  114     if (   (! do_posix      /* not POSIXLY paranoid and */
  115             && (is_alpha((unsigned char) *cp)   /* letter, or */
  116                     /* CANNOT do non-decimal and saw 0x */
  117             || (! do_non_decimal_data && is_hex(cp, cpend))))) {
  118         goto badnum;
  119     }
  120 
  121     if (cpend - cp == 1) {      /* only one character */
  122         if (isdigit((unsigned char) *cp)) { /* it's a digit! */
  123             n->numbr = (AWKNUM)(*cp - '0');
  124             if (n->stlen == 1)      /* no white space */
  125                 n->flags |= NUMINT;
  126             goto goodnum;
  127         }
  128         goto badnum;
  129     }
  130 
  131     errno = 0;
  132     if (do_non_decimal_data     /* main.c assures false if do_posix */
  133         && ! do_traditional && get_numbase(cp, cpend - cp, true) != 10) {
  134         /* nondec2awknum() saves and restores the byte after the string itself */
  135         n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
  136     } else {
  137         save = *cpend;
  138         *cpend = '\0';
  139         n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
  140         *cpend = save;
  141     }
  142 
  143     if (errno == 0 || errno == ERANGE) {
  144         errno = 0;  /* reset in case of ERANGE */
  145         if (ptr == cpend)
  146             goto goodnum;
  147         /* else keep the leading numeric value without updating flags */
  148         /* fall through to badnum */
  149     } else {
  150         errno = 0;
  151         /*
  152          * N.B. For subnormal values, strtod may return the
  153          * floating-point representation while setting errno to ERANGE.
  154          * We force the numeric value to 0 in such cases.
  155          */
  156         n->numbr = 0;
  157         /*
  158          * Or should we accept it as a NUMBER even though strtod
  159          * threw an error?
  160          */
  161         /* fall through to badnum */
  162     }
  163 badnum:
  164     n->flags &= ~USER_INPUT;
  165     return n;
  166 
  167 goodnum:
  168     if ((n->flags & USER_INPUT) != 0) {
  169         /* leave USER_INPUT enabled to indicate that this is a strnum */
  170         n->flags &= ~STRING;
  171         n->flags |= NUMBER;
  172     }
  173     return n;
  174 }
  175 
  176 
  177 /*
  178  * The following lookup table is used as an optimization in force_string;
  179  * (more complicated) variations on this theme didn't seem to pay off, but
  180  * systematic testing might be in order at some point.
  181  */
  182 static const char *values[] = {
  183     "0",
  184     "1",
  185     "2",
  186     "3",
  187     "4",
  188     "5",
  189     "6",
  190     "7",
  191     "8",
  192     "9",
  193 };
  194 #define NVAL    (sizeof(values)/sizeof(values[0]))
  195 
  196 /* r_format_val --- format a numeric value based on format */
  197 
  198 NODE *
  199 r_format_val(const char *format, int index, NODE *s)
  200 {
  201     char buf[BUFSIZ];
  202     char *sp = buf;
  203     double val;
  204 
  205     /*
  206      * 2/2007: Simplify our lives here. Instead of worrying about
  207      * whether or not the value will fit into a long just so we
  208      * can use sprintf("%ld", val) on it, always format it ourselves.
  209      * The only thing to worry about is that integral values always
  210      * format as integers. %.0f does that very well.
  211      *
  212      * 6/2008: Would that things were so simple. Always using %.0f
  213      * imposes a notable performance penalty for applications that
  214      * do a lot of conversion of integers to strings. So, we reinstate
  215      * the old code, but use %.0f for integral values that are outside
  216      * the range of a long.  This seems a reasonable compromise.
  217      *
  218      * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
  219      * < and > so that things work correctly on systems with 64 bit integers.
  220      */
  221 
  222     if (out_of_range(s)) {
  223         const char *result = format_nan_inf(s, 'g');
  224         return make_string(result, strlen(result));
  225     } else if ((val = double_to_int(s->numbr)) != s->numbr
  226             || val <= LONG_MIN || val >= LONG_MAX
  227     ) {
  228         /* not an integral value, or out of integer range */
  229         /*
  230          * Once upon a time, we just blindly did this:
  231          *  sprintf(sp, format, s->numbr);
  232          *  s->stlen = strlen(sp);
  233          *  s->stfmt = index;
  234          * but that's no good if, e.g., OFMT is %s. So we punt,
  235          * and just always format the value ourselves.
  236          */
  237 
  238         NODE *dummy[2], *r;
  239         unsigned int oflags;
  240 
  241         /* create dummy node for a sole use of format_tree */
  242         dummy[1] = s;
  243         oflags = s->flags;
  244 
  245         if (val == s->numbr) {
  246             /* integral value, but outside range of %ld, use %.0f */
  247             r = format_tree("%.0f", 4, dummy, 2);
  248             s->stfmt = STFMT_UNUSED;
  249         } else {
  250             r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
  251             assert(r != NULL);
  252             s->stfmt = index;
  253         }
  254         s->flags = oflags;
  255         s->stlen = r->stlen;
  256         if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
  257             efree(s->stptr);
  258         s->stptr = r->stptr;
  259 #ifdef HAVE_MPFR
  260         s->strndmode = MPFR_round_mode;
  261 #endif
  262         freenode(r);    /* Do not unref(r)! We want to keep s->stptr == r->stpr.  */
  263 
  264         goto no_malloc;
  265     } else {
  266         /*
  267          * integral value; force conversion to long only once.
  268          */
  269         long num = (long) val;
  270 
  271         if (num < NVAL && num >= 0) {
  272             sp = (char *) values[num];
  273             s->stlen = 1;
  274         } else {
  275             (void) sprintf(sp, "%ld", num);
  276             s->stlen = strlen(sp);
  277         }
  278         s->stfmt = STFMT_UNUSED;
  279         if ((s->flags & INTIND) != 0) {
  280             s->flags &= ~(INTIND|NUMBER);
  281             s->flags |= STRING;
  282         }
  283 #ifdef HAVE_MPFR
  284         s->strndmode = MPFR_round_mode;
  285 #endif
  286     }
  287     if ((s->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
  288         efree(s->stptr);
  289     emalloc(s->stptr, char *, s->stlen + 1, "format_val");
  290     memcpy(s->stptr, sp, s->stlen + 1);
  291 no_malloc:
  292     s->flags |= STRCUR;
  293     free_wstr(s);
  294     return s;
  295 }
  296 
  297 /* r_dupnode --- duplicate a node */
  298 
  299 NODE *
  300 r_dupnode(NODE *n)
  301 {
  302     NODE *r;
  303 
  304     assert(n->type == Node_val);
  305 
  306 #ifdef GAWKDEBUG
  307     if ((n->flags & MALLOC) != 0) {
  308         n->valref++;
  309         return n;
  310     }
  311 #endif
  312 
  313 #ifdef HAVE_MPFR
  314     if ((n->flags & MPZN) != 0) {
  315         r = mpg_integer();
  316         mpz_set(r->mpg_i, n->mpg_i);
  317         r->flags = n->flags;
  318     } else if ((n->flags & MPFN) != 0) {
  319         r = mpg_float();
  320         int tval = mpfr_set(r->mpg_numbr, n->mpg_numbr, ROUND_MODE);
  321         IEEE_FMT(r->mpg_numbr, tval);
  322         r->flags = n->flags;
  323     } else {
  324 #endif
  325         getnode(r);
  326         *r = *n;
  327 #ifdef HAVE_MPFR
  328     }
  329 #endif
  330 
  331     r->flags |= MALLOC;
  332     r->valref = 1;
  333     /*
  334      * DON'T call free_wstr(r) here!
  335      * r->wstptr still points at n->wstptr's value, and we
  336      * don't want to free it!
  337      */
  338     r->wstptr = NULL;
  339     r->wstlen = 0;
  340 
  341     if ((n->flags & STRCUR) != 0) {
  342         emalloc(r->stptr, char *, n->stlen + 1, "r_dupnode");
  343         memcpy(r->stptr, n->stptr, n->stlen);
  344         r->stptr[n->stlen] = '\0';
  345         if ((n->flags & WSTRCUR) != 0) {
  346             r->wstlen = n->wstlen;
  347             emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "r_dupnode");
  348             memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
  349             r->wstptr[n->wstlen] = L'\0';
  350             r->flags |= WSTRCUR;
  351         }
  352     }
  353 
  354     return r;
  355 }
  356 
  357 /* r_make_number --- allocate a node with defined number */
  358 
  359 static NODE *
  360 r_make_number(double x)
  361 {
  362     NODE *r = make_number_node(0);
  363     r->numbr = x;
  364     return r;
  365 }
  366 
  367 /* cmp_awknums --- compare two AWKNUMs */
  368 
  369 int
  370 cmp_awknums(const NODE *t1, const NODE *t2)
  371 {
  372     /*
  373      * This routine is also used to sort numeric array indices or values.
  374      * For the purposes of sorting, NaN is considered greater than
  375      * any other value, and all NaN values are considered equivalent and equal.
  376      * This isn't in compliance with IEEE standard, but compliance w.r.t. NaN
  377      * comparison at the awk level is a different issue, and needs to be dealt
  378      * with in the interpreter for each opcode seperately.
  379      */
  380 
  381     if (isnan(t1->numbr))
  382         return ! isnan(t2->numbr);
  383     if (isnan(t2->numbr))
  384         return -1;
  385     /* don't subtract, in case one or both are infinite */
  386     if (t1->numbr == t2->numbr)
  387         return 0;
  388     if (t1->numbr < t2->numbr)
  389         return -1;
  390     return 1;
  391 }
  392 
  393 
  394 /* make_str_node --- make a string node */
  395 
  396 NODE *
  397 make_str_node(const char *s, size_t len, int flags)
  398 {
  399     NODE *r;
  400     getnode(r);
  401     r->type = Node_val;
  402     r->numbr = 0;
  403     r->flags = (MALLOC|STRING|STRCUR);
  404     r->valref = 1;
  405     r->stfmt = STFMT_UNUSED;
  406 #ifdef HAVE_MPFR
  407     r->strndmode = MPFR_round_mode;
  408 #endif
  409     r->wstptr = NULL;
  410     r->wstlen = 0;
  411 
  412     if ((flags & ALREADY_MALLOCED) != 0)
  413         r->stptr = (char *) s;
  414     else {
  415         emalloc(r->stptr, char *, len + 1, "make_str_node");
  416         memcpy(r->stptr, s, len);
  417     }
  418     r->stptr[len] = '\0';
  419 
  420     if ((flags & SCAN) != 0) {  /* scan for escape sequences */
  421         const char *pf;
  422         char *ptm;
  423         int c;
  424         const char *end;
  425         mbstate_t cur_state;
  426 
  427         memset(& cur_state, 0, sizeof(cur_state));
  428 
  429         end = &(r->stptr[len]);
  430         for (pf = ptm = r->stptr; pf < end;) {
  431             /*
  432              * Keep multibyte characters together. This avoids
  433              * problems if a subsequent byte of a multibyte
  434              * character happens to be a backslash.
  435              */
  436             if (gawk_mb_cur_max > 1) {
  437                 int mblen = mbrlen(pf, end-pf, &cur_state);
  438 
  439                 if (mblen > 1) {
  440                     int i;
  441 
  442                     for (i = 0; i < mblen; i++)
  443                         *ptm++ = *pf++;
  444                     continue;
  445                 }
  446             }
  447 
  448             c = *pf++;
  449             if (c == '\\') {
  450                 c = parse_escape(&pf);
  451                 if (c < 0) {
  452                     if (do_lint)
  453                         lintwarn(_("backslash string continuation is not portable"));
  454                     if ((flags & ELIDE_BACK_NL) != 0)
  455                         continue;
  456                     c = '\\';
  457                 }
  458                 *ptm++ = c;
  459             } else
  460                 *ptm++ = c;
  461         }
  462         len = ptm - r->stptr;
  463         erealloc(r->stptr, char *, len + 1, "make_str_node");
  464         r->stptr[len] = '\0';
  465     }
  466     r->stlen = len;
  467 
  468     return r;
  469 }
  470 
  471 /* make_typed_regex --- make a typed regex node */
  472 
  473 NODE *
  474 make_typed_regex(const char *re, size_t len)
  475 {
  476     NODE *n, *exp, *n2;
  477 
  478     exp = make_str_node(re, len, ALREADY_MALLOCED);
  479     n = make_regnode(Node_regex, exp);
  480     if (n == NULL)
  481         fatal(_("could not make typed regex"));
  482 
  483     n2 = make_string(re, len);
  484     n2->typed_re = n;
  485     n2->numbr = 0;
  486     n2->flags |= NUMCUR|STRCUR|REGEX; 
  487     n2->flags &= ~(STRING|NUMBER);
  488 
  489     return n2;
  490 }
  491 
  492 
  493 /* unref --- remove reference to a particular node */
  494 
  495 void
  496 r_unref(NODE *tmp)
  497 {
  498 #ifdef GAWKDEBUG
  499     if (tmp == NULL)
  500         return;
  501     if ((tmp->flags & MALLOC) != 0) {
  502         if (tmp->valref > 1) {
  503             tmp->valref--;
  504             return;
  505         }
  506         if ((tmp->flags & STRCUR) != 0)
  507             efree(tmp->stptr);
  508     }
  509 #else
  510     if ((tmp->flags & (MALLOC|STRCUR)) == (MALLOC|STRCUR))
  511         efree(tmp->stptr);
  512 #endif
  513 
  514     mpfr_unset(tmp);
  515 
  516     free_wstr(tmp);
  517     freenode(tmp);
  518 }
  519 
  520 
  521 /*
  522  * parse_escape:
  523  *
  524  * Parse a C escape sequence.  STRING_PTR points to a variable containing a
  525  * pointer to the string to parse.  That pointer is updated past the
  526  * characters we use.  The value of the escape sequence is returned.
  527  *
  528  * A negative value means the sequence \ newline was seen, which is supposed to
  529  * be equivalent to nothing at all.
  530  *
  531  * If \ is followed by a null character, we return a negative value and leave
  532  * the string pointer pointing at the null character.
  533  *
  534  * If \ is followed by 000, we return 0 and leave the string pointer after the
  535  * zeros.  A value of 0 does not mean end of string.
  536  *
  537  * POSIX doesn't allow \x.
  538  */
  539 
  540 int
  541 parse_escape(const char **string_ptr)
  542 {
  543     int c = *(*string_ptr)++;
  544     int i;
  545     int count;
  546     int j;
  547     const char *start;
  548 
  549     if (do_lint_old) {
  550         switch (c) {
  551         case 'a':
  552         case 'b':
  553         case 'f':
  554         case 'r':
  555             lintwarn(_("old awk does not support the `\\%c' escape sequence"), c);
  556             break;
  557         }
  558     }
  559 
  560     switch (c) {
  561     case 'a':
  562         return '\a';
  563     case 'b':
  564         return '\b';
  565     case 'f':
  566         return '\f';
  567     case 'n':
  568         return '\n';
  569     case 'r':
  570         return '\r';
  571     case 't':
  572         return '\t';
  573     case 'v':
  574         return '\v';
  575     case '\n':
  576         return -2;
  577     case 0:
  578         (*string_ptr)--;
  579         return -1;
  580     case '0':
  581     case '1':
  582     case '2':
  583     case '3':
  584     case '4':
  585     case '5':
  586     case '6':
  587     case '7':
  588         i = c - '0';
  589         count = 0;
  590         while (++count < 3) {
  591             if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
  592                 i *= 8;
  593                 i += c - '0';
  594             } else {
  595                 (*string_ptr)--;
  596                 break;
  597             }
  598         }
  599         return i;
  600     case 'x':
  601         if (do_lint) {
  602             static bool warned = false;
  603 
  604             if (! warned) {
  605                 warned = true;
  606                 lintwarn(_("POSIX does not allow `\\x' escapes"));
  607             }
  608         }
  609         if (do_posix)
  610             return ('x');
  611         if (! isxdigit((unsigned char) (*string_ptr)[0])) {
  612             warning(_("no hex digits in `\\x' escape sequence"));
  613             return ('x');
  614         }
  615         start = *string_ptr;
  616         for (i = j = 0; j < 2; j++) {
  617             /* do outside test to avoid multiple side effects */
  618             c = *(*string_ptr)++;
  619             if (isxdigit(c)) {
  620                 i *= 16;
  621                 if (isdigit(c))
  622                     i += c - '0';
  623                 else if (isupper(c))
  624                     i += c - 'A' + 10;
  625                 else
  626                     i += c - 'a' + 10;
  627             } else {
  628                 (*string_ptr)--;
  629                 break;
  630             }
  631         }
  632         if (do_lint && j > 2)
  633             lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), j, start, j);
  634         return i;
  635     case '\\':
  636     case '"':
  637         return c;
  638     default:
  639     {
  640         static bool warned[256];
  641         unsigned char uc = (unsigned char) c;
  642 
  643         /* N.B.: use unsigned char here to avoid Latin-1 problems */
  644 
  645         if (! warned[uc]) {
  646             warned[uc] = true;
  647 
  648             warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
  649         }
  650     }
  651         return c;
  652     }
  653 }
  654 
  655 /* get_numbase --- return the base to use for the number in 's' */
  656 
  657 int
  658 get_numbase(const char *s, size_t len, bool use_locale)
  659 {
  660     int dec_point = '.';
  661     const char *str = s;
  662 
  663 #if defined(HAVE_LOCALE_H)
  664     /*
  665      * loc.decimal_point may not have been initialized yet,
  666      * so double check it before using it.
  667      */
  668     if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
  669         dec_point = loc.decimal_point[0];   /* XXX --- assumes one char */
  670 #endif
  671 
  672     if (len < 2 || str[0] != '0')
  673         return 10;
  674 
  675     /* leading 0x or 0X */
  676     if (str[1] == 'x' || str[1] == 'X')
  677         return 16;
  678 
  679     /*
  680      * Numbers with '.', 'e', or 'E' are decimal.
  681      * Have to check so that things like 00.34 are handled right.
  682      *
  683      * These beasts can have trailing whitespace. Deal with that too.
  684      */
  685     for (; len > 0; len--, str++) {
  686         if (*str == 'e' || *str == 'E' || *str == dec_point)
  687             return 10;
  688         else if (! isdigit((unsigned char) *str))
  689             break;
  690     }
  691 
  692     if (! isdigit((unsigned char) s[1])
  693             || s[1] == '8' || s[1] == '9'
  694     )
  695         return 10;
  696     return 8;
  697 }
  698 
  699 /* str2wstr --- convert a multibyte string to a wide string */
  700 
  701 NODE *
  702 str2wstr(NODE *n, size_t **ptr)
  703 {
  704     size_t i, count, src_count;
  705     char *sp;
  706     mbstate_t mbs;
  707     wchar_t wc, *wsp;
  708     static bool warned = false;
  709 
  710     assert((n->flags & (STRING|STRCUR)) != 0);
  711 
  712     /*
  713      * Don't convert global null string or global null field
  714      * variables to a wide string. They are both zero-length anyway.
  715      * This also avoids future double-free errors while releasing
  716      * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
  717      */
  718     if (n == Nnull_string || n == Null_field)
  719         return n;
  720 
  721     if ((n->flags & WSTRCUR) != 0) {
  722         if (ptr == NULL)
  723             return n;
  724         /* otherwise
  725             fall through and recompute to fill in the array */
  726         free_wstr(n);
  727     }
  728 
  729     /*
  730      * After consideration and consultation, this
  731      * code trades space for time. We allocate
  732      * an array of wchar_t that is n->stlen long.
  733      * This is needed in the worst case anyway, where
  734      * each input byte maps to one wchar_t.  The
  735      * advantage is that we only have to convert the string
  736      * once, instead of twice, once to find out how many
  737      * wide characters, and then again to actually fill in
  738      * the info.  If there's a lot left over, we can
  739      * realloc the wide string down in size.
  740      */
  741 
  742     emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 1), "str2wstr");
  743     wsp = n->wstptr;
  744 
  745     /*
  746      * For use by do_match, create and fill in an array.
  747      * For each byte `i' in n->stptr (the original string),
  748      * a[i] is equal to `j', where `j' is the corresponding wchar_t
  749      * in the converted wide string.
  750      *
  751      * Create the array.
  752      */
  753     if (ptr != NULL) {
  754         ezalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
  755     }
  756 
  757     sp = n->stptr;
  758     src_count = n->stlen;
  759     memset(& mbs, 0, sizeof(mbs));
  760     for (i = 0; src_count > 0; i++) {
  761         /*
  762          * 9/2010: Check the current byte; if it's a valid character,
  763          * then it doesn't start a multibyte sequence. This brings a
  764          * big speed up. Thanks to Ulrich Drepper for the tip.
  765          * 11/2010: Thanks to Paolo Bonzini for some even faster code.
  766          */
  767         if (is_valid_character(*sp)) {
  768             count = 1;
  769             wc = btowc_cache(*sp);
  770         } else
  771             count = mbrtowc(& wc, sp, src_count, & mbs);
  772         switch (count) {
  773         case (size_t) -2:
  774         case (size_t) -1:
  775             /*
  776              * mbrtowc(3) says the state of mbs becomes undefined
  777              * after a bad character, so reset it.
  778              */
  779             memset(& mbs, 0, sizeof(mbs));
  780 
  781             /* Warn the user something's wrong */
  782             if (! warned) {
  783                 warned = true;
  784                 warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
  785             }
  786 
  787             /*
  788              * 8/2015: If we're using UTF, then instead of just
  789              * skipping the character, plug in the Unicode
  790              * replacement character. In most cases this gives
  791              * us "better" results, in that character counts
  792              * and string lengths tend to make more sense.
  793              *
  794              * Otherwise, just skip the bad byte and keep going,
  795              * so that we get a more-or-less full string, instead of
  796              * stopping early. This is particularly important
  797              * for match() where we need to build the indices.
  798              */
  799             if (using_utf8()) {
  800                 count = 1;
  801                 wc = 0xFFFD;    /* unicode replacement character */
  802                 goto set_wc;
  803             } else {
  804                 /* skip it and keep going */
  805                 sp++;
  806                 src_count--;
  807             }
  808             break;
  809 
  810         case 0:
  811             count = 1;
  812             /* fall through */
  813         default:
  814         set_wc:
  815             *wsp++ = wc;
  816             src_count -= count;
  817             while (count--)  {
  818                 if (ptr != NULL)
  819                     (*ptr)[sp - n->stptr] = i;
  820                 sp++;
  821             }
  822             break;
  823         }
  824     }
  825 
  826     *wsp = L'\0';
  827     n->wstlen = wsp - n->wstptr;
  828     n->flags |= WSTRCUR;
  829 #define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
  830     if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
  831         erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 1), "str2wstr");
  832 
  833     return n;
  834 }
  835 
  836 /* wstr2str --- convert a wide string back into multibyte one */
  837 
  838 NODE *
  839 wstr2str(NODE *n)
  840 {
  841     size_t result;
  842     size_t length;
  843     wchar_t *wp;
  844     mbstate_t mbs;
  845     char *newval, *cp;
  846 
  847     assert(n->valref == 1);
  848     assert((n->flags & WSTRCUR) != 0);
  849 
  850     /*
  851      * Convert the wide chars in t1->wstptr back into m.b. chars.
  852      * This is pretty grotty, but it's the most straightforward
  853      * way to do things.
  854      */
  855     memset(& mbs, 0, sizeof(mbs));
  856 
  857     length = n->wstlen;
  858     emalloc(newval, char *, (length * gawk_mb_cur_max) + 1, "wstr2str");
  859 
  860     wp = n->wstptr;
  861     for (cp = newval; length > 0; length--) {
  862         result = wcrtomb(cp, *wp, & mbs);
  863         if (result == (size_t) -1)  /* what to do? break seems best */
  864             break;
  865         cp += result;
  866         wp++;
  867     }
  868     *cp = '\0';
  869 
  870     /* N.B. caller just created n with make_string, so this free is safe */
  871     efree(n->stptr);
  872     n->stptr = newval;
  873     n->stlen = cp - newval;
  874 
  875     return n;
  876 }
  877 
  878 /* free_wstr --- release the wide string part of a node */
  879 
  880 void
  881 r_free_wstr(NODE *n)
  882 {
  883     assert(n->type == Node_val);
  884 
  885     if ((n->flags & WSTRCUR) != 0) {
  886         assert(n->wstptr != NULL);
  887         efree(n->wstptr);
  888     }
  889     n->wstptr = NULL;
  890     n->wstlen = 0;
  891     n->flags &= ~WSTRCUR;
  892 }
  893 
  894 static void __attribute__ ((unused))
  895 dump_wstr(FILE *fp, const wchar_t *str, size_t len)
  896 {
  897     if (str == NULL || len == 0)
  898         return;
  899 
  900     for (; len--; str++)
  901         putwc(*str, fp);
  902 }
  903 
  904 /* wstrstr --- walk haystack, looking for needle, wide char version */
  905 
  906 const wchar_t *
  907 wstrstr(const wchar_t *haystack, size_t hs_len,
  908     const wchar_t *needle, size_t needle_len)
  909 {
  910     size_t i;
  911 
  912     if (haystack == NULL || needle == NULL || needle_len > hs_len)
  913         return NULL;
  914 
  915     for (i = 0; i < hs_len; i++) {
  916         if (haystack[i] == needle[0]
  917             && i+needle_len-1 < hs_len
  918             && haystack[i+needle_len-1] == needle[needle_len-1]) {
  919             /* first & last chars match, check string */
  920             if (memcmp(haystack+i, needle, sizeof(wchar_t) * needle_len) == 0) {
  921                 return haystack + i;
  922             }
  923         }
  924     }
  925 
  926     return NULL;
  927 }
  928 
  929 /* wcasestrstr --- walk haystack, nocase look for needle, wide char version */
  930 
  931 const wchar_t *
  932 wcasestrstr(const wchar_t *haystack, size_t hs_len,
  933     const wchar_t *needle, size_t needle_len)
  934 {
  935     size_t i, j;
  936 
  937     if (haystack == NULL || needle == NULL || needle_len > hs_len)
  938         return NULL;
  939 
  940     for (i = 0; i < hs_len; i++) {
  941         if (towlower(haystack[i]) == towlower(needle[0])
  942             && i+needle_len-1 < hs_len
  943             && towlower(haystack[i+needle_len-1]) == towlower(needle[needle_len-1])) {
  944             /* first & last chars match, check string */
  945             const wchar_t *start;
  946 
  947             start = haystack+i;
  948             for (j = 0; j < needle_len; j++, start++) {
  949                 wchar_t h, n;
  950 
  951                 h = towlower(*start);
  952                 n = towlower(needle[j]);
  953                 if (h != n)
  954                     goto out;
  955             }
  956             return haystack + i;
  957         }
  958 out:    ;
  959     }
  960 
  961     return NULL;
  962 }
  963 
  964 /* is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan */
  965 
  966 static int
  967 is_ieee_magic_val(const char *val)
  968 {
  969     /*
  970      * Avoid strncasecmp: it mishandles ASCII bytes in some locales.
  971      * Assume the length is 4, as the caller checks this.
  972      */
  973     return (   (val[0] == '+' || val[0] == '-')
  974         && (   (   (val[1] == 'i' || val[1] == 'I')
  975             && (val[2] == 'n' || val[2] == 'N')
  976             && (val[3] == 'f' || val[3] == 'F'))
  977             || (   (val[1] == 'n' || val[1] == 'N')
  978             && (val[2] == 'a' || val[2] == 'A')
  979             && (val[3] == 'n' || val[3] == 'N'))));
  980 }
  981 
  982 /* get_ieee_magic_val --- return magic value for string */
  983 
  984 static AWKNUM
  985 get_ieee_magic_val(char *val)
  986 {
  987     static bool first = true;
  988     static AWKNUM inf;
  989     static AWKNUM nan;
  990     char save;
  991 
  992     char *ptr;
  993     save = val[4];
  994     val[4] = '\0';
  995     AWKNUM v = strtod(val, &ptr);
  996     val[4] = save;
  997 
  998     if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
  999         if (first) {
 1000             first = false;
 1001             nan = sqrt(-1.0);
 1002             inf = -log(0.0);
 1003         }
 1004 
 1005         v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
 1006         if (val[0] == '-')
 1007             v = -v;
 1008     }
 1009 
 1010     return v;
 1011 }
 1012 
 1013 wint_t btowc_cache[256];
 1014 
 1015 /* init_btowc_cache --- initialize the cache */
 1016 
 1017 void init_btowc_cache()
 1018 {
 1019     int i;
 1020 
 1021     for (i = 0; i < 255; i++) {
 1022         btowc_cache[i] = btowc(i);
 1023     }
 1024 }
 1025 
/* BLOCKCHUNK --- how many items more_blocks() allocates in one request */
#define BLOCKCHUNK 100

/*
 * nextfree --- allocator bookkeeping, one entry per kind of block,
 * indexed by the block id passed to the getblock/freeblock routines.
 *
 * Each initializer gives three leading members; judging by how this
 * file uses the entries, these are presumably the free-list head
 * (freep), the size in bytes of one item (size), and a short label.
 * NOTE(review): struct block_header is declared elsewhere; confirm
 * the member order there.  Members not listed start out as zero.
 * The mpfr and mpz entries exist only when HAVE_MPFR is defined.
 */
struct block_header nextfree[BLOCK_MAX] = {
    { NULL, sizeof(NODE), "node" },
    { NULL, sizeof(BUCKET), "bucket" },
#ifdef HAVE_MPFR
    { NULL, sizeof(mpfr_t), "mpfr" },
    { NULL, sizeof(mpz_t), "mpz" },
#endif
};
 1036 
 1037 #ifdef MEMDEBUG
 1038 
 1039 void *
 1040 r_getblock(int id)
 1041 {
 1042     void *res;
 1043     emalloc(res, void *, nextfree[id].size, "getblock");
 1044     nextfree[id].active++;
 1045     if (nextfree[id].highwater < nextfree[id].active)
 1046         nextfree[id].highwater = nextfree[id].active;
 1047     return res;
 1048 }
 1049 
 1050 void
 1051 r_freeblock(void *p, int id)
 1052 {
 1053     nextfree[id].active--;
 1054     free(p);
 1055 }
 1056 
 1057 #else
 1058 
 1059 /* more_blocks --- get more blocks of memory and add to the free list;
 1060     size of a block must be >= sizeof(struct block_item)
 1061  */
 1062 
 1063 void *
 1064 more_blocks(int id)
 1065 {
 1066     struct block_item *freep, *np, *next;
 1067     char *p, *endp;
 1068     size_t size;
 1069 
 1070     size = nextfree[id].size;
 1071 
 1072     assert(size >= sizeof(struct block_item));
 1073     emalloc(freep, struct block_item *, BLOCKCHUNK * size, "more_blocks");
 1074     p = (char *) freep;
 1075     endp = p + BLOCKCHUNK * size;
 1076 
 1077     for (np = freep; ; np = next) {
 1078         next = (struct block_item *) (p += size);
 1079         if (p >= endp) {
 1080             np->freep = NULL;
 1081             break;
 1082         }
 1083         np->freep = next;
 1084     }
 1085     nextfree[id].freep = freep->freep;
 1086     nextfree[id].highwater += BLOCKCHUNK;
 1087     return freep;
 1088 }
 1089 
 1090 #endif