"Fossies" - the Fresh Open Source Software Archive

Member "gawk-5.1.0/field.c" (10 Apr 2020, 43919 Bytes) of package /linux/misc/gawk-5.1.0.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "field.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /*
    2  * field.c - routines for dealing with fields and record parsing
    3  */
    4 
    5 /*
    6  * Copyright (C) 1986, 1988, 1989, 1991-2020 the Free Software Foundation, Inc.
    7  *
    8  * This file is part of GAWK, the GNU implementation of the
    9  * AWK Programming Language.
   10  *
   11  * GAWK is free software; you can redistribute it and/or modify
   12  * it under the terms of the GNU General Public License as published by
   13  * the Free Software Foundation; either version 3 of the License, or
   14  * (at your option) any later version.
   15  *
   16  * GAWK is distributed in the hope that it will be useful,
   17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19  * GNU General Public License for more details.
   20  *
   21  * You should have received a copy of the GNU General Public License
   22  * along with this program; if not, write to the Free Software
   23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
   24  */
   25 
   26 #include "awk.h"
   27 
   28 /*
   29  * In case that the system doesn't have isblank().
   30  * Don't bother with autoconf ifdef junk, just force it.
   31  * See dfa.c and regex_internal.h and regcomp.c. Bleah.
   32  */
   33 static int
   34 is_blank(int c)
   35 {
   36     return c == ' ' || c == '\t';
   37 }
   38 
/*
 * Setfunc --- callback through which the field parsers store one parsed
 * piece.  Both set_field() and set_element() declared below have this
 * signature: (number, start of text, length of text, extra node).
 */
typedef void (* Setfunc)(long, char *, long, NODE *);

/* is the API currently overriding the default parsing mechanism? */
static bool api_parser_override = false;
/*
 * Common signature of every *_parse_field routine in this file.  In the
 * order used by the definitions below, the arguments are:
 * up_to, buf, len, fs, rp, set, n, sep_arr, in_middle.
 * The return value is the number of the last field that was set.
 */
typedef long (*parse_field_func_t)(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
/* the parsing routine currently in effect; called as (*parse_field)(...) */
static parse_field_func_t parse_field;
/*
 * N.B. The normal_parse_field function pointer contains the parse_field value
 * that should be used except when API field parsing is overriding the default
 * field parsing mechanism.
 */
static parse_field_func_t normal_parse_field;
static long re_parse_field(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
static long def_parse_field(long, char **, int, NODE *,
                  Regexp *, Setfunc, NODE *, NODE *, bool);
static long null_parse_field(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
static long sc_parse_field(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
/* field width description most recently handed to set_record() (may be NULL) */
static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
                 Regexp *, Setfunc, NODE *, NODE *, bool);
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
static void purge_record(void);

static char *parse_extent;  /* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
static long nf_high_water = 0;  /* size of fields_arr */
static bool resave_fs;      /* when true, purge_record() refreshes save_FS from FS */
static NODE *save_FS;       /* save current value of FS when line is read,
                 * to be used in deferred parsing
                 */
static NODE *save_FPAT;     /* save current value of FPAT when line is read,
                 * to be used in deferred parsing
                 */
/* field widths used by fw_parse_field() when the API is not overriding parsing */
static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;

NODE **fields_arr;      /* array of pointers to the field nodes */
bool field0_valid;      /* $(>0) has not been changed yet */
int default_FS;         /* true when FS == " " */
/*
 * NOTE(review): the *_yes_case / *_no_case pairs are presumably the
 * case-sensitive and case-insensitive compilations of FS and FPAT, with
 * FS_regexp / FPAT_regexp selecting the one in effect -- confirm against
 * the code that assigns them.  re_parse_field() falls back to FS_regexp
 * when it is handed a NULL regexp.
 */
Regexp *FS_re_yes_case = NULL;
Regexp *FS_re_no_case = NULL;
Regexp *FS_regexp = NULL;
Regexp *FPAT_re_yes_case = NULL;
Regexp *FPAT_re_no_case = NULL;
Regexp *FPAT_regexp = NULL;
/* template for an empty field; created in init_fields() and copied into unused slots */
NODE *Null_field = NULL;
   92 
   93 /* init_fields --- set up the fields array to start with */
   94 
   95 void
   96 init_fields()
   97 {
   98     emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
   99 
  100     fields_arr[0] = make_string("", 0);
  101     fields_arr[0]->flags |= NULL_FIELD;
  102 
  103     parse_extent = fields_arr[0]->stptr;
  104     save_FS = dupnode(FS_node->var_value);
  105 
  106     Null_field = make_string("", 0);
  107     Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
  108 
  109     field0_valid = true;
  110 }
  111 
  112 /* grow_fields --- acquire new fields as needed */
  113 
  114 static void
  115 grow_fields_arr(long num)
  116 {
  117     int t;
  118     NODE *n;
  119 
  120     erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
  121     for (t = nf_high_water + 1; t <= num; t++) {
  122         getnode(n);
  123         *n = *Null_field;
  124         fields_arr[t] = n;
  125     }
  126     nf_high_water = num;
  127 }
  128 
  129 /* set_field --- set the value of a particular field */
  130 
  131 /*ARGSUSED*/
  132 static void
  133 set_field(long num,
  134     char *str,
  135     long len,
  136     NODE *dummy ATTRIBUTE_UNUSED)   /* just to make interface same as set_element */
  137 {
  138     NODE *n;
  139 
  140     if (num > nf_high_water)
  141         grow_fields_arr(num);
  142     n = fields_arr[num];
  143     n->stptr = str;
  144     n->stlen = len;
  145     n->flags = (STRCUR|STRING|USER_INPUT);  /* do not set MALLOC */
  146 }
  147 
/*
 * rebuild_record --- Someone assigned a value to $(something).
 *          Fix up $0 to be right
 *
 * Builds a new $0 by joining fields 1..NF with OFS, re-points the fields
 * that are not separately malloc'ed into the new buffer, releases the old
 * $0 and marks field 0 as valid again.  NF must already be known.
 */

void
rebuild_record()
{
    /*
     * use explicit unsigned longs for lengths, in case
     * a size_t isn't big enough.
     */
    unsigned long tlen;
    NODE *tmp;
    char *ops;
    char *cops;
    long i;

    assert(NF != -1);

    /* first pass: make every field a string and total up the lengths */
    tlen = 0;
    for (i = NF; i > 0; i--) {
        tmp = fields_arr[i];
        tmp = force_string(tmp);
        tlen += tmp->stlen;
    }
    /* room for the NF - 1 separators; with NF == 0 this goes negative, hence the clamp */
    tlen += (NF - 1) * OFSlen;
    if ((long) tlen < 0)
        tlen = 0;
    emalloc(ops, char *, tlen + 1, "rebuild_record");
    cops = ops;
    ops[0] = '\0';
    /* second pass: copy each field, followed by OFS except after the last one */
    for (i = 1;  i <= NF; i++) {
        free_wstr(fields_arr[i]);
        tmp = fields_arr[i];
        /* copy field */
        if (tmp->stlen == 1)
            *cops++ = tmp->stptr[0];
        else if (tmp->stlen != 0) {
            memcpy(cops, tmp->stptr, tmp->stlen);
            cops += tmp->stlen;
        }
        /* copy OFS */
        if (i != NF) {
            if (OFSlen == 1)
                *cops++ = *OFS;
            else if (OFSlen != 0) {
                memcpy(cops, OFS, OFSlen);
                cops += OFSlen;
            }
        }
    }
    /* the new $0 node takes over the buffer just built */
    tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);

    /*
     * Since we are about to unref fields_arr[0], we want to find
     * any fields that still point into it, and have them point
     * into the new field zero.  This has to be done intelligently,
     * so that unrefing a field doesn't try to unref into the old $0.
     */
    for (cops = ops, i = 1; i <= NF; i++) {
        NODE *r = fields_arr[i];
        /*
         * There is no reason to copy malloc'ed fields to point into
         * the new $0 buffer, although that's how previous versions did
         * it. It seems faster to leave the malloc'ed fields in place.
         */
        if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
            NODE *n;
            getnode(n);

            /* n starts as a copy of r and becomes the field living in the new $0 */
            *n = *r;
            if (r->valref > 1) {
                /*
                 * This can and does happen.  It seems clear that
                 * we can't leave r's stptr pointing into the
                 * old $0 buffer that we are about to unref.
                 */
                emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
                memcpy(r->stptr, cops, r->stlen);
                r->stptr[r->stlen] = '\0';
                r->flags |= MALLOC;

                n->valref = 1;  // reset in the new field to start it off correctly!
            }

            n->stptr = cops;
            unref(r);
            fields_arr[i] = n;
            assert((n->flags & WSTRCUR) == 0);
        }
        /* step past this field's text and the separator that follows it */
        cops += fields_arr[i]->stlen + OFSlen;
    }

    /* an old $0 that is not malloc'ed must have exactly one reference left */
    assert((fields_arr[0]->flags & MALLOC) == 0
        ? fields_arr[0]->valref == 1
        : true);

    unref(fields_arr[0]);

    fields_arr[0] = tmp;
    field0_valid = true;
}
  249 
  250 /*
  251  * set_record:
  252  * setup $0, but defer parsing rest of line until reference is made to $(>0)
  253  * or to NF.  At that point, parse only as much as necessary.
  254  *
  255  * Manage a private buffer for the contents of $0.  Doing so keeps us safe
  256  * if `getline var' decides to rearrange the contents of the IOBUF that
  257  * $0 might have been pointing into.  The cost is the copying of the buffer;
  258  * but better correct than fast.
  259  */
  260 void
  261 set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
  262 {
  263     NODE *n;
  264     static char *databuf;
  265     static unsigned long databuf_size;
  266 #define INITIAL_SIZE    512
  267 #define MAX_SIZE    ((unsigned long) ~0)    /* maximally portable ... */
  268 
  269     purge_record();
  270 
  271     /* buffer management: */
  272     if (databuf_size == 0) {    /* first time */
  273         ezalloc(databuf, char *, INITIAL_SIZE, "set_record");
  274         databuf_size = INITIAL_SIZE;
  275     }
  276     /*
  277      * Make sure there's enough room. Since we sometimes need
  278      * to place a sentinel at the end, we make sure
  279      * databuf_size is > cnt after allocation.
  280      */
  281     if (cnt >= databuf_size) {
  282         do {
  283             if (databuf_size > MAX_SIZE/2)
  284                 fatal(_("input record too large"));
  285             databuf_size *= 2;
  286         } while (cnt >= databuf_size);
  287         erealloc(databuf, char *, databuf_size, "set_record");
  288         memset(databuf, '\0', databuf_size);
  289     }
  290     /* copy the data */
  291     if (cnt != 0) {
  292         memcpy(databuf, buf, cnt);
  293     }
  294 
  295     /*
  296      * Add terminating '\0' so that C library routines
  297      * will know when to stop.
  298      */
  299     databuf[cnt] = '\0';
  300 
  301     /* manage field 0: */
  302     assert((fields_arr[0]->flags & MALLOC) == 0
  303         ? fields_arr[0]->valref == 1
  304         : true);
  305 
  306     unref(fields_arr[0]);
  307     getnode(n);
  308     n->stptr = databuf;
  309     n->stlen = cnt;
  310     n->valref = 1;
  311     n->type = Node_val;
  312     n->stfmt = STFMT_UNUSED;
  313 #ifdef HAVE_MPFR
  314     n->strndmode = MPFR_round_mode;
  315 #endif
  316     n->flags = (STRING|STRCUR|USER_INPUT);  /* do not set MALLOC */
  317     fields_arr[0] = n;
  318     if (fw != api_fw) {
  319         if ((api_fw = fw) != NULL) {
  320             if (! api_parser_override) {
  321                 api_parser_override = true;
  322                 parse_field = fw_parse_field;
  323                 update_PROCINFO_str("FS", "API");
  324             }
  325         } else if (api_parser_override) {
  326             api_parser_override = false;
  327             parse_field = normal_parse_field;
  328             update_PROCINFO_str("FS", current_field_sep_str());
  329         }
  330     }
  331 
  332 #undef INITIAL_SIZE
  333 #undef MAX_SIZE
  334 }
  335 
  336 /* reset_record --- start over again with current $0 */
  337 
  338 void
  339 reset_record()
  340 {
  341     fields_arr[0] = force_string(fields_arr[0]);
  342     purge_record();
  343     if (api_parser_override) {
  344         api_parser_override = false;
  345         parse_field = normal_parse_field;
  346         update_PROCINFO_str("FS", current_field_sep_str());
  347     }
  348 }
  349 
  350 static void
  351 purge_record()
  352 {
  353     int i;
  354 
  355     NF = -1;
  356     for (i = 1; i <= parse_high_water; i++) {
  357         NODE *n;
  358         NODE *r = fields_arr[i];
  359         if ((r->flags & MALLOC) == 0 && r->valref > 1) {
  360             /* This can and does happen. We must copy the string! */
  361             const char *save = r->stptr;
  362             emalloc(r->stptr, char *, r->stlen + 1, "purge_record");
  363             memcpy(r->stptr, save, r->stlen);
  364             r->stptr[r->stlen] = '\0';
  365             r->flags |= MALLOC;
  366         }
  367         unref(r);
  368         getnode(n);
  369         *n = *Null_field;
  370         fields_arr[i] = n;
  371     }
  372 
  373     parse_high_water = 0;
  374     /*
  375      * $0 = $0 should resplit using the current value of FS.
  376      */
  377     if (resave_fs) {
  378         resave_fs = false;
  379         unref(save_FS);
  380         save_FS = dupnode(FS_node->var_value);
  381     }
  382 
  383     field0_valid = true;
  384 }
  385 
  386 /* set_NF --- handle what happens to $0 and fields when NF is changed */
  387 
  388 void
  389 set_NF()
  390 {
  391     int i;
  392     long nf;
  393     NODE *n;
  394 
  395     assert(NF != -1);
  396 
  397     (void) force_number(NF_node->var_value);
  398     nf = get_number_si(NF_node->var_value);
  399     if (nf < 0)
  400         fatal(_("NF set to negative value"));
  401 
  402     static bool warned = false;
  403     if (do_lint && NF > nf && ! warned) {
  404         warned = true;
  405         lintwarn(_("decrementing NF is not portable to many awk versions"));
  406     }
  407 
  408     NF = nf;
  409 
  410     if (NF > nf_high_water)
  411         grow_fields_arr(NF);
  412     if (parse_high_water < NF) {
  413         for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
  414             unref(fields_arr[i]);
  415             getnode(n);
  416             *n = *Null_field;
  417             fields_arr[i] = n;
  418         }
  419         parse_high_water = NF;
  420     } else if (parse_high_water > 0) {
  421         for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
  422             unref(fields_arr[i]);
  423             getnode(n);
  424             *n = *Null_field;
  425             fields_arr[i] = n;
  426         }
  427         parse_high_water = NF;
  428     }
  429     field0_valid = false;
  430 }
  431 
/*
 * re_parse_field --- parse fields using a regexp.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a regular
 * expression -- either user-defined or because RS=="" and FS==" "
 *
 * Field numbering continues from parse_high_water, or starts at zero
 * when up_to is UNLIMITED.  Each field is handed to (*set)(); when
 * sep_arr is not NULL, each separator goes to set_element().  On return
 * *buf points at the place to resume parsing, and the value returned
 * is the number of the last field that was set.
 */
static long
re_parse_field(long up_to,  /* parse only up to this field number */
    char **buf, /* on input: string to parse; on output: point to start next */
    int len,
    NODE *fs ATTRIBUTE_UNUSED,
    Regexp *rp,
    Setfunc set,    /* routine to set the value of the parsed field */
    NODE *n,
    NODE *sep_arr,  /* array of field separators (maybe NULL) */
    bool in_middle)
{
    char *scan = *buf;
    long nf = parse_high_water;
    char *field;
    char *end = scan + len;
    int regex_flags = RE_NEED_START;
    char *sep;
    size_t mbclen = 0;
    mbstate_t mbs;

    memset(&mbs, 0, sizeof(mbstate_t));

    /* when resuming part way through a record, scan is not at its beginning */
    if (in_middle)
        regex_flags |= RE_NO_BOL;

    if (up_to == UNLIMITED)
        nf = 0;
    if (len == 0)
        return nf;

    bool default_field_splitting = (RS_is_null && default_FS);

    if (default_field_splitting) {
        /* skip leading blanks, tabs and newlines, recording them as a separator */
        sep = scan;
        while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
            scan++;
        if (sep_arr != NULL && sep < scan)
            set_element(nf, sep, (long)(scan - sep), sep_arr);
    }

    if (rp == NULL) /* use FS */
        rp = FS_regexp;

    field = scan;
    /*
     * RESTART() and REEND() are used below as offsets relative to
     * scan for the match found by the research() call in the loop
     * condition.
     */
    while (scan < end
           && research(rp, scan, 0, (end - scan), regex_flags) != -1
           && nf < up_to) {
        regex_flags |= RE_NO_BOL;
        if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
            /* an empty match separates nothing: step over one character and retry */
            if (gawk_mb_cur_max > 1)    {
                mbclen = mbrlen(scan, end-scan, &mbs);
                if ((mbclen == 1) || (mbclen == (size_t) -1)
                    || (mbclen == (size_t) -2) || (mbclen == 0)) {
                    /* We treat it as a singlebyte character.  */
                    mbclen = 1;
                }
                scan += mbclen;
            } else
                scan++;
            if (scan == end) {
                /* ran off the end: what has accumulated is the last field */
                (*set)(++nf, field, (long)(scan - field), n);
                up_to = nf;
                break;
            }
            continue;
        }
        /* the field runs from its start up to where the separator begins */
        (*set)(++nf, field,
               (long)(scan + RESTART(rp, scan) - field), n);
        if (sep_arr != NULL)
                set_element(nf, scan + RESTART(rp, scan),
                    (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
        scan += REEND(rp, scan);
        field = scan;
        if (scan == end && ! default_field_splitting)   /* FS at end of record */
            (*set)(++nf, field, 0L, n);
    }
    /* text left after the last separator forms one final field */
    if (nf != up_to && scan < end) {
        (*set)(++nf, scan, (long)(end - scan), n);
        scan = end;
    }
    *buf = scan;
    return nf;
}
  522 
  523 /*
  524  * def_parse_field --- default field parsing.
  525  *
  526  * This is called both from get_field() and from do_split()
  527  * via (*parse_field)().  This variation is for when FS is a single space
  528  * character.
  529  */
  530 
  531 static long
  532 def_parse_field(long up_to, /* parse only up to this field number */
  533     char **buf, /* on input: string to parse; on output: point to start next */
  534     int len,
  535     NODE *fs,
  536     Regexp *rp ATTRIBUTE_UNUSED,
  537     Setfunc set,    /* routine to set the value of the parsed field */
  538     NODE *n,
  539     NODE *sep_arr,  /* array of field separators (maybe NULL) */
  540     bool in_middle ATTRIBUTE_UNUSED)
  541 {
  542     char *scan = *buf;
  543     long nf = parse_high_water;
  544     char *field;
  545     char *end = scan + len;
  546     char sav;
  547     char *sep;
  548 
  549     if (up_to == UNLIMITED)
  550         nf = 0;
  551     if (len == 0)
  552         return nf;
  553 
  554     /*
  555      * Nasty special case. If FS set to "", return whole record
  556      * as first field. This is not worth a separate function.
  557      */
  558     if (fs->stlen == 0) {
  559         (*set)(++nf, *buf, len, n);
  560         *buf += len;
  561         return nf;
  562     }
  563 
  564     /* before doing anything save the char at *end */
  565     sav = *end;
  566     /* because it will be destroyed now: */
  567 
  568     *end = ' '; /* sentinel character */
  569     sep = scan;
  570     for (; nf < up_to; scan++) {
  571         /*
  572          * special case:  fs is single space, strip leading whitespace
  573          */
  574         while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
  575             scan++;
  576 
  577         if (sep_arr != NULL && scan > sep)
  578             set_element(nf, sep, (long) (scan - sep), sep_arr);
  579 
  580         if (scan >= end)
  581             break;
  582 
  583         field = scan;
  584 
  585         while (*scan != ' ' && *scan != '\t' && *scan != '\n')
  586             scan++;
  587 
  588         (*set)(++nf, field, (long)(scan - field), n);
  589 
  590         if (scan == end)
  591             break;
  592 
  593         sep = scan;
  594     }
  595 
  596     /* everything done, restore original char at *end */
  597     *end = sav;
  598 
  599     *buf = scan;
  600     return nf;
  601 }
  602 
  603 /*
  604  * null_parse_field --- each character is a separate field
  605  *
  606  * This is called both from get_field() and from do_split()
  607  * via (*parse_field)().  This variation is for when FS is the null string.
  608  */
  609 static long
  610 null_parse_field(long up_to,    /* parse only up to this field number */
  611     char **buf, /* on input: string to parse; on output: point to start next */
  612     int len,
  613     NODE *fs ATTRIBUTE_UNUSED,
  614     Regexp *rp ATTRIBUTE_UNUSED,
  615     Setfunc set,    /* routine to set the value of the parsed field */
  616     NODE *n,
  617     NODE *sep_arr,  /* array of field separators (maybe NULL) */
  618     bool in_middle ATTRIBUTE_UNUSED)
  619 {
  620     char *scan = *buf;
  621     long nf = parse_high_water;
  622     char *end = scan + len;
  623 
  624     if (up_to == UNLIMITED)
  625         nf = 0;
  626     if (len == 0)
  627         return nf;
  628 
  629     if (gawk_mb_cur_max > 1) {
  630         mbstate_t mbs;
  631         memset(&mbs, 0, sizeof(mbstate_t));
  632         for (; nf < up_to && scan < end;) {
  633             size_t mbclen = mbrlen(scan, end-scan, &mbs);
  634             if ((mbclen == 1) || (mbclen == (size_t) -1)
  635                 || (mbclen == (size_t) -2) || (mbclen == 0)) {
  636                 /* We treat it as a singlebyte character.  */
  637                 mbclen = 1;
  638             }
  639             if (sep_arr != NULL && nf > 0)
  640                 set_element(nf, scan, 0L, sep_arr);
  641             (*set)(++nf, scan, mbclen, n);
  642             scan += mbclen;
  643         }
  644     } else {
  645         for (; nf < up_to && scan < end; scan++) {
  646             if (sep_arr != NULL && nf > 0)
  647                 set_element(nf, scan, 0L, sep_arr);
  648             (*set)(++nf, scan, 1L, n);
  649         }
  650     }
  651 
  652     *buf = scan;
  653     return nf;
  654 }
  655 
/*
 * sc_parse_field --- single character field separator
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a single character
 * other than space.
 *
 * The byte at *end is temporarily replaced by the separator character so
 * that the inner scanning loops stop without a bounds check; it is
 * restored before returning.  Returns the number of the last field set
 * and leaves *buf at the point where parsing stopped.
 */
static long
sc_parse_field(long up_to,  /* parse only up to this field number */
    char **buf, /* on input: string to parse; on output: point to start next */
    int len,
    NODE *fs,
    Regexp *rp ATTRIBUTE_UNUSED,
    Setfunc set,    /* routine to set the value of the parsed field */
    NODE *n,
    NODE *sep_arr,  /* array of field separators (maybe NULL) */
    bool in_middle ATTRIBUTE_UNUSED)
{
    char *scan = *buf;
    char fschar;
    long nf = parse_high_water;
    char *field;
    char *end = scan + len;
    char sav;
    size_t mbclen = 0;
    mbstate_t mbs;

    memset(&mbs, 0, sizeof(mbstate_t));

    if (up_to == UNLIMITED)
        nf = 0;
    if (len == 0)
        return nf;

    /* with RS null and an empty FS, newline is the separator */
    if (RS_is_null && fs->stlen == 0)
        fschar = '\n';
    else
        fschar = fs->stptr[0];

    /* before doing anything save the char at *end */
    sav = *end;
    /* because it will be destroyed now: */
    *end = fschar;  /* sentinel character */

    for (; nf < up_to;) {
        field = scan;
        if (gawk_mb_cur_max > 1) {
            /* advance a whole character at a time up to the next separator (or sentinel) */
            while (*scan != fschar) {
                mbclen = mbrlen(scan, end-scan, &mbs);
                if ((mbclen == 1) || (mbclen == (size_t) -1)
                    || (mbclen == (size_t) -2) || (mbclen == 0)) {
                    /* We treat it as a singlebyte character.  */
                    mbclen = 1;
                }
                scan += mbclen;
            }
        } else {
            while (*scan != fschar)
                scan++;
        }
        (*set)(++nf, field, (long)(scan - field), n);
        /* stopping on the sentinel means the record is used up */
        if (scan == end)
            break;
        if (sep_arr != NULL)
            set_element(nf, scan, 1L, sep_arr);
        scan++;
        if (scan == end) {  /* FS at end of record */
            /*
             * NOTE(review): the empty trailing field is given the start
             * of the previous field, not scan; the length is zero, so
             * this looks harmless -- confirm no caller relies on the
             * pointer.
             */
            (*set)(++nf, field, 0L, n);
            break;
        }
    }

    /* everything done, restore original char at *end */
    *end = sav;

    *buf = scan;
    return nf;
}
  734 
  735 /*
  736  * calc_mbslen --- calculate the length in bytes of a multi-byte string
  737  * containing len characters.
  738  */
  739 
  740 static size_t
  741 calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
  742 {
  743 
  744     size_t mbclen;
  745     char *mbscan = scan;
  746 
  747     while (len-- > 0 && mbscan < end) {
  748         mbclen = mbrlen(mbscan, end - mbscan, mbs);
  749         if (!(mbclen > 0 && mbclen <= (size_t)(end - mbscan)))
  750             /*
  751              * We treat it as a singlebyte character. This should
  752              * catch error codes 0, (size_t) -1, and (size_t) -2.
  753              */
  754             mbclen = 1;
  755         mbscan += mbclen;
  756     }
  757     return mbscan - scan;
  758 }
  759 
  760 /*
  761  * fw_parse_field --- field parsing using FIELDWIDTHS spec
  762  *
  763  * This is called from get_field() via (*parse_field)().
  764  * This variation is for fields are fixed widths.
  765  */
  766 static long
  767 fw_parse_field(long up_to,  /* parse only up to this field number */
  768     char **buf, /* on input: string to parse; on output: point to start next */
  769     int len,
  770     NODE *fs ATTRIBUTE_UNUSED,
  771     Regexp *rp ATTRIBUTE_UNUSED,
  772     Setfunc set,    /* routine to set the value of the parsed field */
  773     NODE *n,
  774     NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
  775     bool in_middle ATTRIBUTE_UNUSED)
  776 {
  777     char *scan = *buf;
  778     long nf = parse_high_water;
  779     char *end = scan + len;
  780     const awk_fieldwidth_info_t *fw;
  781     mbstate_t mbs;
  782     size_t skiplen;
  783     size_t flen;
  784 
  785     fw = (api_parser_override ? api_fw : FIELDWIDTHS);
  786 
  787     if (up_to == UNLIMITED)
  788         nf = 0;
  789     if (len == 0)
  790         return nf;
  791     if (gawk_mb_cur_max > 1 && fw->use_chars) {
  792         /*
  793          * Reset the shift state. Arguably, the shift state should
  794          * be part of the file state and carried forward at all times,
  795          * but nobody has complained so far, so this may not matter
  796          * in practice.
  797          */
  798         memset(&mbs, 0, sizeof(mbstate_t));
  799         while (nf < up_to && scan < end) {
  800             if (nf >= fw->nf) {
  801                 *buf = end;
  802                 return nf;
  803             }
  804             scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
  805             flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
  806             (*set)(++nf, scan, (long) flen, n);
  807             scan += flen;
  808         }
  809     } else {
  810         while (nf < up_to && scan < end) {
  811             if (nf >= fw->nf) {
  812                 *buf = end;
  813                 return nf;
  814             }
  815             skiplen = fw->fields[nf].skip;
  816             if (skiplen > end - scan)
  817                 skiplen = end - scan;
  818             scan += skiplen;
  819             flen = fw->fields[nf].len;
  820             if (flen > end - scan)
  821                 flen = end - scan;
  822             (*set)(++nf, scan, (long) flen, n);
  823             scan += flen;
  824         }
  825     }
  826     *buf = scan;
  827     return nf;
  828 }
  829 
  830 /* invalidate_field0 --- $0 needs reconstruction */
  831 
  832 void
  833 invalidate_field0()
  834 {
  835     field0_valid = false;
  836 }
  837 
  838 /* get_field --- return a particular $n */
  839 
  840 /* assign is not NULL if this field is on the LHS of an assign */
  841 
  842 NODE **
  843 get_field(long requested, Func_ptr *assign)
  844 {
  845     bool in_middle = false;
  846     static bool warned = false;
  847     extern int currule;
  848     NODE *saved_fs;
  849     Regexp *fs_regexp;
  850 
  851     if (do_lint && currule == END && ! warned) {
  852         warned = true;
  853         lintwarn(_("accessing fields from an END rule may not be portable"));
  854     }
  855 
  856     /*
  857      * if requesting whole line but some other field has been altered,
  858      * then the whole line must be rebuilt
  859      */
  860     if (requested == 0) {
  861         if (! field0_valid) {
  862             /* first, parse remainder of input record */
  863             if (NF == -1) {
  864                 in_middle = (parse_high_water != 0);
  865                 if (current_field_sep() == Using_FPAT) {
  866                     saved_fs = save_FPAT;
  867                     fs_regexp = FPAT_regexp;
  868                 } else {
  869                     saved_fs = save_FS;
  870                     fs_regexp = FS_regexp;
  871                 }
  872                 NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
  873                         fields_arr[0]->stlen -
  874                     (parse_extent - fields_arr[0]->stptr),
  875                     saved_fs, fs_regexp, set_field,
  876                     (NODE *) NULL,
  877                     (NODE *) NULL,
  878                     in_middle);
  879                 parse_high_water = NF;
  880             }
  881             rebuild_record();
  882         }
  883         if (assign != NULL)
  884             *assign = reset_record;
  885         return &fields_arr[0];
  886     }
  887 
  888     /* assert(requested > 0); */
  889 
  890 #if 0
  891     if (assign != NULL)
  892         field0_valid = false;       /* $0 needs reconstruction */
  893 #else
  894     /*
  895      * Keep things uniform. Also, mere intention of assigning something
  896      * to $n should not make $0 invalid. Makes sense to invalidate $0
  897      * after the actual assignment is performed. Not a real issue in
  898      * the interpreter otherwise, but causes problem in the
  899      * debugger when watching or printing fields.
  900      */
  901 
  902     if (assign != NULL)
  903         *assign = invalidate_field0;    /* $0 needs reconstruction */
  904 #endif
  905 
  906     if (requested <= parse_high_water)  /* already parsed this field */
  907         return &fields_arr[requested];
  908 
  909     if (NF == -1) { /* have not yet parsed to end of record */
  910         /*
  911          * parse up to requested fields, calling set_field() for each,
  912          * saving in parse_extent the point where the parse left off
  913          */
  914         if (parse_high_water == 0)  /* starting at the beginning */
  915             parse_extent = fields_arr[0]->stptr;
  916         else
  917             in_middle = true;
  918         parse_high_water = (*parse_field)(requested, &parse_extent,
  919              fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
  920              save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
  921 
  922         /*
  923          * if we reached the end of the record, set NF to the number of
  924          * fields so far.  Note that requested might actually refer to
  925          * a field that is beyond the end of the record, but we won't
  926          * set NF to that value at this point, since this is only a
  927          * reference to the field and NF only gets set if the field
  928          * is assigned to -- this case is handled below
  929          */
  930         if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
  931             NF = parse_high_water;
  932         if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */
  933             requested = parse_high_water;
  934     }
  935     if (parse_high_water < requested) { /* requested beyond end of record */
  936         if (assign != NULL) {   /* expand record */
  937             if (requested > nf_high_water)
  938                 grow_fields_arr(requested);
  939 
  940             NF = requested;
  941             parse_high_water = requested;
  942         } else
  943             return &Null_field;
  944     }
  945 
  946     return &fields_arr[requested];
  947 }
  948 
  949 /* set_element --- set an array element, used by do_split() */
  950 
  951 static void
  952 set_element(long num, char *s, long len, NODE *n)
  953 {
  954     NODE *it;
  955     NODE *sub;
  956 
  957     it = make_string(s, len);
  958     it->flags |= USER_INPUT;
  959     sub = make_number((AWKNUM) (num));
  960     assoc_set(n, sub, it);
  961 }
  962 
  963 /* do_split --- implement split(), semantics are same as for field splitting */
  964 
  965 NODE *
  966 do_split(int nargs)
  967 {
  968     NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
  969     char *s;
  970     long (*parseit)(long, char **, int, NODE *,
  971              Regexp *, Setfunc, NODE *, NODE *, bool);
  972     Regexp *rp = NULL;
  973 
  974     if (nargs == 4) {
  975         static bool warned = false;
  976 
  977         if (do_traditional || do_posix) {
  978             fatal(_("split: fourth argument is a gawk extension"));
  979         }
  980         sep_arr = POP_PARAM();
  981         if (sep_arr->type != Node_var_array)
  982             fatal(_("split: fourth argument is not an array"));
  983         if ((do_lint_extensions || do_lint_old) && ! warned) {
  984             warned = true;
  985             lintwarn(_("split: fourth argument is a gawk extension"));
  986         }
  987     }
  988 
  989     sep = POP();
  990     arr = POP_PARAM();
  991     if (arr->type != Node_var_array)
  992         fatal(_("split: second argument is not an array"));
  993 
  994     if (sep_arr != NULL) {
  995         if (sep_arr == arr)
  996             fatal(_("split: cannot use the same array for second and fourth args"));
  997 
  998         /* This checks need to be done before clearing any of the arrays */
  999         for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 1000             if (tmp == arr)
 1001                 fatal(_("split: cannot use a subarray of second arg for fourth arg"));
 1002         for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 1003             if (tmp == sep_arr)
 1004                 fatal(_("split: cannot use a subarray of fourth arg for second arg"));
 1005         assoc_clear(sep_arr);
 1006     }
 1007     assoc_clear(arr);
 1008 
 1009     src = TOP_STRING();
 1010     if (src->stlen == 0) {
 1011         /*
 1012          * Skip the work if first arg is the null string.
 1013          */
 1014         tmp = POP_SCALAR();
 1015         DEREF(tmp);
 1016         return make_number((AWKNUM) 0);
 1017     }
 1018 
 1019     if ((sep->flags & REGEX) != 0)
 1020         sep = sep->typed_re;
 1021 
 1022     if (   (sep->re_flags & FS_DFLT) != 0
 1023         && current_field_sep() == Using_FS
 1024         && ! RS_is_null) {
 1025         parseit = parse_field;
 1026         fs = force_string(FS_node->var_value);
 1027         rp = FS_regexp;
 1028     } else {
 1029         fs = sep->re_exp;
 1030 
 1031         if (fs->stlen == 0) {
 1032             static bool warned = false;
 1033 
 1034             parseit = null_parse_field;
 1035 
 1036             if (do_lint && ! warned) {
 1037                 warned = true;
 1038                 lintwarn(_("split: null string for third arg is a non-standard extension"));
 1039             }
 1040         } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
 1041             if (fs->stptr[0] == ' ') {
 1042                 parseit = def_parse_field;
 1043             } else
 1044                 parseit = sc_parse_field;
 1045         } else {
 1046             parseit = re_parse_field;
 1047             rp = re_update(sep);
 1048         }
 1049     }
 1050 
 1051     s = src->stptr;
 1052     tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
 1053                          fs, rp, set_element, arr, sep_arr, false));
 1054 
 1055     src = POP_SCALAR(); /* really pop off stack */
 1056     DEREF(src);
 1057     return tmp;
 1058 }
 1059 
 1060 /*
 1061  * do_patsplit --- implement patsplit(), semantics are same as for field
 1062  *         splitting with FPAT.
 1063  */
 1064 
 1065 NODE *
 1066 do_patsplit(int nargs)
 1067 {
 1068     NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
 1069     char *s;
 1070     Regexp *rp = NULL;
 1071 
 1072     if (nargs == 4) {
 1073         sep_arr = POP_PARAM();
 1074         if (sep_arr->type != Node_var_array)
 1075             fatal(_("patsplit: fourth argument is not an array"));
 1076     }
 1077     sep = POP();
 1078     arr = POP_PARAM();
 1079     if (arr->type != Node_var_array)
 1080         fatal(_("patsplit: second argument is not an array"));
 1081 
 1082     src = TOP_STRING();
 1083 
 1084     if ((sep->flags & REGEX) != 0)
 1085         sep = sep->typed_re;
 1086 
 1087     fpat = sep->re_exp;
 1088     if (fpat->stlen == 0)
 1089         fatal(_("patsplit: third argument must be non-null"));
 1090 
 1091     if (sep_arr != NULL) {
 1092         if (sep_arr == arr)
 1093             fatal(_("patsplit: cannot use the same array for second and fourth args"));
 1094 
 1095         /* These checks need to be done before clearing any of the arrays */
 1096         for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 1097             if (tmp == arr)
 1098                 fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
 1099         for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 1100             if (tmp == sep_arr)
 1101                 fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
 1102         assoc_clear(sep_arr);
 1103     }
 1104     assoc_clear(arr);
 1105 
 1106     if (src->stlen == 0) {
 1107         /*
 1108          * Skip the work if first arg is the null string.
 1109          */
 1110         tmp =  make_number((AWKNUM) 0);
 1111     } else {
 1112         rp = re_update(sep);
 1113         s = src->stptr;
 1114         tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
 1115                 (int) src->stlen, fpat, rp,
 1116                 set_element, arr, sep_arr, false));
 1117     }
 1118 
 1119     src = POP_SCALAR(); /* really pop off stack */
 1120     DEREF(src);
 1121     return tmp;
 1122 }
 1123 
 1124 /* set_parser --- update the current (non-API) parser */
 1125 
 1126 static void
 1127 set_parser(parse_field_func_t func)
 1128 {
 1129     normal_parse_field = func;
 1130     if (! api_parser_override && parse_field != func) {
 1131         parse_field = func;
 1132             update_PROCINFO_str("FS", current_field_sep_str());
 1133     }
 1134 }
 1135 
 1136 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
 1137 
 1138 void
 1139 set_FIELDWIDTHS()
 1140 {
 1141     char *scan;
 1142     char *end;
 1143     int i;
 1144     static int fw_alloc = 4;
 1145     static bool warned = false;
 1146     bool fatal_error = false;
 1147     NODE *tmp;
 1148 
 1149     if (do_lint_extensions && ! warned) {
 1150         warned = true;
 1151         lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
 1152     }
 1153     if (do_traditional) /* quick and dirty, does the trick */
 1154         return;
 1155 
 1156     /*
 1157      * If changing the way fields are split, obey least-surprise
 1158      * semantics, and force $0 to be split totally.
 1159      */
 1160     if (fields_arr != NULL)
 1161         (void) get_field(UNLIMITED - 1, 0);
 1162 
 1163     set_parser(fw_parse_field);
 1164     tmp = force_string(FIELDWIDTHS_node->var_value);
 1165     scan = tmp->stptr;
 1166 
 1167     if (FIELDWIDTHS == NULL) {
 1168         emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
 1169         FIELDWIDTHS->use_chars = awk_true;
 1170     }
 1171     FIELDWIDTHS->nf = 0;
 1172     for (i = 0; ; i++) {
 1173         unsigned long int tmp;
 1174         if (i >= fw_alloc) {
 1175             fw_alloc *= 2;
 1176             erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
 1177         }
 1178         /* Ensure that there is no leading `-' sign.  Otherwise,
 1179            strtoul would accept it and return a bogus result.  */
 1180         while (is_blank(*scan)) {
 1181             ++scan;
 1182         }
 1183         if (*scan == '-') {
 1184             fatal_error = true;
 1185             break;
 1186         }
 1187         if (*scan == '\0')
 1188             break;
 1189 
 1190         // Look for skip value. We allow N:M and N:*.
 1191         /*
 1192          * Detect an invalid base-10 integer, a valid value that
 1193          * is followed by something other than a blank or '\0',
 1194          * or a value that is not in the range [1..UINT_MAX].
 1195          */
 1196         errno = 0;
 1197         tmp = strtoul(scan, &end, 10);
 1198         if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) {
 1199             FIELDWIDTHS->fields[i].skip = tmp;
 1200             scan = end + 1;
 1201             if (*scan == '-' || is_blank(*scan)) {
 1202                 fatal_error = true;
 1203                 break;
 1204             }
 1205             // try scanning for field width
 1206             tmp = strtoul(scan, &end, 10);
 1207         }
 1208         else
 1209             FIELDWIDTHS->fields[i].skip = 0;
 1210 
 1211         if (errno != 0
 1212                 || (*end != '\0' && ! is_blank(*end))
 1213                 || !(0 < tmp && tmp <= UINT_MAX)
 1214         ) {
 1215             if (*scan == '*') {
 1216                 for (scan++; is_blank(*scan); scan++)
 1217                     continue;
 1218 
 1219                 if (*scan != '\0')
 1220                     fatal(_("`*' must be the last designator in FIELDWIDTHS"));
 1221 
 1222                 FIELDWIDTHS->fields[i].len = UINT_MAX;
 1223                 FIELDWIDTHS->nf = i+1;
 1224             }
 1225             else
 1226                 fatal_error = true;
 1227             break;
 1228         }
 1229         FIELDWIDTHS->fields[i].len = tmp;
 1230         FIELDWIDTHS->nf = i+1;
 1231         scan = end;
 1232         /* Skip past any trailing blanks.  */
 1233         while (is_blank(*scan)) {
 1234             ++scan;
 1235         }
 1236         if (*scan == '\0')
 1237             break;
 1238     }
 1239 
 1240     if (fatal_error)
 1241         fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"),
 1242                   i + 1, scan);
 1243 }
 1244 
 1245 /* set_FS --- handle things when FS is assigned to */
 1246 
 1247 void
 1248 set_FS()
 1249 {
 1250     char buf[10];
 1251     NODE *fs;
 1252     static NODE *save_fs = NULL;
 1253     static NODE *save_rs = NULL;
 1254     bool remake_re = true;
 1255 
 1256     /*
 1257      * If changing the way fields are split, obey least-surprise
 1258      * semantics, and force $0 to be split totally.
 1259      */
 1260     if (fields_arr != NULL)
 1261         (void) get_field(UNLIMITED - 1, 0);
 1262 
 1263     /* It's possible that only IGNORECASE changed, or FS = FS */
 1264     /*
 1265      * This comparison can't use cmp_nodes(), which pays attention
 1266      * to IGNORECASE, and that's not what we want.
 1267      */
 1268     if (save_fs
 1269         && FS_node->var_value->stlen == save_fs->stlen
 1270         && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
 1271         && save_rs
 1272         && RS_node->var_value->stlen == save_rs->stlen
 1273         && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
 1274         if (FS_regexp != NULL)
 1275             FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
 1276 
 1277         /* FS = FS */
 1278         if (current_field_sep() == Using_FS) {
 1279             return;
 1280         } else {
 1281             remake_re = false;
 1282             goto choose_fs_function;
 1283         }
 1284     }
 1285 
 1286     unref(save_fs);
 1287     save_fs = dupnode(FS_node->var_value);
 1288     unref(save_rs);
 1289     save_rs = dupnode(RS_node->var_value);
 1290     resave_fs = true;
 1291 
 1292     /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
 1293      * FS_regexp will be NULL with a non-null FS_re_yes_case.
 1294      * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
 1295      * Please do not remerge.
 1296      */
 1297     refree(FS_re_yes_case);
 1298     refree(FS_re_no_case);
 1299     FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
 1300 
 1301 
 1302 choose_fs_function:
 1303     buf[0] = '\0';
 1304     default_FS = false;
 1305     fs = force_string(FS_node->var_value);
 1306 
 1307     if (! do_traditional && fs->stlen == 0) {
 1308         static bool warned = false;
 1309 
 1310         set_parser(null_parse_field);
 1311 
 1312         if (do_lint_extensions && ! warned) {
 1313             warned = true;
 1314             lintwarn(_("null string for `FS' is a gawk extension"));
 1315         }
 1316     } else if (fs->stlen > 1 || (fs->flags & REGEX) != 0) {
 1317         if (do_lint_old)
 1318             lintwarn(_("old awk does not support regexps as value of `FS'"));
 1319         set_parser(re_parse_field);
 1320     } else if (RS_is_null) {
 1321         /* we know that fs->stlen <= 1 */
 1322         set_parser(sc_parse_field);
 1323         if (fs->stlen == 1) {
 1324             if (fs->stptr[0] == ' ') {
 1325                 default_FS = true;
 1326                 strcpy(buf, "[ \t\n]+");
 1327             } else if (fs->stptr[0] == '\\') {
 1328                 /* yet another special case */
 1329                 strcpy(buf, "[\\\\\n]");
 1330             } else if (fs->stptr[0] == '\0') {
 1331                 /* and yet another special case */
 1332                 strcpy(buf, "[\\000\n]");
 1333             } else if (fs->stptr[0] != '\n') {
 1334                 sprintf(buf, "[%c\n]", fs->stptr[0]);
 1335             }
 1336         }
 1337     } else {
 1338         set_parser(def_parse_field);
 1339 
 1340         if (fs->stlen == 1) {
 1341             if (fs->stptr[0] == ' ')
 1342                 default_FS = true;
 1343             else if (fs->stptr[0] == '\\')
 1344                 /* same special case */
 1345                 strcpy(buf, "[\\\\]");
 1346             else
 1347                 set_parser(sc_parse_field);
 1348         }
 1349     }
 1350     if (remake_re) {
 1351         refree(FS_re_yes_case);
 1352         refree(FS_re_no_case);
 1353         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
 1354 
 1355         if (buf[0] != '\0') {
 1356             FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
 1357             FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
 1358             FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
 1359             set_parser(re_parse_field);
 1360         } else if (parse_field == re_parse_field) {
 1361             FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
 1362             FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
 1363             FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
 1364         } else
 1365             FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
 1366     }
 1367 
 1368     /*
 1369      * For FS = "c", we don't use IGNORECASE. But we must use
 1370      * re_parse_field to get the character and the newline as
 1371      * field separators.
 1372      */
 1373     if (fs->stlen == 1 && parse_field == re_parse_field)
 1374         FS_regexp = FS_re_yes_case;
 1375 }
 1376 
 1377 /* current_field_sep --- return the field separator type */
 1378 
 1379 field_sep_type
 1380 current_field_sep()
 1381 {
 1382     if (api_parser_override)
 1383         return Using_API;
 1384     else if (parse_field == fw_parse_field)
 1385         return Using_FIELDWIDTHS;
 1386     else if (parse_field == fpat_parse_field)
 1387         return Using_FPAT;
 1388     else
 1389         return Using_FS;
 1390 }
 1391 
 1392 /* current_field_sep_str --- return the field separator type as a string */
 1393 
 1394 const char *
 1395 current_field_sep_str()
 1396 {
 1397     if (api_parser_override)
 1398         return "API";
 1399     else if (parse_field == fw_parse_field)
 1400         return "FIELDWIDTHS";
 1401     else if (parse_field == fpat_parse_field)
 1402         return "FPAT";
 1403     else
 1404         return "FS";
 1405 }
 1406 
 1407 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
 1408 
 1409 void
 1410 update_PROCINFO_str(const char *subscript, const char *str)
 1411 {
 1412     NODE *tmp;
 1413 
 1414     if (PROCINFO_node == NULL)
 1415         return;
 1416     tmp = make_string(subscript, strlen(subscript));
 1417     assoc_set(PROCINFO_node, tmp, make_string(str, strlen(str)));
 1418 }
 1419 
 1420 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
 1421 
 1422 void
 1423 update_PROCINFO_num(const char *subscript, AWKNUM val)
 1424 {
 1425     NODE *tmp;
 1426 
 1427     if (PROCINFO_node == NULL)
 1428         return;
 1429     tmp = make_string(subscript, strlen(subscript));
 1430     assoc_set(PROCINFO_node, tmp, make_number(val));
 1431 }
 1432 
 1433 /* set_FPAT --- handle an assignment to FPAT */
 1434 
 1435 void
 1436 set_FPAT()
 1437 {
 1438     static bool warned = false;
 1439     bool remake_re = true;
 1440     NODE *fpat;
 1441 
 1442     if (do_lint_extensions && ! warned) {
 1443         warned = true;
 1444         lintwarn(_("`FPAT' is a gawk extension"));
 1445     }
 1446     if (do_traditional) /* quick and dirty, does the trick */
 1447         return;
 1448 
 1449     /*
 1450      * If changing the way fields are split, obey least-suprise
 1451      * semantics, and force $0 to be split totally.
 1452      */
 1453     if (fields_arr != NULL)
 1454         (void) get_field(UNLIMITED - 1, 0);
 1455 
 1456     /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
 1457     /*
 1458      * This comparison can't use cmp_nodes(), which pays attention
 1459      * to IGNORECASE, and that's not what we want.
 1460      */
 1461     if (save_FPAT
 1462         && FPAT_node->var_value->stlen == save_FPAT->stlen
 1463         && memcmp(FPAT_node->var_value->stptr, save_FPAT->stptr, save_FPAT->stlen) == 0) {
 1464         if (FPAT_regexp != NULL)
 1465             FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
 1466 
 1467         /* FPAT = FPAT */
 1468         if (current_field_sep() == Using_FPAT) {
 1469             return;
 1470         } else {
 1471             remake_re = false;
 1472             goto set_fpat_function;
 1473         }
 1474     }
 1475 
 1476     unref(save_FPAT);
 1477     save_FPAT = dupnode(FPAT_node->var_value);
 1478     refree(FPAT_re_yes_case);
 1479     refree(FPAT_re_no_case);
 1480     FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
 1481 
 1482 set_fpat_function:
 1483     fpat = force_string(FPAT_node->var_value);
 1484     set_parser(fpat_parse_field);
 1485 
 1486     if (remake_re) {
 1487         refree(FPAT_re_yes_case);
 1488         refree(FPAT_re_no_case);
 1489         FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
 1490 
 1491         FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true);
 1492         FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
 1493         FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
 1494     }
 1495 }
 1496 
 1497 /*
 1498  * increment_scan --- macro to move scan pointer ahead by one character.
 1499  *          Implementation varies if doing MBS or not.
 1500  */
 1501 
 1502 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
 1503 
 1504 /* incr_scan --- MBS version of increment_scan() */
 1505 
 1506 static void
 1507 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
 1508 {
 1509     size_t mbclen = 0;
 1510 
 1511     if (gawk_mb_cur_max > 1) {
 1512         mbclen = mbrlen(*scanp, len, mbs);
 1513         if (   (mbclen == 1)
 1514             || (mbclen == (size_t) -1)
 1515             || (mbclen == (size_t) -2)
 1516             || (mbclen == 0)) {
 1517             /* We treat it as a singlebyte character.  */
 1518             mbclen = 1;
 1519         }
 1520         *scanp += mbclen;
 1521     } else
 1522         (*scanp)++;
 1523 }
 1524 
 1525 /*
 1526  * fpat_parse_field --- parse fields using a regexp.
 1527  *
 1528  * This is called both from get_field() and from do_patsplit()
 1529  * via (*parse_field)().  This variation is for when FPAT is a regular
 1530  * expression -- use the value to find field contents.
 1531  *
 1532  * The FPAT parsing logic is a bit difficult to specify. In particular
 1533  * to allow null fields at certain locations. To make the code as robust
 1534  * as possible, an awk reference implementation was written and tested
 1535  * as a first step, and later recoded in C, preserving its structure as
 1536  * much as possible.
 1537  *
 1538  * # Reference implementation of the FPAT record parsing.
 1539  * #
 1540  * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
 1541  * # Each loop iteration must consume some characters, except for the first field.
 1542  * # So a null field is only valid as a first field or after a non-null separator.
 1543  * # A null record has no fields (not a single null field).
 1544  * 
 1545  * function refpatsplit(string, fields, pattern, seps,
 1546  *         parse_start, sep_start, field_start, field_length, field_found, nf) # locals
 1547  * {
 1548  *     # Local state variables:
 1549  *     # - parse_start: pointer to the first not yet consumed character
 1550  *     # - sep_start: pointer to the beginning of the parsed separator
 1551  *     # - field_start: pointer to the beginning of the parsed field
 1552  *     # - field_length: length of the parsed field
 1553  *     # - field_found: flag for successful field match
 1554  *     # - nf: Number of fields found so far
 1555  *     
 1556  *     # Prepare for parsing
 1557  *     parse_start = 1   # first not yet parsed char
 1558  *     nf = 0            # fields found so far
 1559  *     delete fields
 1560  *     delete seps
 1561  * 
 1562  *     # Loop that consumes the whole record
 1563  *     while (parse_start <= length(string)) {  # still something to parse
 1564  *     
 1565  *         # first attempt to match the next field
 1566  *         sep_start = parse_start
 1567  *         field_found = match(substr(string, parse_start), pattern)
 1568  *         
 1569  *         # check for an invalid null field and retry one character away
 1570  *         if (nf > 0 && field_found && RSTART == 1 && RLENGTH == 0) {
 1571  *             parse_start++
 1572  *             field_found = match(substr(string, parse_start), pattern)
 1573  *         }
 1574  *         
 1575  *         # store the (sep[n-1],field[n]) pair
 1576  *         if (field_found) {
 1577  *             field_start = parse_start + RSTART - 1
 1578  *             field_length = RLENGTH
 1579  *             seps[nf] = substr(string, sep_start, field_start-sep_start)
 1580  *             fields[++nf] = substr(string, field_start, field_length)
 1581  *             parse_start = field_start + field_length
 1582  *             
 1583  *         # store the final extra sep after the last field
 1584  *         } else {
 1585  *             seps[nf] = substr(string, sep_start)
 1586  *             parse_start = length(string) + 1
 1587  *         }
 1588  *     }
 1589  *     
 1590  *     return nf
 1591  * }
 1592  */
 1593 static long
 1594 fpat_parse_field(long up_to,    /* parse only up to this field number */
 1595     char **buf, /* on input: string to parse; on output: point to start next */
 1596     int len,
 1597     NODE *fs ATTRIBUTE_UNUSED,
 1598     Regexp *rp,
 1599     Setfunc set,    /* routine to set the value of the parsed field */
 1600     NODE *n,
 1601     NODE *sep_arr,  /* array of field separators (may be NULL) */
 1602     bool in_middle)
 1603 {
 1604     char *scan = *buf;
 1605     long nf = parse_high_water;
 1606     char *start;
 1607     char *end = scan + len;
 1608     int regex_flags = RE_NEED_START;
 1609     mbstate_t mbs;
 1610     char* field_start;
 1611     bool field_found = false;
 1612 
 1613     memset(&mbs, 0, sizeof(mbstate_t));
 1614 
 1615     if (up_to == UNLIMITED)
 1616         nf = 0;
 1617 
 1618     if (len == 0)
 1619         return nf;
 1620 
 1621     if (rp == NULL) /* use FPAT */
 1622         rp = FPAT_regexp;
 1623 
 1624     while (scan < end && nf < up_to) {  /* still something to parse */
 1625 
 1626         /* first attempt to match the next field */
 1627         start = scan;
 1628         field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
 1629 
 1630         /* check for an invalid null field and retry one character away */ 
 1631         if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
 1632             increment_scan(& scan, end - scan);
 1633             field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
 1634         }
 1635 
 1636         /* store the (sep[n-1],field[n]) pair */
 1637         if (field_found) {
 1638             field_start = scan + RESTART(rp, scan);
 1639             if (sep_arr != NULL) { /* store the separator */
 1640                 if (field_start == start) /* match at front */
 1641                     set_element(nf, start, 0L, sep_arr);
 1642                 else
 1643                     set_element(nf,
 1644                         start,
 1645                         (long) (field_start - start),
 1646                         sep_arr);
 1647             }
 1648             /* field is text that matched */
 1649             (*set)(++nf,
 1650                 field_start,
 1651                 (long)(REEND(rp, scan) - RESTART(rp, scan)),
 1652                 n);
 1653             scan += REEND(rp, scan);
 1654 
 1655         } else {
 1656             /*
 1657              * No match, store the final extra separator after
 1658              * the last field.
 1659              */
 1660             if (sep_arr != NULL)
 1661                 set_element(nf, start, (long) (end - start), sep_arr);
 1662             scan = end;
 1663         }
 1664     }
 1665 
 1666     /*
 1667      * If the last field extends up to the end of the record, generate
 1668      * a null trailing separator
 1669      */
 1670     if (sep_arr != NULL && scan == end && field_found) 
 1671         set_element(nf, scan, 0L, sep_arr);
 1672 
 1673     *buf = scan;
 1674     return nf;
 1675 }