"Fossies" - the Fresh Open Source Software Archive

Member "gretl-2020e/lib/src/csvdata.c" (1 Nov 2020, 164066 Bytes) of package /linux/misc/gretl-2020e.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "csvdata.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2020d_vs_2020e.

    1 /*
    2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
    3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
    4  *
    5  *  This program is free software: you can redistribute it and/or modify
    6  *  it under the terms of the GNU General Public License as published by
    7  *  the Free Software Foundation, either version 3 of the License, or
    8  *  (at your option) any later version.
    9  *
   10  *  This program is distributed in the hope that it will be useful,
   11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13  *  GNU General Public License for more details.
   14  *
   15  *  You should have received a copy of the GNU General Public License
   16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
   17  *
   18  */
   19 
   20 #include "libgretl.h"
   21 #include "gretl_string_table.h"
   22 #include "libset.h"
   23 #include "usermat.h"
   24 #include "uservar.h"
   25 #include "genparse.h"
   26 #include "gretl_xml.h"
   27 #include "gretl_midas.h"
   28 #include "matrix_extra.h"
   29 #include "gretl_www.h"
   30 #include "csvdata.h"
   31 
   32 #ifdef WIN32
   33 # include "gretl_win32.h"
   34 #endif
   35 
   36 #include <errno.h>
   37 
   38 #define CDEBUG 0    /* CSV reading in general */
   39 #define AGGDEBUG 0  /* aggregation in "join" */
   40 #define TDEBUG 0    /* handling of time keys in "join" */
   41 
   42 #define CSVSTRLEN 128
   43 
   44 enum {
   45     CSV_HAVEDATA = 1 << 0,
   46     CSV_GOTDELIM = 1 << 1,
   47     CSV_GOTTAB   = 1 << 2,
   48     CSV_GOTSEMI  = 1 << 3,
   49     CSV_BLANK1   = 1 << 4,
   50     CSV_OBS1     = 1 << 5,
   51     CSV_TRAIL    = 1 << 6,
   52     CSV_AUTONAME = 1 << 7,
   53     CSV_REVERSED = 1 << 8,
   54     CSV_DOTSUB   = 1 << 9,
   55     CSV_ALLCOLS  = 1 << 10,
   56     CSV_BOM      = 1 << 11,
   57     CSV_VERBOSE  = 1 << 12,
   58     CSV_THOUSEP  = 1 << 13,
   59     CSV_NOHEADER = 1 << 14,
   60     CSV_QUOTES   = 1 << 15,
   61     CSV_AS_MAT   = 1 << 16
   62 };
   63 
   64 enum {
   65     JOIN_KEY,
   66     JOIN_F1,
   67     JOIN_F2,
   68     JOIN_F3,
   69     JOIN_KEY2,
   70     JOIN_AUX,
   71     JOIN_TARG
   72 };
   73 
   74 typedef struct joinspec_ joinspec;
   75 typedef struct csvprobe_ csvprobe;
   76 typedef struct csvdata_ csvdata;
   77 
   78 struct joinspec_ {
   79     int ncols;
   80     const char **colnames;
   81     const char *mdsbase;
   82     int *colnums;
   83     int *timecols;
   84     csvdata *c;
   85     DATASET *dset;
   86     int wildcard;
   87     int auto_midas;
   88     int midas_pd;
   89     char **wildnames;
   90     char **mdsnames;
   91     char **tmpnames;
   92     int n_tmp;
   93 };
   94 
   95 struct csvprobe_ {
   96     DATASET *dset; /* more info might be wanted */
   97 };
   98 
   99 struct csvdata_ {
  100     int flags;
  101     char delim;
  102     char decpoint;
  103     char thousep;
  104     char qchar;
  105     int markerpd;
  106     int maxlinelen;
  107     int real_n;
  108     char *line;
  109     DATASET *dset;
  110     int ncols, nrows;
  111     long datapos;
  112     char str[CSVSTRLEN];
  113     char skipstr[8];
  114     int *codelist;
  115     char *descrip;
  116     const char *user_na;
  117     gretl_string_table *st;
  118     int *cols_list;
  119     int *width_list;
  120     const gretl_matrix *rowmask;
  121     int masklen;
  122     joinspec *jspec; /* info used for "join" command */
  123     csvprobe *probe; /* used in connection with "join" */
  124 };
  125 
  126 #define csv_has_trailing_comma(c) (c->flags & CSV_TRAIL)
  127 #define csv_has_obs_column(c)     (c->flags & CSV_OBS1)
  128 #define csv_has_blank_column(c)   (c->flags & CSV_BLANK1)
  129 #define csv_got_tab(c)            (c->flags & CSV_GOTTAB)
  130 #define csv_got_semi(c)           (c->flags & CSV_GOTSEMI)
  131 #define csv_got_delim(c)          (c->flags & CSV_GOTDELIM)
  132 #define csv_autoname(c)           (c->flags & CSV_AUTONAME)
  133 #define csv_skip_col_1(c)         (c->flags & (CSV_OBS1 | CSV_BLANK1))
  134 #define csv_have_data(c)          (c->flags & CSV_HAVEDATA)
  135 #define csv_data_reversed(c)      (c->flags & CSV_REVERSED)
  136 #define csv_do_dotsub(c)          (c->flags & CSV_DOTSUB)
  137 #define csv_all_cols(c)           (c->flags & CSV_ALLCOLS)
  138 #define csv_has_bom(c)            (c->flags & CSV_BOM)
  139 #define csv_is_verbose(c)         (c->flags & CSV_VERBOSE)
  140 #define csv_scrub_thousep(c)      (c->flags & CSV_THOUSEP)
  141 #define csv_no_header(c)          (c->flags & CSV_NOHEADER)
  142 #define csv_keep_quotes(c)        (c->flags & CSV_QUOTES)
  143 #define csv_as_matrix(c)          (c->flags & CSV_AS_MAT)
  144 
  145 #define csv_set_trailing_comma(c)   (c->flags |= CSV_TRAIL)
  146 #define csv_unset_trailing_comma(c) (c->flags &= ~CSV_TRAIL)
  147 #define csv_set_obs_column(c)       (c->flags |= CSV_OBS1)
  148 #define csv_set_blank_column(c)     (c->flags |= CSV_BLANK1)
  149 #define csv_set_got_tab(c)          (c->flags |= CSV_GOTTAB)
  150 #define csv_set_got_semi(c)         (c->flags |= CSV_GOTSEMI)
  151 #define csv_set_got_delim(c)        (c->flags |= CSV_GOTDELIM)
  152 #define csv_set_autoname(c)         (c->flags |= CSV_AUTONAME)
  153 #define csv_set_data_reversed(c)    (c->flags |= CSV_REVERSED)
  154 #define csv_set_dotsub(c)           (c->flags |= CSV_DOTSUB)
  155 #define csv_set_all_cols(c)         (c->flags |= CSV_ALLCOLS)
  156 #define csv_set_has_bom(c)          (c->flags |= CSV_BOM)
  157 #define csv_set_verbose(c)          (c->flags |= CSV_VERBOSE)
  158 #define csv_set_scrub_thousep(c)    (c->flags |= CSV_THOUSEP)
  159 #define csv_set_no_header(c)        (c->flags |= CSV_NOHEADER)
  160 #define csv_unset_keep_quotes(c)    (c->flags &= ~CSV_QUOTES)
  161 #define csv_set_as_matrix(c)        (c->flags |= CSV_AS_MAT)
  162 
  163 #define csv_skip_bad(c)        (*c->skipstr != '\0')
  164 #define csv_has_non_numeric(c) (c->st != NULL)
  165 
  166 #define fixed_format(c) (c->cols_list != NULL && c->width_list != NULL)
  167 #define cols_subset(c) (c->cols_list != NULL && c->width_list == NULL)
  168 #define rows_subset(c) (c->rowmask != NULL)
  169 
  170 #define joining(c) (c->jspec != NULL)
  171 #define probing(c) (c->probe != NULL)
  172 
  173 #define is_wildstr(s) (strchr(s, '*') || strchr(s, '?'))
  174 
  175 static int
  176 time_series_label_check (DATASET *dset, int reversed, char *skipstr,
  177              int convert_pd, PRN *prn);
  178 
  179 static int format_uses_quarterly (char *fmt);
  180 
  181 /* file-scope global */
  182 static char import_na[8];
  183 
  184 struct time_mapper {
  185     int ncols;         /* number of "timeconv" columns */
  186     char **colnames;   /* array of outer-dataset column names */
  187     char *tname;       /* the name of the "tkey", if among colnames, or NULL */
  188     char **fmt;        /* array of up to two time-format strings, or NULL */
  189     char m_means_q[2]; /* array of "monthly means quarterly" flags */
  190 };
  191 
  192 /* file-scope global */
  193 struct time_mapper tconv_map;
  194 
  195 enum {
  196     TCONV_FMT = 0,
  197     TKEY_FMT = 1
  198 };
  199 
  200 #define no_formats(map) (map.fmt == NULL)
  201 #define no_tkey_format(map) (map.tname == NULL)
  202 #define has_tconv_format(map) (map.fmt[TCONV_FMT] != NULL)
  203 #define is_tkey_variable(name, map) (strcmp(name, map.tname) == 0)
  204 
  205 static void timeconv_map_set (int ncols, char **colnames,
  206                   char *tname, char **fmt)
  207 {
  208     tconv_map.ncols = ncols;
  209     tconv_map.colnames = colnames;
  210     tconv_map.tname = tname;
  211     tconv_map.fmt = fmt;
  212 
  213     if (fmt != NULL) {
  214     if (fmt[TCONV_FMT] != NULL) {
  215         tconv_map.m_means_q[TCONV_FMT] =
  216         format_uses_quarterly(fmt[TCONV_FMT]);
  217     }
  218     if (fmt[TKEY_FMT] != NULL) {
  219         tconv_map.m_means_q[TKEY_FMT] =
  220         format_uses_quarterly(fmt[TKEY_FMT]);
  221     }
  222     }
  223 }
  224 
  225 static void timeconv_map_init (void)
  226 {
  227     timeconv_map_set(0, NULL, NULL, NULL);
  228 }
  229 
  230 static void timeconv_map_destroy (void)
  231 {
  232     if (tconv_map.colnames != NULL) {
  233     strings_array_free(tconv_map.colnames, tconv_map.ncols);
  234     }
  235     if (tconv_map.fmt != NULL) {
  236     strings_array_free(tconv_map.fmt, 2);
  237     }
  238     timeconv_map_init();
  239 }
  240 
  241 static int timecol_get_format (const DATASET *dset, int v,
  242                    char **pfmt, int *q)
  243 {
  244     if (no_formats(tconv_map)) {
  245     return 0;
  246     } else if (no_tkey_format(tconv_map)) {
  247     /* get the common "tconvert" format */
  248     *pfmt = tconv_map.fmt[TCONV_FMT];
  249     *q = tconv_map.m_means_q[TCONV_FMT];
  250     return 1;
  251     } else if (is_tkey_variable(dset->varname[v], tconv_map)) {
  252     /* get the tkey-specific format */
  253     *pfmt = tconv_map.fmt[TKEY_FMT];
  254     *q = tconv_map.m_means_q[TKEY_FMT];
  255     return 1;
  256     } else if (has_tconv_format(tconv_map)) {
  257     /* get the other one */
  258     *pfmt = tconv_map.fmt[TCONV_FMT];
  259     *q = tconv_map.m_means_q[TCONV_FMT];
  260     return 1;
  261     }
  262 
  263     return 0;
  264 }
  265 
  266 static int column_is_timecol (const char *colname)
  267 {
  268     int i, n = tconv_map.ncols;
  269 
  270     for (i=0; i<n; i++) {
  271     if (!strcmp(colname, tconv_map.colnames[i])) {
  272         return 1;
  273     }
  274     }
  275 
  276     return 0;
  277 }
  278 
  279 static void csvdata_free (csvdata *c)
  280 {
  281     if (c == NULL) {
  282     return;
  283     }
  284 
  285     if (c->descrip != NULL) {
  286     free(c->descrip);
  287     }
  288 
  289     if (c->st != NULL) {
  290     gretl_string_table_destroy(c->st);
  291     }
  292 
  293     if (c->codelist != NULL) {
  294     free(c->codelist);
  295     }
  296 
  297     if (c->line != NULL) {
  298     free(c->line);
  299     }
  300 
  301     if (c->cols_list != NULL) {
  302     free(c->cols_list);
  303     free(c->width_list);
  304     }
  305 
  306     destroy_dataset(c->dset);
  307 
  308     free(c);
  309 }
  310 
  311 static csvdata *csvdata_new (DATASET *dset)
  312 {
  313     csvdata *c = malloc(sizeof *c);
  314 
  315     if (c == NULL) {
  316     return NULL;
  317     }
  318 
  319     c->flags = CSV_QUOTES;
  320     c->delim = '\t';
  321     c->thousep = 0;
  322     c->qchar = 0;
  323     c->markerpd = -1;
  324     c->maxlinelen = 0;
  325     c->real_n = 0;
  326     c->line = NULL;
  327     c->dset = NULL;
  328     c->ncols = 0;
  329     c->nrows = 0;
  330     c->datapos = 0;
  331     *c->str = '\0';
  332     *c->skipstr = '\0';
  333     c->codelist = NULL;
  334     c->descrip = NULL;
  335     c->user_na = NULL;
  336     c->st = NULL;
  337     c->cols_list = NULL;
  338     c->width_list = NULL;
  339     c->rowmask = NULL;
  340     c->masklen = 0;
  341 
  342     if (strcmp(import_na, "default")) {
  343     c->user_na = import_na;
  344     }
  345 
  346     c->jspec = NULL;
  347     c->probe = NULL;
  348 
  349     c->dset = datainfo_new();
  350 
  351     if (c->dset == NULL) {
  352     free(c);
  353     c = NULL;
  354     } else {
  355     c->delim = get_data_export_delimiter();
  356     c->decpoint = get_data_export_decpoint();
  357     if (dset != NULL && dset->Z != NULL) {
  358         c->flags |= CSV_HAVEDATA;
  359     }
  360 #if CDEBUG
  361     fprintf(stderr, "csvdata_new: c->delim = '%c', c->decpoint = '%c'\n",
  362         c->delim, c->decpoint);
  363 #endif
  364     }
  365 
  366     return c;
  367 }
  368 
  369 static int *cols_list_from_matrix (const char *s, int *err)
  370 {
  371     gretl_matrix *m = get_matrix_by_name(s);
  372     int i, n = gretl_vector_get_length(m);
  373     int *list = NULL;
  374 
  375     if (n == 0) {
  376     *err = E_DATA;
  377     } else {
  378     list = gretl_list_new(n);
  379     if (list == NULL) {
  380         *err = E_ALLOC;
  381     } else {
  382         for (i=0; i<n; i++) {
  383         list[i+1] = gretl_vector_get(m, i);
  384         }
  385     }
  386     }
  387 
  388     return list;
  389 }
  390 
  391 /* The interpretation of the "cols" specification depends on
  392    @opt: if this includes OPT_L then it should provide a 1-based
  393    list of columns to be read; but if @opt includes OPT_F it
  394    should provide a fixed-format spec, consisting of pairs
  395    (start column, width).
  396 */
  397 
  398 static int csvdata_add_cols_list (csvdata *c, const char *s,
  399                   gretlopt opt)
  400 {
  401     int delimited = (opt & OPT_L);
  402     int *list, *clist = NULL, *wlist = NULL;
  403     int i, n, m = 0;
  404     int err = 0;
  405 
  406     if (get_matrix_by_name(s)) {
  407     list = cols_list_from_matrix(s, &err);
  408     } else {
  409     list = gretl_list_from_string(s, &err);
  410     }
  411 
  412     if (!err) {
  413     n = list[0];
  414     if (n == 0) {
  415         err = E_DATA;
  416     } else if (delimited) {
  417         m = n;
  418         clist = list;
  419     } else {
  420         /* fixed format: we need two lists */
  421         if (n % 2 != 0) {
  422         err = E_DATA;
  423         } else {
  424         m = n / 2;
  425         clist = gretl_list_new(m);
  426         wlist = gretl_list_new(m);
  427         if (clist == NULL || wlist == NULL) {
  428             err = E_ALLOC;
  429         } else {
  430             int j = 1;
  431 
  432             for (i=1; i<=n; i+=2, j++) {
  433             clist[j] = list[i];
  434             wlist[j] = list[i+1];
  435             }
  436         }
  437         }
  438     }
  439     }
  440 
  441     /* clist = column (start) list: must be a set of increasing
  442        positive integers; and wlist = respective column widths,
  443        must all be positive, if present
  444     */
  445 
  446     for (i=1; i<=m && !err; i++) {
  447     if (clist[i] <= 0 || (i > 1 && clist[i] <= clist[i-1])) {
  448         err = E_DATA;
  449     } else if (wlist != NULL && wlist[i] <= 0) {
  450         err = E_DATA;
  451     } else if (wlist != NULL && wlist[i] >= CSVSTRLEN) {
  452         fprintf(stderr, "Warning: field %d too wide (%d), truncating\n",
  453             i, wlist[i]);
  454         wlist[i] = CSVSTRLEN - 1;
  455     }
  456     }
  457 
  458     if (list != clist) {
  459     free(list);
  460     }
  461 
  462     if (!err) {
  463     c->cols_list = clist;
  464     c->width_list = wlist;
  465     } else {
  466     free(clist);
  467     free(wlist);
  468     if (err == E_DATA) {
  469         gretl_errmsg_set(_("Invalid column specification"));
  470     }
  471     }
  472 
  473     return err;
  474 }
  475 
  476 static int csvdata_add_row_mask (csvdata *c, const char *s)
  477 {
  478     int err = 0;
  479 
  480     c->rowmask = get_matrix_by_name(s);
  481     if (c->rowmask == NULL) {
  482     gretl_errmsg_sprintf(_("'%s': no such matrix"), s);
  483     err = E_DATA;
  484     } else {
  485     c->masklen = gretl_vector_get_length(c->rowmask);
  486     if (c->masklen == 0) {
  487         err = E_NONCONF;
  488     }
  489     }
  490 
  491     return err;
  492 }
  493 
  494 static int n_from_row_mask (csvdata *c)
  495 {
  496     int i, n = 0;
  497 
  498     for (i=0; i<c->masklen && i<=c->nrows; i++) {
  499     if (gretl_vector_get(c->rowmask, i) != 0) {
  500         n++;
  501     }
  502     }
  503 
  504     return n;
  505 }
  506 
  507 static int add_obs_marker (DATASET *dset, int n)
  508 {
  509     char **S = realloc(dset->S, n * sizeof *S);
  510     int err = 0;
  511 
  512     if (S == NULL) {
  513     err = E_ALLOC;
  514     } else {
  515     dset->S = S;
  516     dset->S[n-1] = malloc(OBSLEN);
  517     if (dset->S[n-1] == NULL) {
  518         err = E_ALLOC;
  519     } else {
  520         strcpy(dset->S[n-1], "NA");
  521     }
  522     }
  523 
  524     return err;
  525 }
  526 
  527 static int add_single_obs (DATASET *dset)
  528 {
  529     double *x;
  530     int i, err = 0;
  531 
  532     for (i=0; i<dset->v && !err; i++) {
  533     x = realloc(dset->Z[i], (dset->n + 1) * sizeof *x);
  534     if (x != NULL) {
  535         dset->Z[i] = x;
  536     } else {
  537         err = E_ALLOC;
  538     }
  539     }
  540 
  541     if (!err) {
  542     dset->n += 1;
  543     dset->Z[0][dset->n - 1] = 1.0;
  544     for (i=1; i<dset->v; i++) {
  545         dset->Z[i][dset->n - 1] = NADBL;
  546     }
  547     if (dset->S != NULL) {
  548         err = add_obs_marker(dset, dset->n);
  549     }
  550     }
  551 
  552     return err;
  553 }
  554 
  555 static int pad_weekly_data (DATASET *dset, int add)
  556 {
  557     int oldn = dset->n;
  558     int ttarg, offset = 0, skip = 0;
  559     int i, s, t, tc, err;
  560 
  561     err = dataset_add_observations(dset, add, OPT_A);
  562 
  563     if (!err) {
  564     for (t=0; t<oldn; t++) {
  565         tc = calendar_obs_number(dset->S[t], dset) - offset;
  566         if (tc != t) {
  567         skip = tc - t;
  568         fprintf(stderr, "Gap of size %d at original t = %d\n", skip, t);
  569         offset += skip;
  570         ttarg = oldn - 1 + offset;
  571         for (s=0; s<oldn-t+skip; s++) {
  572             for (i=1; i<dset->v; i++) {
  573             if (s < oldn - t) {
  574                 if (s == 0 || s == oldn-t-1) {
  575                 fprintf(stderr, "shifting obs %d to obs %d\n",
  576                     ttarg-skip, ttarg);
  577                 }
  578                 dset->Z[i][ttarg] = dset->Z[i][ttarg - skip];
  579             } else {
  580                 fprintf(stderr, "inserting NA at obs %d\n", ttarg);
  581                 dset->Z[i][ttarg] = NADBL;
  582             }
  583             }
  584             ttarg--;
  585         }
  586         }
  587     }
  588     }
  589 
  590     return err;
  591 }
  592 
  593 /* FIXME the following needs to be made more flexible? */
  594 
  595 static int csv_weekly_data (DATASET *dset)
  596 {
  597     char *lbl2 = dset->S[dset->n - 1];
  598     int ret = 1;
  599     int misscount = 0;
  600     int t, tc;
  601 
  602     for (t=0; t<dset->n; t++) {
  603     tc = calendar_obs_number(dset->S[t], dset) - misscount;
  604     if (tc != t) {
  605         misscount += tc - t;
  606     }
  607     }
  608 
  609     if (misscount > 0) {
  610     double missfrac = (double) misscount / dset->n;
  611 
  612     fprintf(stderr, "nobs = %d, misscount = %d (%.2f%%)\n",
  613         dset->n, misscount, 100.0 * missfrac);
  614     if (missfrac > 0.05) {
  615         ret = 0;
  616     } else {
  617         int Tc = calendar_obs_number(lbl2, dset) + 1;
  618         int altmiss = Tc - dset->n;
  619 
  620         fprintf(stderr, "check: Tc = %d, missing = %d\n", Tc, altmiss);
  621         if (altmiss != misscount) {
  622         ret = 0;
  623         } else if (dset->Z != NULL) {
  624         int err;
  625 
  626         fprintf(stderr, "OK, consistent\n");
  627         err = pad_weekly_data(dset, misscount);
  628         if (err) ret = 0;
  629         }
  630     }
  631     }
  632 
  633     return ret;
  634 }
  635 
  636 #define DAY_DEBUG 1
  637 
  638 static int check_daily_dates (DATASET *dset, int *pd,
  639                   int *reversed, PRN *prn)
  640 {
  641     int T = dset->n;
  642     char *lbl1 = dset->S[0];
  643     char *lbl2 = dset->S[T - 1];
  644     int fulln = 0, n, t, nbak;
  645     int alt_pd = 0;
  646     int oldpd = dset->pd;
  647     double oldsd0 = dset->sd0;
  648     guint32 ed1, ed2;
  649     int nmiss = 0, err = 0;
  650 
  651     *pd = 0;
  652 
  653     ed1 = get_epoch_day(lbl1);
  654     ed2 = get_epoch_day(lbl2);
  655     if (ed1 <= 0 || ed2 <= 0) {
  656     err = 1;
  657     }
  658 
  659 #if DAY_DEBUG
  660     fprintf(stderr, "check_daily_dates: '%s' -> %d, '%s' -> %d\n",
  661         lbl1, (int) ed1, lbl2, (int) ed2);
  662 #endif
  663 
  664     dset->pd = guess_daily_pd(dset);
  665     dset->structure = TIME_SERIES;
  666 
  667 #if DAY_DEBUG
  668     fprintf(stderr, "guessed at daily pd = %d\n", dset->pd);
  669 #endif
  670 
  671     if (!err) {
  672     if (ed2 < ed1) {
  673 #if DAY_DEBUG
  674         fprintf(stderr, "check_daily_dates: data are reversed?\n");
  675 #endif
  676         dset->sd0 = ed2;
  677         *reversed = 1;
  678     } else {
  679         dset->sd0 = ed1;
  680     }
  681     }
  682 
  683  recompute:
  684 
  685     alt_pd = 0;
  686     nbak = 0;
  687 
  688     if (!err) {
  689     guint32 n1 = (*reversed)? ed2 : ed1;
  690     guint32 n2 = (*reversed)? ed1 : ed2;
  691 
  692     fulln = n2 - n1 + 1;
  693 
  694     if (T > fulln) {
  695         err = 1;
  696     } else {
  697         nmiss = fulln - T;
  698         pprintf(prn, A_("Observations: %d; days in sample: %d\n"),
  699             T, fulln);
  700         if (nmiss > 300 * T) {
  701         pprintf(prn, A_("Probably annual data\n"));
  702         *pd = 1;
  703         } else if (nmiss > 50 * T) {
  704         pprintf(prn, A_("Probably quarterly data\n"));
  705         *pd = 4;
  706         } else if (nmiss > 20 * T) {
  707         pprintf(prn, A_("Probably monthly data\n"));
  708         *pd = 12;
  709         } else if (nmiss > 3 * T) {
  710         pprintf(prn, A_("Probably weekly data\n"));
  711         *pd = dset->pd = 52;
  712         } else {
  713         pprintf(prn, A_("Missing daily rows: %d\n"), nmiss);
  714         }
  715     }
  716     }
  717 
  718     nbak = 0;
  719 
  720     for (t=0; t<dset->n && !err; t++) {
  721     int wd, s = (*reversed)? (dset->n - 1 - t) : t;
  722 
  723     wd = weekday_from_date(dset->S[s]);
  724 
  725     if (dset->pd == 5 && (wd == 6 || wd == 0)) {
  726         /* Got Sat or Sun, can't be 5-day daily? */
  727         alt_pd = (wd == 6)? 6 : 7;
  728         pprintf(prn, "Found a Saturday (%s): re-trying with pd = %d\n",
  729             dset->S[s], alt_pd);
  730         break;
  731     } else if (dset->pd == 6 && wd == 0) {
  732         /* Got Sun, can't be 6-day daily? */
  733         alt_pd = 7;
  734         pprintf(prn, "Found a Sunday (%s): re-trying with pd = %d\n",
  735             dset->S[s], alt_pd);
  736         break;
  737     }
  738 
  739     n = calendar_obs_number(dset->S[s], dset);
  740     if (n < t) {
  741         pprintf(prn, "Daily dates error at t = %d:\n"
  742             "  calendar_obs_number() for '%s' = %d but t = %d\n",
  743             t, dset->S[s], n, t);
  744         err = 1;
  745     } else if (n > fulln - 1) {
  746         pprintf(prn, "Error: date '%s' out of bounds\n", dset->S[s]);
  747         err = 1;
  748     } else if (nbak > 0 && n == nbak) {
  749         pprintf(prn, "Error: date '%s' is repeated\n", dset->S[s]);
  750         err = 1;
  751     }
  752     nbak = n;
  753     }
  754 
  755     if (alt_pd > 0) {
  756     dset->pd = alt_pd;
  757     goto recompute;
  758     }
  759 
  760     if (err) {
  761     dset->pd = oldpd;
  762     dset->sd0 = oldsd0;
  763     dset->structure = CROSS_SECTION;
  764     } else {
  765     strcpy(dset->stobs, (*reversed)? lbl2 : lbl1);
  766     strcpy(dset->endobs, (*reversed)? lbl1 : lbl2);
  767     dset->t2 = dset->n - 1;
  768     if (nmiss > 0 && *pd == 0) {
  769         dset->markers = DAILY_DATE_STRINGS;
  770     }
  771     }
  772 
  773 #if DAY_DEBUG
  774     fprintf(stderr, "check_daily_dates: daily pd = %d, reversed = %d, err = %d\n",
  775         dset->pd, *reversed, err);
  776 #endif
  777 
  778     return (err)? -1 : dset->pd;
  779 }
  780 
  781 /* convert from daily date label to a lower frequency --
  782    annual, monthly or quarterly -- if @pd indicates this
  783    is required
  784 */
  785 
  786 static void convert_daily_label (char *targ, const char *src,
  787                  int pd)
  788 {
  789     int y, m, d;
  790 
  791     sscanf(src, YMD_READ_FMT, &y, &m, &d);
  792 
  793     if (pd == 1) {
  794     sprintf(targ, "%d", y);
  795     } else if (pd == 12) {
  796     sprintf(targ, "%d:%02d", y, m);
  797     } else if (pd == 4) {
  798     sprintf(targ, "%d:%d", y, m / 3 + (m % 3 != 0));
  799     }
  800 }
  801 
  802 /* There's a special case (ugh!) where observation strings are
  803    given as in monthly data, but the frequency is in fact
  804    quarterly, as in:
  805 
  806    1947.06
  807    1947.09
  808    1947.12
  809    1948.03
  810 
  811    we'll make a brave attempt to handle this.
  812 */
  813 
  814 #define fakequarter(m) (m==3 || m==6 || m==9 || m==12)
  815 
  816 static int consistent_qm_labels (DATASET *dset, int reversed,
  817                  int convert_pd, char *skipstr,
  818                  int *ppd, const char *fmt,
  819                  int *extra_zero, PRN *prn)
  820 {
  821     char bad[16], skip[8];
  822     char label[OBSLEN];
  823     int Ey; /* expected year */
  824     int Ep; /* expected sub-period */
  825     int t, s, yr, per;
  826     int pmin = 1;
  827     int pd, pd0;
  828     int ret = 1;
  829 
  830     pd = pd0 = *ppd;
  831 
  832  restart:
  833 
  834     s = reversed ? (dset->n - 1) : 0;
  835 
  836     if (convert_pd) {
  837     convert_daily_label(label, dset->S[s], pd);
  838     } else {
  839     strcpy(label, dset->S[s]);
  840     }
  841 
  842     if (sscanf(label, fmt, &yr, &per) != 2) {
  843     return 0;
  844     }
  845 
  846     for (t=1; t<dset->n; t++) {
  847     s = (reversed)? (dset->n - 1 - t) : t;
  848     Ey = (per == pd)? yr + 1 : yr;
  849     Ep = (per == pd)? pmin : per + pmin;
  850 
  851     if (convert_pd) {
  852         convert_daily_label(label, dset->S[s], pd);
  853     } else {
  854         strcpy(label, dset->S[s]);
  855     }
  856 
  857     if (sscanf(label, fmt, &yr, &per) != 2) {
  858         ret = 0;
  859     } else if (Ep == 1 && pd == pd0 && per == pd + 1
  860            && skipstr != NULL) {
  861         *skip = *bad = '\0';
  862         strncat(skip, label + 4, 7);
  863         strncat(bad, label, OBSLEN-1);
  864         pd = pd0 + 1;
  865         goto restart;
  866     } else if (per == Ep + 2 && pmin == 1 && fakequarter(per)) {
  867         *bad = '\0';
  868         strncat(bad, label, OBSLEN-1);
  869         pmin = 3;
  870         goto restart;
  871     } else if (pd == 12 && Ep == 5 && per == 1 && yr == Ey + 1) {
  872         /* apparently monthly but really quarterly? */
  873         pprintf(prn, "   \"%s\": quarterly date with spurious zero?\n", label);
  874         *extra_zero = 1;
  875         *ppd = pd0 = pd = 4;
  876         goto restart;
  877     } else if (yr != Ey || per != Ep) {
  878         ret = 0;
  879     }
  880 
  881     if (!ret) {
  882         pprintf(prn, "   %s: not a consistent date\n", label);
  883         break;
  884     }
  885     }
  886 
  887     if (ret) {
  888     if (pmin == 3) {
  889         pprintf(prn, "   \"%s\": quarterly data pretending to be monthly?\n",
  890             bad);
  891         *ppd = 4;
  892     } else if (pd == pd0 + 1) {
  893         pprintf(prn, "   \"%s\": BLS-type nonsense? Trying again\n",
  894             bad);
  895         strcpy(skipstr, skip);
  896     }
  897     }
  898 
  899     return ret;
  900 }
  901 
  902 static int consistent_year_labels (const DATASET *dset,
  903                    int reversed,
  904                    int convert_pd)
  905 {
  906     char label[OBSLEN];
  907     int s, t, yr, yprev;
  908     int ret = 1;
  909 
  910     s = (reversed)? (dset->n - 1) : 0;
  911     yprev = atoi(dset->S[s]);
  912 
  913     for (t=1; t<dset->n; t++) {
  914     s = reversed ? (dset->n - 1 - t) : t;
  915     if (convert_pd) {
  916         convert_daily_label(label, dset->S[s], 1);
  917         yr = atoi(label);
  918     } else {
  919         yr = atoi(dset->S[s]);
  920     }
  921     if (yr != yprev + 1) {
  922         ret = 0;
  923         break;
  924     }
  925     yprev = yr;
  926     }
  927 
  928     return ret;
  929 }
  930 
  931 /* check for all 1s in first column of dates: this may
  932    indicate start-of-period dates, day first */
  933 
  934 static int all_day_ones (DATASET *dset)
  935 {
  936     int t;
  937 
  938     for (t=1; t<dset->n; t++) {
  939     if (atoi(dset->S[t]) != 1) {
  940         return 0;
  941     } else if (t > 31) {
  942         /* "1" can't mean January */
  943         return 1;
  944     }
  945     }
  946 
  947     return 0;
  948 }
  949 
  950 enum date_orders {
  951     YYYYMMDD = 1,
  952     MMDDYYYY,
  953     DDMMYYYY
  954 };
  955 
  956 static int get_date_order (int f0, int fn, DATASET *dset)
  957 {
  958     if (f0 > 31 || fn > 31) {
  959     /* first field must be year */
  960     return YYYYMMDD;
  961     } else if (f0 > 12 || fn > 12) {
  962     /* first field must be day */
  963     return DDMMYYYY;
  964     } else if (f0 == 1 && fn == 1 && all_day_ones(dset)) {
  965     /* start-of-period dates, day first? */
  966     return DDMMYYYY;
  967     } else {
  968     /* could be wrong here */
  969     return MMDDYYYY;
  970     }
  971 }
  972 
  973 static void retransform_daily_dates (DATASET *dset)
  974 {
  975     int t, y, m, d;
  976 
  977     /* we apparently guessed wrongly at MMDDYYYY, so
  978        put the dates back as they were for another try,
  979        at DDMMYYYY.
  980     */
  981 
  982     for (t=0; t<dset->n; t++) {
  983     sscanf(dset->S[t], YMD_READ_FMT, &y, &d, &m);
  984     sprintf(dset->S[t], YMD_WRITE_FMT, d, m, y);
  985     }
  986 }
  987 
  988 static int transform_daily_dates (DATASET *dset, int dorder,
  989                   char sep)
  990 {
  991     char *label, fmt[16];
  992     int t, yr, mon, day;
  993     int n, err = 0;
  994 
  995     if (sep > 0) {
  996     sprintf(fmt, "%%d%c%%d%c%%d", sep, sep);
  997     } else {
  998     strcpy(fmt, "%4d%2d%2d");
  999     }
 1000 
 1001     for (t=0; t<dset->n && !err; t++) {
 1002     label = dset->S[t];
 1003     if (dorder == YYYYMMDD) {
 1004         n = sscanf(label, fmt, &yr, &mon, &day);
 1005     } else if (dorder == DDMMYYYY) {
 1006         n = sscanf(label, fmt, &day, &mon, &yr);
 1007     } else {
 1008         n = sscanf(label, fmt, &mon, &day, &yr);
 1009     }
 1010     if (n == 3) {
 1011         sprintf(label, YMD_WRITE_Y2_FMT, yr, mon, day);
 1012     } else {
 1013         err = 1;
 1014     }
 1015     }
 1016 
 1017     return err;
 1018 }
 1019 
 1020 void reverse_data (DATASET *dset, PRN *prn)
 1021 {
 1022     char tmp[OBSLEN];
 1023     double x;
 1024     int T = dset->n / 2;
 1025     int i, t, s;
 1026 
 1027     pprintf(prn, A_("reversing the data!\n"));
 1028 
 1029     for (t=0; t<T; t++) {
 1030     s = dset->n - 1 - t;
 1031     for (i=1; i<dset->v; i++) {
 1032         x = dset->Z[i][t];
 1033         dset->Z[i][t] = dset->Z[i][s];
 1034         dset->Z[i][s] = x;
 1035     }
 1036     if (dset->S != NULL) {
 1037         strcpy(tmp, dset->S[t]);
 1038         strcpy(dset->S[t], dset->S[s]);
 1039         strcpy(dset->S[s], tmp);
 1040     }
 1041     }
 1042 }
 1043 
 1044 static int csv_daily_date_check (DATASET *dset, int *reversed,
 1045                  char *skipstr, PRN *prn)
 1046 {
 1047     int d1[3], d2[3];
 1048     char s1 = 0, s2 = 0;
 1049     char *lbl1 = dset->S[0];
 1050     char *lbl2 = dset->S[dset->n - 1];
 1051     int dorder = 0;
 1052 
 1053     if ((sscanf(lbl1, "%d%c%d%c%d", &d1[0], &s1, &d1[1], &s2, &d1[2]) == 5 &&
 1054      sscanf(lbl2, "%d%c%d%c%d", &d2[0], &s1, &d2[1], &s2, &d2[2]) == 5 &&
 1055      s1 == s2 && ispunct(s1)) ||
 1056     (sscanf(lbl1, "%4d%2d%2d", &d1[0], &d1[1], &d1[2]) == 3 &&
 1057      sscanf(lbl2, "%4d%2d%2d", &d2[0], &d2[1], &d2[2]) == 3)) {
 1058     int mon1, day1;
 1059     int mon2, day2;
 1060     int pd, ret = 0;
 1061 
 1062     dorder = get_date_order(d1[0], d2[0], dset);
 1063 
 1064     tryagain:
 1065 
 1066     if (dorder == YYYYMMDD) {
 1067         pputs(prn, A_("Trying date order YYYYMMDD\n"));
 1068         mon1 = d1[1];
 1069         day1 = d1[2];
 1070         mon2 = d2[1];
 1071         day2 = d2[2];
 1072     } else if (dorder == DDMMYYYY) {
 1073         pputs(prn, A_("Trying date order DDMMYYYY\n"));
 1074         day1 = d1[0];
 1075         mon1 = d1[1];
 1076         day2 = d2[0];
 1077         mon2 = d2[1];
 1078     } else {
 1079         pputs(prn, A_("Trying date order MMDDYYYY\n"));
 1080         mon1 = d1[0];
 1081         day1 = d1[1];
 1082         mon2 = d2[0];
 1083         day2 = d2[1];
 1084     }
 1085 
 1086     if (mon1 > 0 && mon1 < 13 &&
 1087         mon2 > 0 && mon2 < 13 &&
 1088         day1 > 0 && day1 < 32 &&
 1089         day2 > 0 && day2 < 32) {
 1090         /* looks promising for calendar dates, but check
 1091            further if we don't have the canonical order
 1092            or separator
 1093         */
 1094         if (dorder != YYYYMMDD || s1 != '-') {
 1095         if (transform_daily_dates(dset, dorder, s1)) {
 1096             return -1;
 1097         }
 1098         s1 = '-';
 1099         }
 1100         pprintf(prn, A_("Could be %s - %s\n"), lbl1, lbl2);
 1101         ret = check_daily_dates(dset, &pd, reversed, prn);
 1102         if (ret >= 0 && pd > 0) {
 1103         if (pd == 52) {
 1104             if (csv_weekly_data(dset)) {
 1105             ret = 52;
 1106             } else if (dorder == MMDDYYYY) {
 1107             /* maybe we guessed wrong */
 1108             retransform_daily_dates(dset);
 1109             dorder = DDMMYYYY;
 1110             goto tryagain;
 1111             } else {
 1112             ret = -1;
 1113             }
 1114         } else {
 1115             int convert_pd = 0;
 1116 
 1117             if (pd == 1 || pd == 4 || pd == 12) {
 1118             convert_pd = pd;
 1119             }
 1120             ret = time_series_label_check(dset,
 1121                           *reversed,
 1122                           skipstr,
 1123                           convert_pd,
 1124                           prn);
 1125             if (ret < 0 && dorder == MMDDYYYY) {
 1126             retransform_daily_dates(dset);
 1127             dorder = DDMMYYYY;
 1128             goto tryagain;
 1129             }
 1130         }
 1131         }
 1132         return ret;
 1133     }
 1134     } else {
 1135     pprintf(prn, A_("'%s' and '%s': couldn't get dates\n"), lbl1, lbl2);
 1136     }
 1137 
 1138     return -1;
 1139 }
 1140 
 1141 static int pd_from_date_label (const char *lbl, char *year, char *subp,
 1142                    char *format, PRN *prn)
 1143 {
 1144     const char *subchars = ".:QqMmPp-";
 1145     int len = strlen(lbl);
 1146     int try, pd = -1;
 1147 
 1148     strncat(year, lbl, 4);
 1149     try = atoi(year);
 1150 
 1151     if (try > 0 && try < 3000) {
 1152     pprintf(prn, A_("   %s: probably a year... "), year);
 1153     } else {
 1154     pprintf(prn, A_("   %s: probably not a year\n"), year);
 1155     }
 1156 
 1157     if (len == 5) {
 1158     pputs(prn, A_("   but I can't make sense of the extra bit\n"));
 1159     } else if (len == 4) {
 1160     pputs(prn, A_("and just a year\n"));
 1161     pd = 1;
 1162     } else {
 1163     char sep = lbl[4];
 1164     char sub[3], *s = NULL;
 1165     int dashQ = 0;
 1166     int p;
 1167 
 1168     if (strchr(subchars, sep)) {
 1169         *sub = '\0';
 1170         strncat(sub, lbl + 5, 2);
 1171         s = sub;
 1172         if (len == 6 || (len == 7 && (sep == 'q' || sep == 'Q'))) {
 1173         if (len == 7) s++;
 1174         p = atoi(s);
 1175         if (p > 0 && p < 5) {
 1176             pprintf(prn, A_("quarter %s?\n"), s);
 1177             pd = 4;
 1178         } else {
 1179             pprintf(prn, "quarter %d: not possible\n", p);
 1180         }
 1181         } else if (len == 7) {
 1182         if (*s == 'Q') {
 1183             /* YYYY-Qn? This is supported by SDMX */
 1184             dashQ = 1;
 1185             s++;
 1186         }
 1187         p = atoi(s);
 1188         if (dashQ) {
 1189             if (p > 0 && p < 5) {
 1190             pprintf(prn, A_("quarter %d?\n"), p);
 1191             pd = 4;
 1192             } else {
 1193             pprintf(prn, "quarter %d: not possible\n", p);
 1194             }
 1195         } else {
 1196             if (p > 0 && p < 13) {
 1197             pprintf(prn, A_("month %s?\n"), s);
 1198             pd = 12;
 1199             } else {
 1200             pprintf(prn, "month %d: not possible\n", p);
 1201             }
 1202         }
 1203         }
 1204         strcpy(subp, s);
 1205         if (format != NULL && (pd == 4 || pd == 12)) {
 1206         if (dashQ) {
 1207             sprintf(format, "%%d%cQ%%d", sep);
 1208         } else {
 1209             sprintf(format, "%%d%c%%d", sep);
 1210         }
 1211         }
 1212     }
 1213     }
 1214 
 1215     return pd;
 1216 }
 1217 
 1218 static int time_series_label_check (DATASET *dset, int reversed,
 1219                     char *skipstr, int convert_pd,
 1220                     PRN *prn)
 1221 {
 1222     char year[5], sub[3];
 1223     char format[8] = {0};
 1224     char *lbl1 = dset->S[0];
 1225     char *lbl2 = dset->S[dset->n - 1];
 1226     char *label;
 1227     int pd = -1;
 1228 
 1229     *year = *sub = '\0';
 1230     label = reversed ? lbl2 : lbl1;
 1231 
 1232     if (convert_pd) {
 1233     char altobs[OBSLEN];
 1234 
 1235     convert_daily_label(altobs, label, convert_pd);
 1236     pd = pd_from_date_label(altobs, year, sub, format, prn);
 1237     } else {
 1238     pd = pd_from_date_label(label, year, sub, format, prn);
 1239     }
 1240 
 1241     if (pd == 1) {
 1242     if (consistent_year_labels(dset, reversed, convert_pd)) {
 1243         dset->pd = pd;
 1244         strcpy(dset->stobs, year);
 1245         dset->sd0 = atof(dset->stobs);
 1246         strcpy(dset->endobs, lbl2);
 1247     } else {
 1248         pputs(prn, A_("   but the dates are not complete and consistent\n"));
 1249         pd = -1;
 1250     }
 1251     } else if (pd == 4 || pd == 12) {
 1252     int savepd = pd;
 1253     int extra_zero = 0;
 1254 
 1255     if (consistent_qm_labels(dset, reversed, convert_pd,
 1256                  skipstr, &pd, format,
 1257                  &extra_zero, prn)) {
 1258         dset->pd = pd;
 1259         if (savepd == 12 && pd == 4) {
 1260         /* we switched the interpretation from
 1261            monthly to quarterly */
 1262         int s;
 1263 
 1264         if (extra_zero) {
 1265             /* e.g. 1960Q1 written as 1960:01 */
 1266             s = atoi(sub + 1);
 1267         } else {
 1268             /* e.g. 1960Q1 written as 1960:03 */
 1269             s = atoi(sub) / 3;
 1270         }
 1271         sprintf(dset->stobs, "%s:%d", year, s);
 1272         } else {
 1273         sprintf(dset->stobs, "%s:%s", year, sub);
 1274         }
 1275         dset->sd0 = obs_str_to_double(dset->stobs);
 1276         ntolabel(dset->endobs, dset->n - 1, dset);
 1277     } else {
 1278         pputs(prn, A_("   but the dates are not complete and consistent\n"));
 1279         pd = -1;
 1280     }
 1281     }
 1282 
 1283     return pd;
 1284 }
 1285 
 1286 static int dates_maybe_reversed (const char *s1,
 1287                  const char *s2,
 1288                  PRN *prn)
 1289 {
 1290     char d1[5], d2[5];
 1291     int ret = 0;
 1292 
 1293     *d1 = *d2 = '\0';
 1294 
 1295     strncat(d1, s1, 4);
 1296     strncat(d2, s2, 4);
 1297 
 1298     ret = atoi(d1) > atoi(d2);
 1299 
 1300     if (ret) {
 1301     pputs(prn, A_("   dates are reversed?\n"));
 1302     }
 1303 
 1304     return ret;
 1305 }
 1306 
 1307 /* e.g. "M1 1957", "M12 2009" */
 1308 
 1309 static int fix_IFS_data_labels (DATASET *dset)
 1310 {
 1311     char *s1 = dset->S[0];
 1312     char *s2 = dset->S[dset->n - 1];
 1313     int ret = 0;
 1314 
 1315     if ((*s1 == 'M' || *s1 == 'Q') && *s2 == *s1) {
 1316     int n1 = strlen(s1);
 1317     int n2 = strlen(s2);
 1318 
 1319     if ((n1 == 7 || n1 == 8) && (n2 == 7 || n2 == 8) &&
 1320         isdigit(s1[1]) && isdigit(s2[1])) {
 1321         int pmax = (*s1 == 'M')? 12 : 4;
 1322         char c, tmp[8], *s;
 1323         int y, p, pbak = 0;
 1324         int i, n, doit = 1;
 1325 
 1326         for (i=0; i<dset->n; i++) {
 1327         s = dset->S[i];
 1328         n = strlen(s);
 1329         if (n != 7 && n != 8) {
 1330             doit = 0;
 1331             break;
 1332         }
 1333         n = sscanf(s, "%c%d %d", &c, &p, &y);
 1334         if (n != 3 || c != *s1) {
 1335             doit = 0;
 1336             break;
 1337         }
 1338         if (y < 1800 || y > 2500 || p <= 0 || p > pmax) {
 1339             doit = 0;
 1340             break;
 1341         }
 1342         if (i > 0 && p != pbak + 1 && p != 1) {
 1343             doit = 0;
 1344             break;
 1345         }
 1346         pbak = p;
 1347         }
 1348 
 1349         if (doit) {
 1350         for (i=0; i<dset->n; i++) {
 1351             s = dset->S[i];
 1352             sscanf(s, "%c%d %d", &c, &p, &y);
 1353             if (pmax == 12) {
 1354             sprintf(tmp, "%d:%02d", y, p);
 1355             } else {
 1356             sprintf(tmp, "%d:%d", y, p);
 1357             }
 1358             if (strlen(tmp) > strlen(s)) {
 1359             free(s);
 1360             dset->S[i] = gretl_strdup(tmp);
 1361             } else {
 1362             strcpy(s, tmp);
 1363             }
 1364         }
 1365         ret = 1;
 1366         }
 1367     }
 1368     }
 1369 
 1370     return ret;
 1371 }
 1372 
 1373 static int month_number (char *s)
 1374 {
 1375     const char *mo[] = {
 1376     "jan", "feb", "mar", "apr",
 1377     "may", "jun", "jul", "aug",
 1378     "sep", "oct", "nov", "dec"
 1379     };
 1380     int i;
 1381 
 1382     gretl_lower(s);
 1383 
 1384     for (i=0; i<12; i++) {
 1385     if (!strcmp(s, mo[i])) {
 1386         return i+1;
 1387     }
 1388     }
 1389 
 1390     return 0;
 1391 }
 1392 
 1393 /* e.g. "Jan-1980", for monthly or quarterly data */
 1394 
 1395 static int fix_mon_year_labels (DATASET *dset)
 1396 {
 1397     char *s1 = dset->S[0];
 1398     char *s2 = dset->S[dset->n - 1];
 1399     char m1[4] = {0};
 1400     char m2[4] = {0};
 1401     int yr1 = 0, yr2 = 0;
 1402     int ret = 0;
 1403 
 1404     if (strlen(s1) == 8 && strlen(s2) == 8 &&
 1405     s1[3] == '-' && s2[3] == '-') {
 1406     yr1 = atoi(s1 + 4);
 1407     yr2 = atoi(s2 + 4);
 1408     strncat(m1, s1, 3);
 1409     strncat(m2, s2, 3);
 1410     }
 1411 
 1412     if (yr1 > 999 && yr1 < 3000 && yr2 > 999 && yr2 < 3000 &&
 1413     month_number(m1) && month_number(m2)) {
 1414     int i, p, pbak = 0;
 1415     int dt, pd = 0;
 1416     char *s;
 1417 
 1418     for (i=0; i<dset->n; i++) {
 1419         s = dset->S[i];
 1420         if (strlen(s) != 8 || s[3] != '-') {
 1421         pd = 0;
 1422         break;
 1423         }
 1424         yr1 = atoi(s + 4);
 1425         *m1 = '\0';
 1426         strncat(m1, s, 3);
 1427         if (yr1 < 1000 || yr1 >= 3000 ||
 1428         (p = month_number(m1)) < 1) {
 1429         pd = 0;
 1430         break;
 1431         }
 1432         if (i > 0) {
 1433         dt = p - pbak;
 1434         if (dt != 1 && dt != 3 && p != 1) {
 1435             pd = 0;
 1436             break;
 1437         }
 1438         if (pd == 0 && dt > 0) {
 1439             pd = (dt == 1)? 12 : 4;
 1440         }
 1441         }
 1442         pbak = p;
 1443     }
 1444 
 1445     if (pd > 0) {
 1446         for (i=0; i<dset->n; i++) {
 1447         s = dset->S[i];
 1448         yr1 = atoi(s + 4);
 1449         *m1 = '\0';
 1450         strncat(m1, s, 3);
 1451         p = month_number(m1);
 1452         if (pd == 12) {
 1453             sprintf(dset->S[i], "%d:%02d", yr1, p);
 1454         } else {
 1455             sprintf(dset->S[i], "%d:%g", yr1, ceil((3+p)/4.0));
 1456         }
 1457         }
 1458         ret = 1;
 1459     }
 1460     }
 1461 
 1462     return ret;
 1463 }
 1464 
 1465 /* Attempt to parse CSV row labels as dates.  Return -1 if this
 1466    doesn't work out, or 0 if the labels seem to be just integer
 1467    observation numbers, else return the inferred data frequency.
 1468 */
 1469 
 1470 int test_markers_for_dates (DATASET *dset, int *reversed,
 1471                 char *skipstr, PRN *prn)
 1472 {
 1473     char endobs[OBSLEN];
 1474     int n = dset->n;
 1475     char *lbl1 = dset->S[0];
 1476     char *lbl2 = dset->S[n - 1];
 1477     int len1 = strlen(lbl1);
 1478     int len2 = strlen(lbl2);
 1479     int pd = -1;
 1480 
 1481     if (skipstr != NULL && *skipstr != '\0') {
 1482     return time_series_label_check(dset, *reversed, skipstr, 0, prn);
 1483     }
 1484 
 1485     pprintf(prn, A_("   first row label \"%s\", last label \"%s\"\n"),
 1486         lbl1, lbl2);
 1487 
 1488     /* are the labels (probably) just 1, 2, 3 etc.? */
 1489     sprintf(endobs, "%d", n);
 1490     if (!strcmp(lbl1, "1") && !strcmp(lbl2, endobs)) {
 1491     return 0;
 1492     }
 1493 
 1494     if (fix_IFS_data_labels(dset) || fix_mon_year_labels(dset)) {
 1495     lbl1 = dset->S[0];
 1496     lbl2 = dset->S[n - 1];
 1497     len1 = strlen(lbl1);
 1498     }
 1499 
 1500     /* labels are of different lengths? */
 1501     if (len1 != len2) {
 1502     if (abs(len1 - len2) > 1) {
 1503         return -1;
 1504     } else if (len2 > len1) {
 1505         len1 = len2;
 1506     }
 1507     }
 1508 
 1509     pputs(prn, A_("trying to parse row labels as dates...\n"));
 1510 
 1511     if (len1 == 8 || len1 == 10) {
 1512     /* daily data? */
 1513     pd = csv_daily_date_check(dset, reversed, skipstr, prn);
 1514     } else if (len1 >= 4) {
 1515     /* annual, quarterly, monthly? */
 1516     if (isdigit((unsigned char) lbl1[0]) &&
 1517         isdigit((unsigned char) lbl1[1]) &&
 1518         isdigit((unsigned char) lbl1[2]) &&
 1519         isdigit((unsigned char) lbl1[3])) {
 1520         *reversed = dates_maybe_reversed(lbl1, lbl2, prn);
 1521         pd = time_series_label_check(dset, *reversed, skipstr, 0, prn);
 1522     } else {
 1523         pputs(prn, A_("   definitely not a four-digit year\n"));
 1524     }
 1525     }
 1526 
 1527     if (pd <= 0 && *reversed) {
 1528     /* give up the "reversed" notion if we didn't get
 1529        a workable time-series interpretation */
 1530     *reversed = 0;
 1531     }
 1532 
 1533     return pd;
 1534 }
 1535 
 1536 static int utf8_ok (gzFile fp, int pos)
 1537 {
 1538     long mark = gztell(fp);
 1539     int len = pos + 9;
 1540     char *test = malloc(len + 1);
 1541     int i, ret = 0;
 1542 
 1543     gzseek(fp, mark - pos - 1, SEEK_SET);
 1544 
 1545     for (i=0; i<len; i++) {
 1546     test[i] = gzgetc(fp);
 1547     }
 1548     test[i] = '\0';
 1549 
 1550     if (g_utf8_validate(test, -1, NULL)) {
 1551     ret = 1;
 1552     } else {
 1553     GError *gerr = NULL;
 1554     gsize wrote = 0;
 1555     gchar *tr;
 1556 
 1557     /* try for iso-8859? */
 1558     tr = g_convert(test, -1, "UTF-8", "ISO-8859-15",
 1559                NULL, &wrote, &gerr);
 1560     if (gerr != NULL) {
 1561         g_error_free(gerr);
 1562     } else {
 1563         g_free(tr);
 1564         ret = 1;
 1565     }
 1566     }
 1567 
 1568     free(test);
 1569 
 1570     gzseek(fp, mark, SEEK_SET);
 1571 
 1572     return ret;
 1573 }
 1574 
 1575 enum {
 1576     UTF_8 = 1,
 1577     UTF_16,
 1578     UTF_32
 1579 };
 1580 
 1581 /* If we got a UTF-16 or UTF-32 BOM, try recoding to
 1582    UTF-8 before parsing data. We write the recoded text
 1583    to a temporary file in the user's "dotdir" (and
 1584    then delete that file once we're done).
 1585 */
 1586 
 1587 static int csv_recode_input (gzFile *fpp,
 1588                  const char *fname,
 1589                  gchar **pfname,
 1590                  int ucode,
 1591                  PRN *prn)
 1592 {
 1593     const gchar *from_set =
 1594     (ucode == UTF_32)? "UTF-32" : "UTF-16";
 1595     gchar *altname = NULL;
 1596     int err = 0;
 1597 
 1598     /* the current stream is not useable as is,
 1599        so shut it down
 1600     */
 1601     gzclose(*fpp);
 1602     *fpp = NULL;
 1603 
 1604     /* we'll recode to a temp file in dotdir */
 1605     altname = g_strdup_printf("%srecode_tmp.u8", gretl_dotdir());
 1606 
 1607     err = gretl_recode_file(fname, altname,
 1608                 from_set, "UTF-8",
 1609                 prn);
 1610 
 1611     if (!err) {
 1612     /* try reattaching the stream */
 1613     *fpp = gretl_gzopen(altname, "rb");
 1614     if (*fpp == NULL) {
 1615         gretl_remove(altname);
 1616         err = E_FOPEN;
 1617     } else {
 1618         pputs(prn, "switched to recoded input\n");
 1619         *pfname = altname;
 1620         altname = NULL;
 1621     }
 1622     }
 1623 
 1624     g_free(altname);
 1625 
 1626     return err;
 1627 }
 1628 
 1629 /* Check the first 4 bytes of "CSV" input for a Byte Order
 1630    Mark. If we find the UTF-8 BOM (typically written by
 1631    Microsoft tools), simply record the fact so that we can
 1632    skip it on reading. But if we find a BOM indicating a
 1633    16-bit or 32-bit unicode encoding, flag this by returning
 1634    a non-zero @ucode value; in that case we'll attempt a
 1635    full recording of the input (via GLib) before we start
 1636    reading data.
 1637 */
 1638 
 1639 static int csv_unicode_check (gzFile fp, csvdata *c, PRN *prn)
 1640 {
 1641     unsigned char b[4];
 1642     int n = gzread(fp, b, 4);
 1643     int ucode = 0;
 1644 
 1645     if (n == 4) {
 1646     if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
 1647         pputs(prn, "got UTF-8 BOM\n");
 1648         ucode = UTF_8;
 1649     } else if (b[0] == 0xFE && b[1] == 0xFF) {
 1650         pputs(prn, "got UTF-16BE, will try recoding\n");
 1651         ucode = UTF_16;
 1652     } else if (b[0] == 0xFF && b[1] == 0xFE) {
 1653         if (b[2] == 0 && b[3] == 0) {
 1654         pputs(prn, "got UTF-32LE, will try recoding\n");
 1655         ucode = UTF_32;
 1656         } else {
 1657         pputs(prn, "got UTF-16LE, will try recoding\n");
 1658         ucode = UTF_16;
 1659         }
 1660     } else if (b[0] == 0 && b[1] == 0 &&
 1661            b[0] == 0xFE && b[1] == 0xFF) {
 1662         pputs(prn, "got UTF-32BE, will try recoding\n");
 1663         ucode = UTF_32;
 1664     }
 1665     }
 1666 
 1667     if (ucode == UTF_8) {
 1668     csv_set_has_bom(c);
 1669     gzseek(fp, 3, SEEK_SET);
 1670     ucode = 0;
 1671     } else {
 1672     gzrewind(fp);
 1673     }
 1674 
 1675     return ucode;
 1676 }
 1677 
 1678 /* The function below checks for the maximum line length in the given
 1679    file.  It also checks for extraneous binary data (the file is
 1680    supposed to be plain text), and checks whether the 'delim'
 1681    character is present in the file, on a non-comment line (where
 1682    a comment line is one that starts with '#').
 1683 
 1684    In addition, we check whether the file has a trailing comma on every
 1685    line, and for the numbers of double- and single-quote characters
 1686    to try to determine which, if either, is used to indicate quoted
 1687    fields in the input.
 1688 */
 1689 
 1690 static int csv_max_line_length (gzFile fp, csvdata *cdata, PRN *prn)
 1691 {
 1692     int c, c1, cbak = 0, cc = 0;
 1693     int comment = 0, maxlinelen = 0;
 1694     int max_ldquo = 0, max_lsquo = 0;
 1695     int min_ldquo = 0, min_lsquo = 0;
 1696     int ldquo = 0, lsquo = 0;
 1697     int ndquo = 0, nsquo = 0;
 1698     int crlf = 0, lines = 0;
 1699 
 1700     csv_set_trailing_comma(cdata); /* just provisionally */
 1701 
 1702     while ((c = gzgetc(fp)) != EOF) {
 1703     if (c == 0x0d) {
 1704         /* CR */
 1705         c1 = gzgetc(fp);
 1706         if (c1 == EOF) {
 1707         break;
 1708         } else if (c1 == 0x0a) {
 1709         /* CR + LF -> LF */
 1710         crlf = 1;
 1711         c = c1;
 1712         } else {
 1713         /* Mac-style: CR not followed by LF */
 1714         c = 0x0a;
 1715         gzungetc(c1, fp);
 1716         }
 1717     }
 1718     if (c == 0x0a) {
 1719         if (cc > maxlinelen) {
 1720         maxlinelen = cc;
 1721         }
 1722         cc = 0;
 1723         if (cbak != 0 && cbak != ',') {
 1724         csv_unset_trailing_comma(cdata);
 1725         }
 1726         lines++;
 1727         if (ldquo > max_ldquo) {
 1728         max_ldquo = ldquo;
 1729         } else if (ldquo > 0 && ldquo < max_ldquo) {
 1730         min_ldquo = ldquo;
 1731         }
 1732         if (lsquo > max_lsquo) {
 1733         max_lsquo = lsquo;
 1734         } else if (lsquo > 0 && lsquo < max_lsquo) {
 1735         min_lsquo = lsquo;
 1736         }
 1737         ldquo = lsquo = 0;
 1738         continue;
 1739     }
 1740     cbak = c;
 1741     if (!isspace((unsigned char) c) && !isprint((unsigned char) c) &&
 1742         !(c == CTRLZ) && !utf8_ok(fp, cc)) {
 1743         pprintf(prn, A_("Binary data (%d) encountered (line %d:%d): "
 1744                 "this is not a valid text file\n"),
 1745             c, lines + 1, cc + 1);
 1746         return -1;
 1747     }
 1748     if (cc == 0) {
 1749         comment = (c == '#');
 1750     }
 1751     if (!comment) {
 1752         if (c == '\t') {
 1753         /* let's ignore trailing tabs in this heuristic */
 1754         c1 = gzgetc(fp);
 1755         if (c1 != 0x0d && c1 != 0x0a) {
 1756             csv_set_got_tab(cdata);
 1757         }
 1758         gzungetc(c1, fp);
 1759         }
 1760         if (c == ';') {
 1761         csv_set_got_semi(cdata);
 1762         }
 1763         if (c == cdata->delim) {
 1764         csv_set_got_delim(cdata);
 1765         } else if (c == '"') {
 1766         ldquo++;
 1767         ndquo++;
 1768         } else if (c == '\'') {
 1769         lsquo++;
 1770         nsquo++;
 1771         }
 1772     }
 1773     cc++;
 1774     }
 1775 
 1776     if (maxlinelen == 0) {
 1777     pputs(prn, A_("Data file is empty\n"));
 1778     } else if (csv_has_trailing_comma(cdata)) {
 1779     pputs(prn, A_("Data file has trailing commas\n"));
 1780     }
 1781 
 1782     if (ndquo > 0 || nsquo > 0) {
 1783     /* candidates for quotation character? */
 1784     int cands[2] = {0};
 1785 
 1786     if (ndquo > 0) {
 1787         pprintf(prn, A_("Found %d double-quotes, max %d per line\n"),
 1788             ndquo, max_ldquo);
 1789     }
 1790     if (nsquo > 0) {
 1791         pprintf(prn, A_("Found %d single-quotes, max %d per line\n"),
 1792             nsquo, max_lsquo);
 1793     }
 1794     if (max_ldquo > 0 && max_ldquo % 2 == 0) {
 1795         /* double-quote is a candidate? */
 1796         if (min_ldquo > 0 && min_ldquo % 2) {
 1797         ; /* nope */
 1798         } else {
 1799         cands[0] = 1;
 1800         }
 1801     }
 1802     if (max_lsquo > 0 && max_lsquo % 2 == 0) {
 1803         /* single-quote is a candidate? */
 1804         if (min_lsquo > 0 && min_lsquo % 2) {
 1805         ; /* nope */
 1806         } else {
 1807         cands[1] = 1;
 1808         }
 1809     }
 1810     if (cands[0] && cands[1]) {
 1811         /* hmm, rule one out: prefer the more numerous */
 1812         if (nsquo > ndquo) {
 1813         cands[0] = 0;
 1814         } else {
 1815         cands[1] = 0;
 1816         }
 1817     }
 1818     if (cands[0]) {
 1819         pputs(prn, A_("Assuming double-quote is the relevant "
 1820               "quotation character\n"));
 1821         cdata->qchar = '"';
 1822     } else if (cands[1]) {
 1823         pputs(prn, A_("Assuming single-quote is the relevant "
 1824               "quotation character\n"));
 1825         cdata->qchar = '\'';
 1826     }
 1827     }
 1828 
 1829     if (maxlinelen > 0) {
 1830     /* allow for newline and null terminator */
 1831     maxlinelen += 2 + crlf;
 1832     }
 1833 
 1834     return maxlinelen;
 1835 }
 1836 
 1837 #define nonspace_delim(d) (d != ',' && d != ';' && d != '\t')
 1838 
 1839 static int count_csv_fields (csvdata *c)
 1840 {
 1841     const char *s = c->line;
 1842     int inquote = 0;
 1843     int cbak, nf = 0;
 1844 
 1845     if (*s == c->delim && *s == ' ') {
 1846     s++;
 1847     }
 1848 
 1849     while (*s) {
 1850     if (csv_keep_quotes(c) && *s == c->qchar) {
 1851         inquote = !inquote;
 1852     } else if (!inquote && *s == c->delim) {
 1853         nf++;
 1854     }
 1855     cbak = *s;
 1856     s++;
 1857     /* Problem: (when) should a trailing delimiter be read as an
 1858        implicit NA?  For now we'll so treat it if the delimiter
 1859        is not plain space.
 1860     */
 1861     if (*s == '\0' && cbak == c->delim && nonspace_delim(c->delim)) {
 1862         nf--;
 1863     }
 1864     }
 1865 
 1866     return nf + 1;
 1867 }
 1868 
 1869 static void purge_quoted_commas (char *s)
 1870 {
 1871     int inquote = 0;
 1872 
 1873     while (*s) {
 1874     if (*s == '"') {
 1875         inquote = !inquote;
 1876     } else if (inquote && *s == ',') {
 1877         *s = ' ';
 1878     }
 1879     s++;
 1880     }
 1881 }
 1882 
 1883 static void purge_unquoted_spaces (char *s)
 1884 {
 1885     int inquote = 0;
 1886 
 1887     while (*s) {
 1888     if (*s == '"') {
 1889         inquote = !inquote;
 1890     } else if (!inquote && *s == ' ') {
 1891         shift_string_left(s, 1);
 1892     }
 1893     s++;
 1894     }
 1895 }
 1896 
 1897 static void compress_csv_line (csvdata *c, int nospace)
 1898 {
 1899     int n = strlen(c->line);
 1900     char *p = c->line + n - 1;
 1901 
 1902     if (*p == 0x0a) {
 1903     *p = '\0';
 1904     p--;
 1905     }
 1906 
 1907     if (*p == 0x0d) {
 1908     *p = '\0';
 1909     }
 1910 
 1911     if (!csv_keep_quotes(c) && c->delim == ',') {
 1912     purge_quoted_commas(c->line);
 1913     }
 1914 
 1915     if (c->delim != ' ') {
 1916     if (nospace) {
 1917         purge_unquoted_spaces(c->line);
 1918     }
 1919     } else {
 1920     compress_spaces(c->line);
 1921     }
 1922 
 1923     if (!csv_keep_quotes(c)) {
 1924         gretl_delchar('"', c->line);
 1925     }
 1926 
 1927     if (csv_has_trailing_comma(c)) {
 1928     /* chop trailing comma */
 1929     n = strlen(c->line);
 1930     if (n > 0) {
 1931         c->line[n-1] = '\0';
 1932     }
 1933     }
 1934 }
 1935 
 1936 int import_obs_label (const char *s)
 1937 {
 1938     char tmp[VNAMELEN];
 1939 
 1940     if (s == NULL) {
 1941     return 1;
 1942     }
 1943 
 1944     if (!strcmp(s, "\"\"") || !strcmp(s, "''")) {
 1945     return 1;
 1946     }
 1947 
 1948     if (*s == '"' || *s == '\'') s++;
 1949 
 1950     if (*s == '\0') {
 1951     return 1;
 1952     }
 1953 
 1954     if (strlen(s) > VNAMELEN - 1) {
 1955     return 0;
 1956     }
 1957 
 1958     *tmp = '\0';
 1959     strncat(tmp, s, VNAMELEN - 1);
 1960     gretl_lower(tmp);
 1961 
 1962     return (!strcmp(tmp, "obs") ||
 1963         !strcmp(tmp, "date") ||
 1964         !strcmp(tmp, "year") ||
 1965         !strcmp(tmp, "period") ||
 1966         !strcmp(tmp, "observation") ||
 1967         !strcmp(tmp, "observation_date"));
 1968 }
 1969 
 1970 static int join_wants_col_zero (csvdata *c, const char *s)
 1971 {
 1972     const char *colname;
 1973     int i;
 1974 
 1975     if (*s == '\0') {
 1976     return 0;
 1977     }
 1978 
 1979     for (i=0; i<c->jspec->ncols; i++) {
 1980     colname = c->jspec->colnames[i];
 1981     if (colname != NULL && !strcmp(s, colname)) {
 1982         return 1;
 1983     }
 1984     }
 1985 
 1986     return 0;
 1987 }
 1988 
 1989 static void check_first_field (const char *line, csvdata *c, PRN *prn)
 1990 {
 1991     const char *s;
 1992 
 1993  tryagain:
 1994     s = line;
 1995 
 1996     if (c->delim != ' ' && *s == c->delim) {
 1997     csv_set_blank_column(c);
 1998     } else {
 1999     char field1[OBSLEN];
 2000     int i = 0;
 2001 
 2002     if (c->delim == ' ' && *s == ' ') {
 2003         s++;
 2004     }
 2005 
 2006     while (*s && i < sizeof field1) {
 2007         if (*s == c->delim) {
 2008         break;
 2009         } else if (*s == '\t') {
 2010         /* presence of a tab must indicate tab-separation? */
 2011         c->delim = '\t';
 2012         goto tryagain;
 2013         }
 2014         field1[i++] = *s++;
 2015     }
 2016 
 2017     field1[i] = '\0';
 2018     iso_to_ascii(field1);
 2019 
 2020     if (joining(c) && join_wants_col_zero(c, field1)) {
 2021         return;
 2022     } else if (csv_all_cols(c)) {
 2023         /* open/append wants all columns as data */
 2024         return;
 2025     }
 2026 
 2027     pprintf(prn, A_("   first field: '%s'\n"), field1);
 2028 
 2029     if (import_obs_label(field1)) {
 2030         pputs(prn, A_("   seems to be observation label\n"));
 2031         csv_set_obs_column(c);
 2032     }
 2033     }
 2034 }
 2035 
 2036 void import_na_init (void)
 2037 {
 2038     const char *s = get_csv_na_read_string();
 2039 
 2040     strcpy(import_na, s);
 2041 }
 2042 
 2043 /* Returns 1 if the string @s should be counted representing
 2044    an NA or missing value, 0 otherwise. If there is a user-set
 2045    "csv_read_na" value this is used for comparison, otherwise
 2046    a set of default values is consulted.
 2047 */
 2048 
 2049 int import_na_string (const char *s)
 2050 {
 2051     if (*import_na != '\0' && strcmp(import_na, "default")) {
 2052     /* the user has set a specific "NA" string, so
 2053        respect it */
 2054     return !strcmp(s, import_na);
 2055     } else {
 2056     /* consult a list of common representations of NA */
 2057     const char *defaults[] = {
 2058         "NA",
 2059         "N.A.",
 2060         "n.a.",
 2061         "na",
 2062         "n/a",
 2063         "N/A",
 2064         "#N/A",
 2065         "NaN",
 2066         ".NaN",
 2067         ".",
 2068         "..",
 2069         "-999",
 2070         "-9999",
 2071         "-",
 2072         NULL
 2073     };
 2074     int i;
 2075 
 2076     for (i=0; defaults[i] != NULL; i++) {
 2077         if (!strcmp(s, defaults[i])) {
 2078         return 1;
 2079         }
 2080     }
 2081     }
 2082 
 2083     return 0;
 2084 }
 2085 
 2086 static int csv_missval (const char *str, int i, int t,
 2087             int *miss_shown, PRN *prn)
 2088 {
 2089     int miss = 0;
 2090 
 2091     if (*str == '\0') {
 2092     if (miss_shown != NULL) {
 2093         if (t < 80 || *miss_shown < i) {
 2094         pprintf(prn, A_("   the cell for variable %d, obs %d "
 2095                 "is empty: treating as missing value\n"),
 2096             i, t);
 2097         *miss_shown += 1;
 2098         }
 2099     }
 2100     miss = 1;
 2101     }
 2102 
 2103     if (import_na_string(str)) {
 2104     if (miss_shown != NULL) {
 2105         if (t < 80 || *miss_shown < i) {
 2106         pprintf(prn, A_("   warning: missing value for variable "
 2107                 "%d, obs %d\n"), i, t);
 2108         *miss_shown += 1;
 2109         }
 2110     }
 2111     miss = 1;
 2112     }
 2113 
 2114     return miss;
 2115 }
 2116 
 2117 /* In the case where we think we've found thousands
 2118    separators in numerical input, provisionally mark
 2119    all "non-numeric" values as NAs; we do this prior
 2120    to a second pass through the data.
 2121 */
 2122 
 2123 static void revise_non_numeric_values (csvdata *c)
 2124 {
 2125     int i, t;
 2126 
 2127     for (i=1; i<c->dset->v; i++) {
 2128     for (t=0; t<c->dset->n; t++) {
 2129         if (c->dset->Z[i][t] == NON_NUMERIC) {
 2130         c->dset->Z[i][t] = NADBL;
 2131         }
 2132     }
 2133     }
 2134 }
 2135 
 2136 int non_numeric_check (DATASET *dset, int **plist,
 2137                gretl_string_table **pst,
 2138                PRN *prn)
 2139 {
 2140     int *list = NULL;
 2141     int i, j, t, nn = 0;
 2142     int err = 0;
 2143 
 2144 #if CDEBUG > 1
 2145     fprintf(stderr, "non_numeric_check: testing %d series, pst = %p\n",
 2146         dset->v - 1, (void *) pst);
 2147 #endif
 2148 
 2149     if (pst == NULL) {
 2150     /* not interested in string-valued series/columns */
 2151     for (i=1; i<dset->v; i++) {
 2152         for (t=0; t<dset->n; t++) {
 2153         if (dset->Z[i][t] == NON_NUMERIC) {
 2154             dset->Z[i][t] = NADBL;
 2155         }
 2156         }
 2157     }
 2158     return 0;
 2159     }
 2160 
 2161     for (i=1; i<dset->v; i++) {
 2162     for (t=0; t<dset->n; t++) {
 2163         if (dset->Z[i][t] == NON_NUMERIC) {
 2164         nn++;
 2165         break;
 2166         }
 2167     }
 2168     }
 2169 
 2170 #if CDEBUG > 1
 2171     fprintf(stderr, " found %d candidate series\n", nn);
 2172 #endif
 2173 
 2174     if (nn == 0) {
 2175     return 0; /* nothing to be done */
 2176     }
 2177 
 2178     list = gretl_list_new(nn);
 2179     if (list == NULL) {
 2180     return E_ALLOC;
 2181     }
 2182 
 2183     j = 1;
 2184     for (i=1; i<dset->v; i++) {
 2185     for (t=0; t<dset->n; t++) {
 2186         if (dset->Z[i][t] == NON_NUMERIC) {
 2187         list[j++] = i;
 2188         break;
 2189         }
 2190     }
 2191     }
 2192 
 2193 #if CDEBUG > 1
 2194     printlist(list, "non-numeric vars list");
 2195 #endif
 2196 
 2197     for (i=1; i<=list[0]; i++) {
 2198     /* check each member of @list */
 2199     double nnfrac;
 2200     int nnon = 0;
 2201     int nok = 0;
 2202     int tn = 0;
 2203     int v = list[i];
 2204 
 2205     series_set_flag(dset, v, VAR_DISCRETE);
 2206 
 2207     for (t=0; t<dset->n; t++) {
 2208         if (dset->Z[v][t] == NON_NUMERIC) {
 2209         if (tn == 0) {
 2210             /* record the first non-numeric obs */
 2211             tn = t + 1;
 2212         }
 2213         nnon++;
 2214         } else if (!na(dset->Z[v][t])) {
 2215         nok++;
 2216         }
 2217     }
 2218 
 2219     nnfrac = (nok == 0)? 1.0 : nnon / (double) (nnon + nok);
 2220     pprintf(prn, A_("variable %d (%s): non-numeric values = %d "
 2221             "(%.2f percent)\n"), v, dset->varname[v],
 2222         nnon, 100 * nnfrac);
 2223     if ((nnon < 2 && dset->n > 2) || nnfrac < 0.05) {
 2224         /* if we got just a few non-numeric values, we'll assume
 2225            that the data file is broken
 2226         */
 2227         pprintf(prn, A_("ERROR: variable %d (%s), observation %d, "
 2228                 "expected numeric value\n"),
 2229             v, dset->varname[v], tn);
 2230         err = E_DATA;
 2231         break;
 2232     }
 2233     }
 2234 
 2235     if (!err) {
 2236     pputs(prn, _("allocating string table\n"));
 2237     *pst = gretl_string_table_new(list);
 2238     if (*pst == NULL) {
 2239         err = E_ALLOC;
 2240     }
 2241     }
 2242 
 2243     if (err) {
 2244     free(list);
 2245     } else {
 2246     *plist = list;
 2247     }
 2248 
 2249     return err;
 2250 }
 2251 
 2252 static int csv_non_numeric_check (csvdata *c, PRN *prn)
 2253 {
 2254     gretl_string_table *st = NULL;
 2255     int *nlist = NULL;
 2256     int err = 0;
 2257 
 2258     if (csv_as_matrix(c)) {
 2259     err = non_numeric_check(c->dset, &nlist, NULL, prn);
 2260     } else {
 2261     err = non_numeric_check(c->dset, &nlist, &st, prn);
 2262     }
 2263 
 2264     if (!err) {
 2265     c->codelist = nlist;
 2266     c->st = st;
 2267     }
 2268 
 2269     return err;
 2270 }
 2271 
 2272 /* Handle the case in "join" where the user specified some time
 2273    columns for conversion to numeric and also gave a specific format
 2274    for the conversion.
 2275 */
 2276 
 2277 static double special_time_val (const char *s, const char *fmt,
 2278                 int m_means_q)
 2279 {
 2280     struct tm t = {0};
 2281     char *test;
 2282 
 2283     test = strptime(s, fmt, &t);
 2284 
 2285     if (test == NULL || *test != '\0') {
 2286     /* conversion didn't work right */
 2287     return NADBL;
 2288     } else {
 2289     int y, m, d;
 2290 
 2291     y = t.tm_year + 1900;
 2292     m = t.tm_mon + 1;
 2293     d = t.tm_mday;
 2294 
 2295     if (m_means_q) {
 2296         /* convert to 1st month of quarter */
 2297         if (m == 2) m = 4;
 2298         else if (m == 3) m = 7;
 2299         else if (m == 4) m = 10;
 2300         else if (m != 1) {
 2301         return NADBL;
 2302         }
 2303     }
 2304 
 2305     if (d == 0) d = 1;
 2306 
 2307     return 10000*y + 100*m + d;
 2308     }
 2309 }
 2310 
 2311 static int char_count (char c, const char *s)
 2312 {
 2313     int n = 0;
 2314 
 2315     while (*s) {
 2316     if (*s == c) n++;
 2317     s++;
 2318     }
 2319 
 2320     return n;
 2321 }
 2322 
 2323 /* Follow-up check for the case where we think we might
 2324    have found a thousands separator: each occurrence of
 2325    the putative separator must be followed by exactly 3
 2326    digits: we set c->thousep to an invalid value if this
 2327    is not the case.
 2328 */
 2329 
 2330 static void validate_thousep (csvdata *c, const char *s)
 2331 {
 2332     int nd;
 2333 
 2334     while (*s) {
 2335     if (*s == c->thousep) {
 2336         nd = 0;
 2337         s++;
 2338         while (*s) {
 2339         if (isdigit(*s)) {
 2340             nd++;
 2341             s++;
 2342         } else {
 2343             break;
 2344         }
 2345         }
 2346         if (nd != 3) {
 2347         /* nope! */
 2348 #if CDEBUG
 2349         fprintf(stderr, "validate_thousep: no: '%c' is followed by %d digits\n",
 2350             c->thousep, nd);
 2351 #endif
 2352         c->thousep = -1;
 2353         break;
 2354         }
 2355     } else {
 2356         s++;
 2357     }
 2358     }
 2359 }
 2360 
 2361 /* Initial heuristic for detecting a thousands separator,
 2362    where the string @s has been determined to contain
 2363    nothing but digits, dot and comma (allowing for a leading
 2364    minus).
 2365 
 2366    1) If the string contains both comma and dot, whichever
 2367    character appears to the left cannot be the decimal
 2368    separator and may be a thousands separator.
 2369 
 2370    2) If more than one comma appears in the string, comma
 2371    cannot be the decimal character and might be a thousands
 2372    separator; mutatis mutandis for dot.
 2373 */
 2374 
 2375 static void test_for_thousands_sep (csvdata *c, const char *s)
 2376 {
 2377     const char *p1 = strrchr(s, '.');
 2378     const char *p2 = strrchr(s, ',');
 2379     char thousep = 0;
 2380 
 2381     if (p1 != NULL && p2 != NULL) {
 2382     thousep = (p2 - p1 > 0)? '.' : ',';
 2383     } else if (p1 != NULL && char_count('.', s) > 0) {
 2384     thousep = '.';
 2385     } else if (p2 != NULL && char_count(',', s) > 0) {
 2386     thousep = ',';
 2387     }
 2388 
 2389     if (c->thousep > 0) {
 2390     if (thousep != 0 && thousep != c->thousep) {
 2391         /* no consistent interpretation exists */
 2392         c->thousep = -1; /* invalid */
 2393     }
 2394     } else if (thousep != 0) {
 2395     /* we have a candidate for testing */
 2396     char *test, tmp[CSVSTRLEN];
 2397 
 2398     strcpy(tmp, s);
 2399     gretl_delchar(thousep, tmp);
 2400     if (thousep == '.' && get_local_decpoint() == '.') {
 2401         gretl_charsub(tmp, ',', '.');
 2402     }
 2403     errno = 0;
 2404     strtod(tmp, &test);
 2405     if (*test == '\0' && errno == 0) {
 2406         c->thousep = thousep;
 2407     }
 2408     }
 2409 
 2410     if (c->thousep && thousep != 0) {
 2411     validate_thousep(c, s);
 2412     }
 2413 }
 2414 
 2415 static int all_digits_and_seps (const char *s)
 2416 {
 2417     const char *test = "0123456789.,";
 2418 
 2419     if (*s == '-') s++;
 2420 
 2421     return strspn(s, test) == strlen(s);
 2422 }
 2423 
 2424 static double eval_non_numeric (csvdata *c, int i, const char *s)
 2425 {
 2426     double x = NON_NUMERIC;
 2427 
 2428     if (series_get_flags(c->dset, i) & VAR_TIMECOL) {
 2429     char *fmt = NULL;
 2430     int mq = 0;
 2431 
 2432     if (timecol_get_format(c->dset, i, &fmt, &mq)) {
 2433         /* the user gave a specific format for this */
 2434         x = special_time_val(s, fmt, mq);
 2435     } else {
 2436         /* default: ISO 8601 extended */
 2437         int y, m, d, n;
 2438 
 2439         n = sscanf(s, "%d-%d-%d", &y, &m, &d);
 2440         if (n == 3) {
 2441         x = 10000*y + 100*m + d;
 2442         } else {
 2443         x = NADBL;
 2444         }
 2445     }
 2446     } else if (c->thousep >= 0 && !csv_scrub_thousep(c)) {
 2447     /* Here we consider the possibility although @s does not
 2448        validate as numeric according to the C library, it is by
 2449        intent numeric but includes one or more thousands
 2450        separators.
 2451 
 2452        The condition c->thousep >= 0 requires that we haven't
 2453        already ruled out this interpretation due to inconsistency,
 2454        and !csv_scrub_thousep(c) requires that we're not on a
 2455        second pass through the data.
 2456     */
 2457     if (all_digits_and_seps(s)) {
 2458         test_for_thousands_sep(c, s);
 2459     }
 2460     }
 2461 
 2462     return x;
 2463 }
 2464 
 2465 static int converted_ok (const char *s, char *test, double x)
 2466 {
 2467     if (*test != '\0') {
 2468     if (errno) perror(s);
 2469     return 0; /* definitely not OK */
 2470     } else if (errno == ERANGE && fabs(x) > 0 && fabs(x) < 0.001) {
 2471     return 1; /* subnormal, but we'll let that pass */
 2472     } else if (errno) {
 2473     perror(s);
 2474     return 0;
 2475     } else {
 2476     return 1;
 2477     }
 2478 }
 2479 
 2480 static char *csv_unquote (char *s)
 2481 {
 2482     if (s[0] == '"') {
 2483     int i, n = strlen(s);
 2484 
 2485     if (n > 1 && s[n-1] == '"') {
 2486         for (i=0; i<n-2; i++) {
 2487         s[i] = s[i+1];
 2488         }
 2489         s[i] = '\0';
 2490     }
 2491     }
 2492     return s;
 2493 }
 2494 
 2495 static double csv_atof (csvdata *c, int i)
 2496 {
 2497     char tmp[CSVSTRLEN], clean[CSVSTRLEN];
 2498     double x = NON_NUMERIC;
 2499     const char *s = c->str;
 2500     char *test;
 2501 
 2502     if (csv_scrub_thousep(c) && strchr(s, c->thousep) &&
 2503     all_digits_and_seps(s)) {
 2504     /* second pass through the data: pre-process fields
 2505        that we reckon include thousands separators
 2506     */
 2507     strcpy(clean, s);
 2508     gretl_delchar(c->thousep, clean);
 2509     s = clean;
 2510     }
 2511 
 2512     if (c->decpoint == '.' || !csv_do_dotsub(c) || strchr(s, ',') == NULL) {
 2513     /* either we're currently set to the correct locale,
 2514        or there's no problematic decimal point in @s
 2515     */
 2516     errno = 0;
 2517     x = strtod(s, &test);
 2518     if (converted_ok(s, test, x)) {
 2519         return x; /* handled */
 2520     }
 2521     } else if (csv_do_dotsub(c)) {
 2522     /* in C numeric locale: substitute dot for comma */
 2523     strcpy(tmp, s);
 2524     gretl_charsub(tmp, ',', '.');
 2525     errno = 0;
 2526     x = strtod(tmp, &test);
 2527     if (converted_ok(s, test, x)) {
 2528         return x; /* handled */
 2529     }
 2530     }
 2531 
 2532     if (c->decpoint == '.' && strchr(s, ',') != NULL) {
 2533     /* try remediation for decimal comma? */
 2534     strcpy(tmp, s);
 2535     gretl_charsub(tmp, ',', '.');
 2536     errno = 0;
 2537     x = strtod(tmp, &test);
 2538     if (converted_ok(s, test, x)) {
 2539         return x; /* handled */
 2540     }
 2541     }
 2542 
 2543     /* fallback */
 2544     /* revised 2020-02-13 to use csv_unquote */
 2545     return eval_non_numeric(c, i, csv_unquote(c->str));
 2546 }
 2547 
 2548 static int process_csv_obs (csvdata *c, int i, int t, int *miss_shown,
 2549                 PRN *prn)
 2550 {
 2551     int err = 0;
 2552 
 2553     if (c->st != NULL) {
 2554     /* second round, handling string-valued variables */
 2555     if (in_gretl_list(c->codelist, i)) {
 2556         double zit = c->dset->Z[i][t];
 2557         int ix;
 2558 
 2559         if (na(zit) && *c->str != '\0' && c->user_na == NULL) {
 2560         /* by default (no user_na) only blanks count as NAs */
 2561         zit = NON_NUMERIC;
 2562         }
 2563         if (!na(zit)) {
 2564         ix = gretl_string_table_index(c->st, c->str, i, 0, prn);
 2565         if (ix > 0) {
 2566             c->dset->Z[i][t] = (double) ix;
 2567         } else {
 2568             err = E_DATA;
 2569         }
 2570         }
 2571     }
 2572     } else if (csv_missval(c->str, i, t+1, miss_shown, prn)) {
 2573     c->dset->Z[i][t] = NADBL;
 2574     } else {
 2575     gretl_strstrip(c->str);
 2576     c->dset->Z[i][t] = csv_atof(c, i);
 2577     }
 2578 
 2579     return err;
 2580 }
 2581 
 2582 /* Emulation of fgets(), designed to handle any sort of line
 2583    termination (unix, DOS, Mac or even an unholy mixture).
 2584    Line-endings are converted to LF (0x0a).
 2585 */
 2586 
 2587 static char *csv_fgets (csvdata *cdata, gzFile fp)
 2588 {
 2589     char *s = cdata->line;
 2590     int n = cdata->maxlinelen;
 2591     int i, c1, c = 0;
 2592 
 2593     for (i=0; i<n-1 && c!=0x0a; i++) {
 2594     c = gzgetc(fp);
 2595     if (c == EOF) {
 2596         if (i == 0) {
 2597         /* signal end of read */
 2598         return NULL;
 2599         } else {
 2600         break;
 2601         }
 2602     } else if (c == 0x0d) {
 2603         /* CR: convert to LF and peek at next char: if it's
 2604            LF swallow it, otherwise put it back */
 2605         c = 0x0a;
 2606         c1 = gzgetc(fp);
 2607         if (c1 != 0x0a) {
 2608         gzungetc(c1, fp);
 2609         }
 2610     }
 2611     s[i] = c;
 2612     }
 2613 
 2614     s[i] = '\0';
 2615 
 2616     return s;
 2617 }
 2618 
 2619 /* pick up any comments following the data block in a CSV file */
 2620 
 2621 static char *get_csv_descrip (csvdata *c, gzFile fp)
 2622 {
 2623     char *line = c->line;
 2624     char *desc = NULL;
 2625     size_t llen, totlen;
 2626 
 2627     while (csv_fgets(c, fp)) {
 2628     tailstrip(line);
 2629     llen = strlen(line);
 2630     if (desc == NULL) {
 2631         totlen = llen + 4;
 2632         desc = malloc(totlen);
 2633         if (desc == NULL) {
 2634         return NULL;
 2635         }
 2636         sprintf(desc, "%s\n", line);
 2637     } else {
 2638         char *tmp;
 2639 
 2640         totlen = strlen(desc) + llen + 4;
 2641         tmp = realloc(desc, totlen);
 2642         if (tmp == NULL) {
 2643         free(desc);
 2644         return NULL;
 2645         }
 2646         desc = tmp;
 2647         strcat(desc, line);
 2648         strcat(desc, "\n");
 2649     }
 2650     }
 2651 
 2652     if (desc != NULL && string_is_blank(desc)) {
 2653     free(desc);
 2654     desc = NULL;
 2655     }
 2656 
 2657     return desc;
 2658 }
 2659 
 2660 static const char *
 2661 csv_msg = N_("\nPlease note:\n"
 2662          "- The first row of the CSV file should contain the "
 2663          "names of the variables.\n"
 2664          "- The first column may optionally contain date "
 2665          "strings or other 'markers':\n  in that case its row 1 entry "
 2666          "should be blank, or should say 'obs' or 'date'.\n"
 2667          "- The remainder of the file must be a rectangular "
 2668          "array of data.\n");
 2669 
 2670 /* Here we check whether we get a consistent reading on
 2671    the number of fields per line in the CSV file
 2672 */
 2673 
 2674 static int csv_fields_check (gzFile fp, csvdata *c, PRN *prn)
 2675 {
 2676     int gotdata = 0;
 2677     int chkcols = 0;
 2678     int err = 0;
 2679 
 2680     c->ncols = c->nrows = 0;
 2681 
 2682     if (csv_has_bom(c)) {
 2683     gzseek(fp, 3, SEEK_SET);
 2684     }
 2685 
 2686     while (csv_fgets(c, fp) && !err) {
 2687 
 2688     /* skip comment lines */
 2689     if (*c->line == '#') {
 2690         continue;
 2691     }
 2692 
 2693     /* skip blank lines -- but finish if the blank comes after data */
 2694     if (string_is_blank(c->line)) {
 2695         if (gotdata) {
 2696         if (!csv_have_data(c)) {
 2697             c->descrip = get_csv_descrip(c, fp);
 2698         }
 2699         break;
 2700         } else {
 2701         continue;
 2702         }
 2703     }
 2704 
 2705     c->nrows += 1;
 2706 
 2707     if (fixed_format(c)) {
 2708         tailstrip(c->line);
 2709         gotdata = 1;
 2710         chkcols = strlen(c->line);
 2711         if (chkcols < c->cols_list[c->cols_list[0]]) {
 2712         gretl_errmsg_set(_("Invalid column specification"));
 2713         err = E_DATA;
 2714         break;
 2715         } else {
 2716         continue;
 2717         }
 2718     }
 2719 
 2720     compress_csv_line(c, 1);
 2721 
 2722     if (!gotdata) {
 2723         /* scrutinize the first "real" line */
 2724         check_first_field(c->line, c, prn);
 2725         gotdata = 1;
 2726     }
 2727 
 2728     chkcols = count_csv_fields(c);
 2729     if (c->ncols == 0) {
 2730         c->ncols = chkcols;
 2731         pprintf(prn, A_("   number of columns = %d\n"), c->ncols);
 2732     } else if (chkcols != c->ncols) {
 2733         pprintf(prn, A_("   ...but row %d has %d fields: aborting\n"),
 2734             c->nrows, chkcols);
 2735         err = E_DATA;
 2736     } else if (cols_subset(c)) {
 2737         int datacols = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;
 2738 
 2739         if (c->cols_list[c->cols_list[0]] > datacols) {
 2740         gretl_errmsg_set(_("Invalid column specification"));
 2741         err = E_DATA;
 2742         }
 2743     }
 2744     }
 2745 
 2746     if (!err && fixed_format(c)) {
 2747     c->ncols = c->cols_list[0];
 2748     }
 2749 
 2750     return err;
 2751 }
 2752 
 2753 static void strip_illegals (char *s)
 2754 {
 2755     char name[VNAMELEN] = {0};
 2756     int i, j = 0;
 2757 
 2758     for (i=0; s[i] != '\0'; i++) {
 2759     if (isalnum(s[i]) || s[i] == '_') {
 2760         name[j++] = s[i];
 2761     }
 2762     }
 2763 
 2764     name[j] = '\0';
 2765     strcpy(s, name);
 2766 }
 2767 
 2768 static int intercept_nan_as_name (const char *s)
 2769 {
 2770     if (strlen(s) == 3) {
 2771     char screen[4];
 2772 
 2773     strcpy(screen, s);
 2774     gretl_lower(screen);
 2775     if (!strcmp(screen, "nan")) {
 2776         return 1;
 2777     }
 2778     }
 2779 
 2780     return 0;
 2781 }
 2782 
 2783 static int csv_is_numeric (const char *s, csvdata *c)
 2784 {
 2785     int ret = 0;
 2786 
 2787     if (c->decpoint == '.') {
 2788     ret = numeric_string(s);
 2789     } else {
 2790     /* decimal comma in force */
 2791     char *tmp = gretl_strdup(s);
 2792 
 2793     gretl_charsub(tmp, ',', '.');
 2794     ret = numeric_string(tmp);
 2795     free(tmp);
 2796     }
 2797 
 2798     return ret;
 2799 }
 2800 
 2801 static int process_csv_varname (csvdata *c, int j, int *numcount,
 2802                 PRN *prn)
 2803 {
 2804     char *vname = c->dset->varname[j];
 2805     char *src = c->str;
 2806     int err = 0;
 2807 
 2808     *vname = '\0';
 2809 
 2810     if (intercept_nan_as_name(src)) {
 2811     gretl_errmsg_sprintf(_("If '%s' is intended as the name of a variable, "
 2812                    "please change it --\nstrings of this sort usually "
 2813                    "mean 'not a number'."), src);
 2814     err = E_DATA;
 2815     } else if (*src == '\0') {
 2816     fprintf(stderr, "variable name %d is missing\n", j);
 2817     sprintf(vname, "v%d", j);
 2818     } else if (csv_is_numeric(src, c)) {
 2819     *numcount += 1;
 2820     } else {
 2821     const char *s = src;
 2822 
 2823     while (*s && !isalpha(*s)) s++;
 2824     if (*s == '\0') {
 2825         fprintf(stderr, "variable name %d (%s) is garbage\n", j, src);
 2826         sprintf(vname, "v%d", j);
 2827     } else {
 2828         strncat(vname, s, VNAMELEN - 1);
 2829     }
 2830     iso_to_ascii(vname);
 2831     strip_illegals(vname);
 2832     if (check_varname(vname)) {
 2833         errmsg(1, prn);
 2834         err = E_DATA;
 2835     }
 2836     }
 2837 
 2838     return err;
 2839 }
 2840 
 2841 static int csv_reconfigure_for_markers (DATASET *dset)
 2842 {
 2843     int err = dataset_allocate_obs_markers(dset);
 2844 
 2845     if (!err) {
 2846     err = dataset_drop_last_variables(dset, 1);
 2847     }
 2848 
 2849     return err;
 2850 }
 2851 
 2852 static int skip_data_column (csvdata *c, int k)
 2853 {
 2854     int col = csv_skip_col_1(c) ? k : k + 1;
 2855 
 2856     if (!in_gretl_list(c->cols_list, col)) {
 2857     return 1;
 2858     } else {
 2859     return 0;
 2860     }
 2861 }
 2862 
 2863 /* special fix-up for column names in the context of "join":
 2864    the algorithm here is also used in the userspace fixname()
 2865    function
 2866 */
 2867 
 2868 void normalize_join_colname (char *targ, const char *src,
 2869                  int underscore, int k)
 2870 {
 2871     const char *letters = "abcdefghijklmnopqrstuvwxyz"
 2872     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 2873     int i = 0;
 2874 
 2875     /* skip any leading non-letters */
 2876     src += strcspn(src, letters);
 2877 
 2878     while (*src && i < VNAMELEN - 1) {
 2879     if (strspn(src, letters) > 0 || isdigit(*src) || *src == '_') {
 2880         /* transcribe valid characters */
 2881         targ[i++] = *src;
 2882     } else if (*src == ' ' || underscore) {
 2883         /* convert space to underscore */
 2884         if (i > 0 && targ[i-1] == '_') {
 2885         ; /* skip */
 2886         } else {
 2887         targ[i++] = '_';
 2888         }
 2889     }
 2890     src++;
 2891     }
 2892 
 2893     if (i > 0) {
 2894     targ[i] = '\0';
 2895     } else if (k <= 0) {
 2896     strcpy(targ, "col[n]");
 2897     } else {
 2898     sprintf(targ, "col%d", k);
 2899     }
 2900 }
 2901 
 2902 static int update_join_cols_list (csvdata *c, int k)
 2903 {
 2904     int *test;
 2905     int err = 0;
 2906 
 2907     test = gretl_list_append_term(&c->cols_list, k);
 2908     if (test == NULL) {
 2909     err = E_ALLOC;
 2910     }
 2911 
 2912 #if CDEBUG
 2913     printlist(c->cols_list, "c->cols_list for join");
 2914 #endif
 2915 
 2916     return err;
 2917 }
 2918 
 2919 /* handle_join_varname: the index @k contains the column number
 2920    relative to the entire CSV file, while @pj points to j, the column
 2921    number relative to the reduced dataset that will be constructed by
 2922    selection of columns from the file.
 2923 
 2924    Here we're examining a column heading read from file (c->str) to
 2925    see whether it matches any of the column-names required for an
 2926    ongoing join operation (held in c->jspec->colnames). If so, we
 2927    write the index j into the appropriate slot in c->jspec->colnums
 2928    (which starts off filled with zeros), so the joiner will know where
 2929    to find the required data. (The j value is bound to be at least 1
 2930    since column 0 is reserved to the constant.)
 2931 
 2932    In some cases a given named column may perform more than one role in
 2933    a join operation -- for example, it may serve as an element in a
 2934    filter and also as the auxiliary variable in an "aggregation"
 2935    method. To allow for this we don't stop scanning at the first match
 2936    of c->str with a required column name.
 2937 
 2938    The call to update_join_cols_list() uses the index @k to record the
 2939    overall column position of "wanted data", for use by the CSV
 2940    reader.
 2941 */
 2942 
 2943 static int handle_join_varname (csvdata *c, int k, int *pj)
 2944 {
 2945     const char *colname;
 2946     char okname[VNAMELEN];
 2947     int matched = 0;
 2948     int i, j = *pj;
 2949 
 2950     if (!csv_skip_col_1(c)) {
 2951     k++;
 2952     }
 2953 
 2954     if (csv_no_header(c)) {
 2955     sprintf(okname, "col%d", k);
 2956     } else {
 2957     /* convert to valid gretl identifier */
 2958     normalize_join_colname(okname, c->str, 0, k);
 2959     }
 2960 
 2961 #if CDEBUG
 2962     fprintf(stderr, "handle_join_varname: looking at '%s' (%s)\n", c->str, okname);
 2963 #endif
 2964 
 2965     for (i=0; i<c->jspec->ncols; i++) {
 2966     /* find "wanted name" i */
 2967     colname = c->jspec->colnames[i];
 2968     if (colname == NULL || c->jspec->colnums[i] > 0) {
 2969         /* name not wanted, or already found */
 2970         continue;
 2971     }
 2972     if (!strcmp(okname, colname)) {
 2973 #if CDEBUG
 2974         fprintf(stderr, " target %d matched at CSV col %d, j=%d\n", i, k, j);
 2975 #endif
 2976         c->jspec->colnums[i] = j;
 2977         if (!matched) {
 2978         matched = 1;
 2979         strcpy(c->dset->varname[j], okname);
 2980         update_join_cols_list(c, k);
 2981         *pj += 1;
 2982         if (in_gretl_list(c->jspec->timecols, i)) {
 2983             series_set_flag(c->dset, j, VAR_TIMECOL);
 2984         }
 2985         }
 2986     }
 2987     }
 2988 
 2989     return 0;
 2990 }
 2991 
 2992 #define starts_number(c) (isdigit((unsigned char) c) || c == '-' || \
 2993                           c == '+' || c == '.')
 2994 
 2995 #define obs_labels_no_varnames(o,c,n)  (!o && c->v > 3 && n == c->v - 2)
 2996 
 2997 static int csv_varname_scan (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
 2998 {
 2999     char *p;
 3000     int obscol = csv_has_obs_column(c);
 3001     int i, j, k, numcount;
 3002     int err = 0;
 3003 
 3004     if (!csv_no_header(c)) {
 3005     pputs(mprn, A_("scanning for variable names...\n"));
 3006     }
 3007 
 3008     if (csv_has_bom(c)) {
 3009     gzseek(fp, 3, SEEK_SET);
 3010     }
 3011 
 3012     while (csv_fgets(c, fp)) {
 3013     if (*c->line == '#' || string_is_blank(c->line)) {
 3014         continue;
 3015     } else {
 3016         break;
 3017     }
 3018     }
 3019 
 3020     c->datapos = gztell(fp);
 3021 
 3022     compress_csv_line(c, 1);
 3023 
 3024     p = c->line;
 3025     if (c->delim == ' ' && *p == ' ') p++;
 3026     iso_to_ascii(p);
 3027 
 3028     if (strlen(p) > 118) {
 3029     pprintf(mprn, A_("   line: %.115s...\n"), p);
 3030     } else {
 3031     pprintf(mprn, A_("   line: %s\n"), p);
 3032     }
 3033 
 3034     numcount = 0;
 3035     j = 1; /* for the constant */
 3036 
 3037     for (k=0; k<c->ncols && !err; k++) {
 3038     i = 0;
 3039     while (*p && *p != c->delim) {
 3040         if (i < CSVSTRLEN - 1) {
 3041         c->str[i++] = *p;
 3042         }
 3043         p++;
 3044     }
 3045     c->str[i] = '\0';
 3046     if (*p == c->delim) p++;
 3047 
 3048     if (k == 0 && csv_skip_col_1(c)) {
 3049         ; /* no-op */
 3050     } else if (!joining(c) && cols_subset(c) && skip_data_column(c, k)) {
 3051         ; /* no-op */
 3052     } else {
 3053         if (joining(c)) {
 3054         handle_join_varname(c, k, &j);
 3055         } else if (probing(c) && csv_no_header(c)) {
 3056         sprintf(c->dset->varname[j], "col%d", j);
 3057         j++;
 3058         } else {
 3059         err = process_csv_varname(c, j, &numcount, prn);
 3060         j++;
 3061         }
 3062     }
 3063     if (j == c->dset->v) {
 3064 #if CDEBUG
 3065         fprintf(stderr, "breaking on j = %d (k = %d)\n", j, k);
 3066 #endif
 3067         break;
 3068     }
 3069     }
 3070 
 3071     if (!err && joining(c) && c->cols_list == NULL) {
 3072     /* no relevant columns were found */
 3073     gretl_errmsg_set("No relevant columns were found");
 3074     err = E_UNKVAR;
 3075     }
 3076 
 3077     if (err) {
 3078     return err;
 3079     }
 3080 
 3081     if (csv_no_header(c) || numcount == c->dset->v - 1 ||
 3082     obs_labels_no_varnames(obscol, c->dset, numcount)) {
 3083     if (!csv_no_header(c)) {
 3084         pputs(prn, A_("it seems there are no variable names\n"));
 3085         /* then we undercounted the observations by one? */
 3086         if (!rows_subset(c)) {
 3087         err = add_single_obs(c->dset);
 3088         }
 3089     }
 3090     if (!err) {
 3091         /* set up to handle the "no varnames" case */
 3092         csv_set_autoname(c);
 3093         c->datapos = csv_has_bom(c) ? 3 : 0;
 3094         if (!csv_all_cols(c)) {
 3095         if (obs_labels_no_varnames(obscol, c->dset, numcount)) {
 3096             err = csv_reconfigure_for_markers(c->dset);
 3097             if (!err) {
 3098             csv_set_obs_column(c);
 3099             }
 3100         }
 3101         }
 3102     }
 3103     } else if (numcount > 0) {
 3104     for (i=1; i<c->dset->v; i++) {
 3105         if (check_varname(c->dset->varname[i])) {
 3106         errmsg(1, prn);
 3107         break;
 3108         }
 3109     }
 3110     fprintf(stderr, "numcount = %d\n", numcount);
 3111     err = E_DATA;
 3112     }
 3113 
 3114     return err;
 3115 }
 3116 
 3117 static int row_not_wanted (csvdata *c, int t)
 3118 {
 3119     if (c->rowmask != NULL) {
 3120     if (t >= c->masklen) {
 3121         return 1;
 3122     } else if (gretl_vector_get(c->rowmask, t) == 0) {
 3123         return 1;
 3124     }
 3125     }
 3126 
 3127     return 0;
 3128 }
 3129 
 3130 /* read numerical data when we've been given a fixed column-reading
 3131    specification */
 3132 
 3133 static int fixed_format_read (csvdata *c, gzFile fp, PRN *prn)
 3134 {
 3135     char *p;
 3136     int miss_shown = 0;
 3137     int *missp = NULL;
 3138     int t = 0, s = 0;
 3139     int i, k, n, m;
 3140     int err = 0;
 3141 
 3142     c->real_n = c->dset->n;
 3143 
 3144     if (csv_has_bom(c)) {
 3145     gzseek(fp, 3, SEEK_SET);
 3146     }
 3147 
 3148     if (csv_is_verbose(c)) {
 3149     missp = &miss_shown;
 3150     }
 3151 
 3152     while (csv_fgets(c, fp) && !err) {
 3153     tailstrip(c->line);
 3154     if (*c->line == '#' || string_is_blank(c->line)) {
 3155         continue;
 3156     }
 3157     if (row_not_wanted(c, s)) {
 3158         s++;
 3159         continue;
 3160     }
 3161     m = strlen(c->line);
 3162     for (i=1; i<=c->ncols && !err; i++) {
 3163         k = c->cols_list[i];
 3164         n = c->width_list[i];
 3165         if (k + n - 1 > m) {
 3166         /* attempting to read out of bounds */
 3167         fprintf(stderr, "row %d, column %d: start=%d, width=%d, "
 3168             "but line length = %d\n", t+1, i, k, n, m);
 3169         err = E_DATA;
 3170         break;
 3171         }
 3172         p = c->line + k - 1;
 3173         *c->str = '\0';
 3174         strncat(c->str, p, n);
 3175         /* Added 2016-11-16: allow trailing blanks in a field
 3176            of specified width. This is required for handling
 3177            US CPS data.
 3178         */
 3179         tailstrip(c->str);
 3180         if (csv_missval(c->str, i, t+1, missp, prn)) {
 3181         c->dset->Z[i][t] = NADBL;
 3182         } else {
 3183         c->dset->Z[i][t] = csv_atof(c, i);
 3184         if (c->dset->Z[i][t] == NON_NUMERIC) {
 3185             gretl_errmsg_sprintf(_("At row %d, column %d:\n"), t+1, k);
 3186             gretl_errmsg_sprintf(_("'%s' -- no numeric conversion performed!"),
 3187                      c->str);
 3188             err = E_DATA;
 3189         }
 3190         }
 3191     }
 3192     s++;
 3193     if (++t == c->dset->n) {
 3194         break;
 3195     }
 3196     }
 3197 
 3198     if (err == E_DATA) {
 3199     gretl_errmsg_set(_("Invalid column specification"));
 3200     }
 3201 
 3202     return err;
 3203 }
 3204 
 3205 #define XML1_OK(u) ((u>=0x0020 && u<=0xD7FF) || \
 3206             (u>=0xE000 && u<=0xFFFD))
 3207 
 3208 /* Check that an observation label contains only
 3209    valid UTF-8, and moreover that every character
 3210    is valid in XML 1.0. If not, try recoding from
 3211    ISO 8859.
 3212 */
 3213 
 3214 static int maybe_fix_csv_string (gchar *s)
 3215 {
 3216     int err = 0;
 3217 
 3218     if (!g_utf8_validate(s, -1, NULL)) {
 3219     GError *gerr = NULL;
 3220     gsize wrote = 0;
 3221     gchar *tr;
 3222 
 3223     /* try for iso-8859? */
 3224     tr = g_convert(s, -1, "UTF-8", "ISO-8859-15",
 3225                NULL, &wrote, &gerr);
 3226     if (gerr != NULL) {
 3227         gretl_errmsg_set(gerr->message);
 3228         g_error_free(gerr);
 3229         err = E_DATA;
 3230     } else {
 3231         *s = '\0';
 3232         gretl_utf8_strncat(s, tr, CSVSTRLEN-1);
 3233         g_free(tr);
 3234     }
 3235     }
 3236 
 3237     if (!err) {
 3238     int i, n = g_utf8_strlen(s, -1);
 3239     gunichar u;
 3240 
 3241     for (i=0; i<n; i++) {
 3242         u = g_utf8_get_char(s);
 3243         if (!XML1_OK(u)) {
 3244         return 0;
 3245         }
 3246         s = g_utf8_next_char(s);
 3247     }
 3248     }
 3249 
 3250     return err;
 3251 }
 3252 
 3253 static void transcribe_obs_label (csvdata *c, int t)
 3254 {
 3255     char *s = c->str;
 3256     char c0 = *s;
 3257     int n = strlen(s);
 3258 
 3259     /* skip a leading quote, and unquote fully
 3260        if a matching trailing quote is found
 3261     */
 3262 
 3263     if (c0 == '"' || c0 == '\'') {
 3264     if (s[n-1] == c0) {
 3265         s[n-1] = '\0';
 3266         n--;
 3267     }
 3268     s++;
 3269     n--;
 3270     /* and once more, with feeling... */
 3271     if (s[0] == '\'') {
 3272         s++;
 3273         n--;
 3274     }
 3275     }
 3276 
 3277     if (n > OBSLEN - 1) {
 3278     n = OBSLEN - 1;
 3279     }
 3280 
 3281     c->dset->S[t][0] = '\0';
 3282     gretl_utf8_strncat(c->dset->S[t], s, n);
 3283 }
 3284 
 3285 static int real_read_labels_and_data (csvdata *c, gzFile fp, PRN *prn)
 3286 {
 3287     char *p;
 3288     int miss_shown = 0;
 3289     int *missp = NULL;
 3290     int truncated = 0;
 3291     int t = 0, s = 0;
 3292     int i, j, k;
 3293     int err = 0;
 3294 
 3295     if (csv_is_verbose(c)) {
 3296     missp = &miss_shown;
 3297     }
 3298 
 3299     c->real_n = c->dset->n;
 3300 
 3301     while (csv_fgets(c, fp) && !err) {
 3302     int inquote = 0;
 3303 
 3304     if (*c->line == '#' || string_is_blank(c->line)) {
 3305         continue;
 3306     } else if (*c->skipstr != '\0' && strstr(c->line, c->skipstr)) {
 3307         c->real_n -= 1;
 3308         continue;
 3309     } else if (row_not_wanted(c, s)) {
 3310         s++;
 3311         continue;
 3312     }
 3313 
 3314     compress_csv_line(c, 0);
 3315     p = c->line;
 3316 
 3317     if (c->delim == ' ') {
 3318         if (*p == ' ') p++;
 3319     } else {
 3320         p += strspn(p, " ");
 3321     }
 3322 
 3323     j = 1;
 3324     for (k=0; k<c->ncols && !err; k++) {
 3325         i = 0;
 3326         while (*p) {
 3327         if (csv_keep_quotes(c) && *p == c->qchar) {
 3328             inquote = !inquote;
 3329         } else if (!inquote && *p == c->delim) {
 3330             break;
 3331         }
 3332         if (i < CSVSTRLEN - 1) {
 3333             c->str[i++] = *p;
 3334         } else {
 3335             truncated++;
 3336         }
 3337         p++;
 3338         }
 3339         c->str[i] = '\0';
 3340         err = maybe_fix_csv_string(c->str);
 3341         if (!err) {
 3342         if (k == 0 && csv_skip_col_1(c) && c->dset->S != NULL) {
 3343             transcribe_obs_label(c, t);
 3344         } else if (cols_subset(c) && skip_data_column(c, k)) {
 3345             ; /* no-op */
 3346         } else {
 3347             err = process_csv_obs(c, j++, t, missp, prn);
 3348         }
 3349         }
 3350         if (!err) {
 3351         /* prep for next column */
 3352         if (*p == c->delim) {
 3353             p++;
 3354         }
 3355         if (c->delim != ' ') {
 3356             p += strspn(p, " ");
 3357         }
 3358         }
 3359     }
 3360 
 3361     s++;
 3362     if (++t == c->dset->n) {
 3363         break;
 3364     }
 3365     }
 3366 
 3367     if (truncated) {
 3368     pprintf(prn, A_("warning: %d labels were truncated.\n"), truncated);
 3369     }
 3370 
 3371     if (!err && c->real_n < c->dset->n) {
 3372     int drop = c->dset->n - c->real_n;
 3373 
 3374     err = dataset_drop_observations(c->dset, drop);
 3375     }
 3376 
 3377     return err;
 3378 }
 3379 
 3380 /* When reading a CSV file, should we attempt to parse observation
 3381    strings as dates (and impose time-series structure on the data
 3382    if this is successful)? In general, yes, but maybe not if we're
 3383    reading the data in the context of a "join" operation, since
 3384    in this case automatic detection may collide with time-key
 3385    information supplied by the user. Current status: we'll skip
 3386    the auto-dating stuff when joining unless (a) it's a MIDAS
 3387    join (mixed frequencies) and the user has _not_ supplied any
 3388    time key specification.
 3389 */
 3390 
 3391 static int csv_skip_dates (csvdata *c)
 3392 {
 3393     if (c->jspec != NULL) {
 3394     /* with --aggr=spread (MIDAS) we'll need dates info,
 3395        unless the user have a time key spec
 3396     */
 3397     return c->jspec->auto_midas == 0;
 3398     } else {
 3399     return 0;
 3400     }
 3401 }
 3402 
 3403 static int csv_read_data (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
 3404 {
 3405     int reversed = csv_data_reversed(c);
 3406     int err;
 3407 
 3408     if (mprn != NULL) {
 3409     if (csv_all_cols(c)) {
 3410         pputs(mprn, A_("scanning for data...\n"));
 3411     } else {
 3412         pputs(mprn, A_("scanning for row labels and data...\n"));
 3413     }
 3414     }
 3415 
 3416     gzseek(fp, c->datapos, SEEK_SET);
 3417 
 3418     err = real_read_labels_and_data(c, fp, prn);
 3419 
 3420     if (!err && csv_skip_col_1(c) && !rows_subset(c) && !csv_skip_dates(c)) {
 3421     c->markerpd = test_markers_for_dates(c->dset, &reversed,
 3422                          c->skipstr, prn);
 3423     if (reversed) {
 3424         csv_set_data_reversed(c);
 3425     }
 3426     }
 3427 
 3428     return err;
 3429 }
 3430 
 3431 static void print_csv_parsing_header (const char *fname, PRN *prn)
 3432 {
 3433     if (!g_utf8_validate(fname, -1, NULL)) {
 3434     gchar *trfname = g_locale_to_utf8(fname, -1, NULL, NULL, NULL);
 3435 
 3436     pprintf(prn, "%s %s...\n", A_("parsing"), trfname);
 3437     g_free(trfname);
 3438     } else {
 3439     pprintf(prn, "%s %s...\n", A_("parsing"), fname);
 3440     }
 3441 }
 3442 
 3443 static int join_unique_columns (csvdata *c)
 3444 {
 3445     const char **cnames = c->jspec->colnames;
 3446     char *counted;
 3447     int i, j, ncols = 0;
 3448 
 3449     counted = calloc(c->jspec->ncols, 1);
 3450 
 3451     for (i=0; i<c->jspec->ncols; i++) {
 3452     if (cnames[i] != NULL && counted[i] == 0) {
 3453         counted[i] = 1;
 3454         /* mark any duplicates as counted too */
 3455         for (j=i+1; j<c->jspec->ncols; j++) {
 3456         if (cnames[j] != NULL && !strcmp(cnames[i], cnames[j])) {
 3457             counted[j] = 1;
 3458         }
 3459         }
 3460 #if CDEBUG
 3461         fprintf(stderr, "join_unique_columns: '%s'\n", cnames[i]);
 3462 #endif
 3463         ncols++;
 3464     }
 3465     }
 3466 
 3467     free(counted);
 3468 
 3469     return ncols;
 3470 }
 3471 
 3472 static int csv_set_dataset_dimensions (csvdata *c)
 3473 {
 3474     int err = 0;
 3475 
 3476     c->dset->v = 0;
 3477 
 3478     if (rows_subset(c)) {
 3479     c->dset->n = n_from_row_mask(c);
 3480     }
 3481 
 3482     if (fixed_format(c)) {
 3483     if (c->dset->n == 0) {
 3484         c->dset->n = c->nrows;
 3485     }
 3486     c->dset->v = c->ncols + 1;
 3487     } else {
 3488     int cols_wanted, cols_present;
 3489 
 3490     if (c->dset->n == 0) {
 3491         if (csv_no_header(c)) {
 3492         c->dset->n = c->nrows;
 3493         } else {
 3494         /* allow for varnames row */
 3495         c->dset->n = c->nrows - 1;
 3496         }
 3497     }
 3498 
 3499     cols_present = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;
 3500 
 3501     if (joining(c)) {
 3502         cols_wanted = join_unique_columns(c);
 3503     } else if (cols_subset(c)) {
 3504         cols_wanted = c->cols_list[0];
 3505     } else {
 3506         cols_wanted = cols_present;
 3507     }
 3508 
 3509     if (cols_wanted > cols_present) {
 3510         gretl_errmsg_set(_("Invalid column specification"));
 3511         err = E_DATA;
 3512     } else {
 3513         /* allow for the constant */
 3514         c->dset->v = cols_wanted + 1;
 3515     }
 3516     }
 3517 
 3518     if (probing(c)) {
 3519     /* don't allocate tons of space for data that
 3520        we won't read right now */
 3521     c->dset->n = 1;
 3522     }
 3523 
 3524 #if CDEBUG
 3525     if (joining(c)) {
 3526     fprintf(stderr, "csv dataset dimensions: v=%d, n=%d\n",
 3527         c->dset->v, c->dset->n);
 3528     }
 3529 #endif
 3530 
 3531     return err;
 3532 }
 3533 
 3534 /*
 3535  * real_import_csv:
 3536  * @fname: name of CSV file.
 3537  * @dset: dataset struct.
 3538  * @cols: column specification.
 3539  * @rows: row specification.
 3540  * @join: specification pertaining to "join" command.
 3541  * @probe: also pertains to "join" (via GUI).
 3542  * @pm: location of matrix to accept the data or NULL.
 3543  * @opt: use OPT_N to force interpretation of data colums containing
 3544  * strings as coded (non-numeric) values and not errors; use OPT_H
 3545  * to indicate absence of a header row; use OPT_A to indicate that
 3546  * all columns should be read as data series (i.e. do not try to
 3547  * interpret the first column as observation labels); for use of
 3548  * OPT_T see the help text for the "append" command.
 3549  * @prn: gretl printing struct (or NULL).
 3550  *
 3551  * Open a Comma-Separated Values data file and read the data into
 3552  * the current work space.
 3553  *
 3554  * Returns: 0 on successful completion, non-zero otherwise.
 3555  */
 3556 
 3557 static int real_import_csv (const char *fname,
 3558                 DATASET *dset,
 3559                 const char *cols,
 3560                 const char *rows,
 3561                 joinspec *join,
 3562                 csvprobe *probe,
 3563                 gretl_matrix **pm,
 3564                 gretlopt opt,
 3565                 PRN *prn)
 3566 {
 3567     csvdata *c = NULL;
 3568     gzFile fp = NULL;
 3569     PRN *mprn = NULL;
 3570     gchar *altname = NULL;
 3571     int recode = 0;
 3572     int popit = 0;
 3573     int i, err = 0;
 3574 
 3575     import_na_init();
 3576 
 3577     if (prn != NULL) {
 3578     set_alt_gettext_mode(prn);
 3579     }
 3580 
 3581     if (gretl_messages_on()) {
 3582     mprn = prn;
 3583     }
 3584 
 3585     fp = gretl_gzopen(fname, "rb");
 3586     if (fp == NULL) {
 3587     pprintf(prn, A_("Couldn't open %s\n"), fname);
 3588     err = E_FOPEN;
 3589     goto csv_bailout;
 3590     }
 3591 
 3592     c = csvdata_new(dset);
 3593     if (c == NULL) {
 3594     err = E_ALLOC;
 3595     goto csv_bailout;
 3596     }
 3597 
 3598     recode = csv_unicode_check(fp, c, prn);
 3599     if (recode) {
 3600     err = csv_recode_input(&fp, fname, &altname, recode, prn);
 3601     if (err) {
 3602         goto csv_bailout;
 3603     }
 3604     }
 3605 
 3606     if (cols != NULL) {
 3607     err = csvdata_add_cols_list(c, cols, opt);
 3608     if (err) {
 3609         goto csv_bailout;
 3610     } else if (fixed_format(c)) {
 3611         pprintf(mprn, A_("using fixed column format\n"));
 3612     }
 3613     }
 3614 
 3615     if (rows != NULL) {
 3616     err = csvdata_add_row_mask(c, rows);
 3617     if (err) {
 3618         goto csv_bailout;
 3619     }
 3620     }
 3621 
 3622     if (opt & OPT_H) {
 3623     csv_set_no_header(c);
 3624     }
 3625 
 3626     if (join != NULL) {
 3627     c->jspec = join;
 3628     c->flags |= CSV_HAVEDATA;
 3629     } else if (probe != NULL) {
 3630     c->probe = probe;
 3631     c->flags |= CSV_HAVEDATA;
 3632     } else {
 3633     if (pm != NULL) {
 3634         csv_set_as_matrix(c);
 3635     }
 3636         if (opt & OPT_A) {
 3637         csv_set_all_cols(c);
 3638         }
 3639         if (opt & OPT_V) {
 3640             csv_set_verbose(c);
 3641         }
 3642     }
 3643 
 3644     if (opt & OPT_I) {
 3645     csv_unset_keep_quotes(c);
 3646     }
 3647 
 3648     if (mprn != NULL) {
 3649     print_csv_parsing_header(fname, mprn);
 3650     }
 3651 
 3652     /* get line length, also check for binary data, etc. */
 3653     c->maxlinelen = csv_max_line_length(fp, c, prn);
 3654     if (c->maxlinelen <= 0) {
 3655     err = E_DATA;
 3656     goto csv_bailout;
 3657     }
 3658 
 3659     if (csv_as_matrix(c) && csv_got_semi(c)) {
 3660     if (c->delim == ',' && csv_got_delim(c)) {
 3661         c->decpoint = ',';
 3662     }
 3663     c->delim = ';';
 3664     } else if (!fixed_format(c) && !csv_got_delim(c)) {
 3665     /* set default delimiter */
 3666     if (csv_got_tab(c)) {
 3667         c->delim = '\t';
 3668     } else if (csv_got_semi(c)) {
 3669         c->delim = ';';
 3670     } else {
 3671         c->delim = ' ';
 3672     }
 3673     }
 3674 
 3675 #if CDEBUG
 3676     fprintf(stderr, "fixed_format? %s; got_delim (%c)? %s; got_tab? %s; ",
 3677         fixed_format(c) ? "yes" : "no", c->delim,
 3678         csv_got_delim(c) ? "yes" : "no",
 3679         csv_got_tab(c)? "yes" : "no");
 3680     fprintf(stderr, "decpoint '%c'\n", c->decpoint);
 3681 #endif
 3682 
 3683     /* buffer to hold lines */
 3684     c->line = malloc(c->maxlinelen);
 3685     if (c->line == NULL) {
 3686     err = E_ALLOC;
 3687     goto csv_bailout;
 3688     }
 3689 
 3690  alt_delim:
 3691 
 3692     if (mprn != NULL) {
 3693     if (!fixed_format(c)) {
 3694         pprintf(mprn, A_("using delimiter '%c'\n"), c->delim);
 3695     }
 3696     pprintf(mprn, A_("   longest line: %d characters\n"), c->maxlinelen - 1);
 3697     }
 3698 
 3699     if (csv_has_trailing_comma(c) && c->delim != ',') {
 3700     csv_unset_trailing_comma(c);
 3701     }
 3702 
 3703     gzrewind(fp);
 3704 
 3705     /* read lines, check for consistency in number of fields */
 3706     err = csv_fields_check(fp, c, mprn);
 3707     if (err && !fixed_format(c)) {
 3708     if (c->delim != ';' && csv_got_semi(c)) {
 3709         c->delim = ';';
 3710         err = 0;
 3711         goto alt_delim;
 3712     }
 3713     pputs(prn, A_(csv_msg));
 3714     goto csv_bailout;
 3715     }
 3716 
 3717     err = csv_set_dataset_dimensions(c);
 3718     if (err) {
 3719     err = E_DATA;
 3720     goto csv_bailout;
 3721     }
 3722 
 3723     pprintf(mprn, A_("   number of variables: %d\n"), c->dset->v - 1);
 3724     pprintf(mprn, A_("   number of non-blank lines: %d\n"), c->nrows);
 3725 
 3726     if (c->dset->n == 0) {
 3727     pputs(prn, A_("Invalid data file\n"));
 3728     err = E_DATA;
 3729     goto csv_bailout;
 3730     }
 3731 
 3732     /* initialize CSV dataset */
 3733     err = start_new_Z(c->dset, 0);
 3734     if (!err && csv_skip_col_1(c)) {
 3735     err = dataset_allocate_obs_markers(c->dset);
 3736     }
 3737 
 3738     if (err) {
 3739     goto csv_bailout;
 3740     }
 3741 
 3742     /* second pass */
 3743 
 3744     gzrewind(fp);
 3745 
 3746     if (fixed_format(c)) {
 3747     err = fixed_format_read(c, fp, prn);
 3748     if (err) {
 3749         goto csv_bailout;
 3750     } else {
 3751         csv_set_autoname(c);
 3752         goto csv_continue;
 3753     }
 3754     }
 3755 
 3756     err = csv_varname_scan(c, fp, prn, mprn);
 3757     if (err || probing(c)) {
 3758     goto csv_bailout;
 3759     }
 3760 
 3761     if (c->decpoint == '.' && get_local_decpoint() == ',') {
 3762     /* we're in a locale that uses decimal comma:
 3763        switch to the C locale */
 3764     gretl_push_c_numeric_locale();
 3765     popit = 1;
 3766     } else if (c->decpoint == ',' && get_local_decpoint() == '.') {
 3767     /* dotsub: define this if we're in a '.' locale and
 3768        we've figured that the decimal character is ',' in
 3769        the file we're reading
 3770     */
 3771     csv_set_dotsub(c);
 3772     }
 3773 
 3774     err = csv_read_data(c, fp, prn, mprn);
 3775 
 3776     if (!err) {
 3777     /* try again, under certain conditions */
 3778     if (csv_skip_bad(c)) {
 3779         err = csv_read_data(c, fp, prn, NULL);
 3780     } else if (c->thousep > 0) {
 3781         pprintf(mprn, A_("WARNING: it seems '%c' is being used "
 3782                  "as thousands separator\n"), c->thousep);
 3783         c->decpoint = (c->thousep == '.')? ',' : '.';
 3784         if (c->decpoint == ',') {
 3785         if (get_local_decpoint() == '.') {
 3786             csv_set_dotsub(c);
 3787         } else if (popit) {
 3788             gretl_pop_c_numeric_locale();
 3789             popit = 0;
 3790         }
 3791         }
 3792         revise_non_numeric_values(c);
 3793         csv_set_scrub_thousep(c);
 3794         err = csv_read_data(c, fp, prn, NULL);
 3795     }
 3796     }
 3797 
 3798     if (!err && !probing(c)) {
 3799     err = csv_non_numeric_check(c, prn);
 3800     if (!err && csv_has_non_numeric(c)) {
 3801         /* try once more */
 3802         err = csv_read_data(c, fp, prn, NULL);
 3803     }
 3804     }
 3805 
 3806     if (popit) {
 3807     gretl_pop_c_numeric_locale();
 3808     }
 3809 
 3810     if (err) {
 3811     goto csv_bailout;
 3812     }
 3813 
 3814     if (csv_data_reversed(c)) {
 3815     reverse_data(c->dset, mprn);
 3816     }
 3817 
 3818  csv_continue:
 3819 
 3820     c->dset->t1 = 0;
 3821     c->dset->t2 = c->dset->n - 1;
 3822 
 3823     if (c->markerpd > 0) {
 3824     pputs(mprn, A_("taking date information from row labels\n\n"));
 3825     if (csv_skip_bad(c)) {
 3826         pprintf(prn, "WARNING: Check your data! gretl has stripped out "
 3827             "what appear to be\nextraneous lines in a %s dataset: "
 3828             "this may not be right.\n\n",
 3829             (c->dset->pd == 4)? "quarterly" : "monthly");
 3830     }
 3831     } else {
 3832     pputs(mprn, A_("treating these as undated data\n\n"));
 3833     dataset_obs_info_default(c->dset);
 3834     }
 3835 
 3836     if (c->dset->pd != 1 || strcmp(c->dset->stobs, "1")) {
 3837         c->dset->structure = TIME_SERIES;
 3838     }
 3839 
 3840     if (c->st != NULL) {
 3841     err = gretl_string_table_validate(c->st);
 3842     if (err) {
 3843         pputs(prn, A_("Failed to interpret the data as numeric\n"));
 3844         goto csv_bailout;
 3845     } else if (joining(c)) {
 3846         gretl_string_table_save(c->st, c->dset);
 3847     } else {
 3848         gretl_string_table_print(c->st, c->dset, fname, prn);
 3849     }
 3850     }
 3851 
 3852     if (csv_as_matrix(c)) {
 3853     /* FIXME placement of this */
 3854     if (csv_autoname(c)) {
 3855         strings_array_free(c->dset->varname, c->dset->v);
 3856         c->dset->varname = NULL;
 3857     }
 3858     *pm = gretl_matrix_data_subset(NULL, c->dset, -1, -1,
 3859                        M_MISSING_OK, &err);
 3860     goto csv_bailout;
 3861     }
 3862 
 3863     /* If there were observation labels and they were not interpretable
 3864        as dates, and they weren't simply "1, 2, 3, ...", then they
 3865        should probably be preserved; otherwise discard them.
 3866     */
 3867     if (c->dset->S != NULL && c->markerpd >= 0 &&
 3868     c->dset->markers != DAILY_DATE_STRINGS) {
 3869     dataset_destroy_obs_markers(c->dset);
 3870     }
 3871 
 3872     if (csv_autoname(c)) {
 3873     /* no variable names were found */
 3874     for (i=1; i<c->dset->v; i++) {
 3875         sprintf(c->dset->varname[i], "v%d", i);
 3876     }
 3877     } else {
 3878 #if CDEBUG
 3879     int ii;
 3880 
 3881     for (ii=0; ii<c->dset->v; ii++) {
 3882         fprintf(stderr, " c->dset->varname[%d] = '%s'\n", ii, c->dset->varname[ii]);
 3883     }
 3884 #endif
 3885     if (fix_varname_duplicates(c->dset)) {
 3886         pputs(prn, A_("warning: some variable names were duplicated\n"));
 3887     }
 3888     }
 3889 
 3890     if (!joining(c) && !probing(c)) {
 3891     int newdata = (dset->Z == NULL);
 3892 
 3893     /* not doing a special "join" operation */
 3894     err = merge_or_replace_data(dset, &c->dset, get_merge_opts(opt), prn);
 3895 
 3896     if (!err && newdata && c->descrip != NULL) {
 3897         dset->descrip = c->descrip;
 3898         c->descrip = NULL;
 3899     }
 3900 
 3901     if (!err && newdata) {
 3902         dataset_add_import_info(dset, fname, GRETL_CSV);
 3903     }
 3904     }
 3905 
 3906  csv_bailout:
 3907 
 3908     if (fp != NULL) {
 3909     gzclose(fp);
 3910     }
 3911 
 3912     if (!err && c->jspec != NULL) {
 3913     c->jspec->c = c;
 3914     } else if (!err && c->probe != NULL) {
 3915     c->probe->dset = c->dset;
 3916     c->dset = NULL;
 3917     csvdata_free(c);
 3918     } else {
 3919     csvdata_free(c);
 3920     }
 3921 
 3922     if (altname != NULL) {
 3923     gretl_remove(altname);
 3924     g_free(altname);
 3925     }
 3926 
 3927     if (err == E_ALLOC) {
 3928     pputs(prn, A_("Out of memory\n"));
 3929     }
 3930 
 3931     return err;
 3932 }
 3933 
 3934 /**
 3935  * import_csv:
 3936  * @fname: name of CSV file.
 3937  * @dset: dataset struct.
 3938  * @opt: use OPT_N to force interpretation of data colums containing
 3939  * strings as coded (non-numeric) values and not errors; for use of
 3940  * OPT_T see the help for "append".
 3941  * @prn: gretl printing struct (or NULL).
 3942  *
 3943  * Open a Comma-Separated Values data file and read the data into
 3944  * the current work space.
 3945  *
 3946  * Returns: 0 on successful completion, non-zero otherwise.
 3947  */
 3948 
 3949 int import_csv (const char *fname, DATASET *dset,
 3950         gretlopt opt, PRN *prn)
 3951 {
 3952     const char *cols = NULL;
 3953     const char *rows = NULL;
 3954     int ci, err;
 3955 
 3956     err = incompatible_options(opt, OPT_F | OPT_L);
 3957     if (err) {
 3958     /* --cols and --fixed-cols */
 3959     return err;
 3960     }
 3961 
 3962     ci = (dset != NULL && dset->v > 0)? APPEND : OPEN;
 3963 
 3964     if (opt & OPT_F) {
 3965     /* we should have a "--fixed-cols=XXX" specification */
 3966     cols = get_optval_string(ci, OPT_F);
 3967     if (cols == NULL || *cols == '\0') {
 3968         return E_PARSE;
 3969     }
 3970     } else if (opt & OPT_L) {
 3971     /* should have a "--cols=XXX" specification */
 3972     cols = get_optval_string(ci, OPT_L);
 3973     if (cols == NULL || *cols == '\0') {
 3974         return E_PARSE;
 3975     }
 3976     }
 3977 
 3978     if (opt & OPT_M) {
 3979     /* we should have a "--rowmask=XXX" specification */
 3980     rows = get_optval_string(ci, OPT_M);
 3981     if (rows == NULL || *rows == '\0') {
 3982         return E_PARSE;
 3983     }
 3984     }
 3985 
 3986     return real_import_csv(fname, dset, cols, rows,
 3987                NULL, NULL, NULL, opt, prn);
 3988 }
 3989 
 3990 gretl_matrix *import_csv_as_matrix (const char *fname, int *err)
 3991 {
 3992 #if CDEBUG
 3993     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
 3994 #else
 3995     PRN *prn = NULL;
 3996 #endif
 3997     gretl_matrix *m = NULL;
 3998     char csvname[MAXLEN] = {0};
 3999     gretlopt opt = OPT_A; /* --all-cols */
 4000     int http = 0;
 4001 
 4002     *err = try_http(fname, csvname, &http);
 4003 
 4004     if (!*err && http) {
 4005     *err = real_import_csv(csvname, NULL, NULL, NULL,
 4006                    NULL, NULL, &m, opt, prn);
 4007     } else if (!*err) {
 4008     *err = real_import_csv(fname, NULL, NULL, NULL,
 4009                    NULL, NULL, &m, opt, prn);
 4010     }
 4011 
 4012     gretl_print_destroy(prn);
 4013 
 4014     return m;
 4015 }
 4016 
 4017 static int probe_varnames_check (DATASET *dset, gretlopt opt,
 4018                  int *rerun)
 4019 {
 4020     int missnames = 0;
 4021     int i, err = 0;
 4022 
 4023     for (i=1; i<dset->v; i++) {
 4024     if (dset->varname[i][0] == '\0') {
 4025         missnames = 1;
 4026         break;
 4027     }
 4028     }
 4029 
 4030     if (missnames) {
 4031     if (opt & OPT_H) {
 4032         gretl_errmsg_set("Couldn't find all variable names");
 4033         err = E_DATA;
 4034     } else {
 4035         *rerun = 1;
 4036     }
 4037     }
 4038 
 4039     return err;
 4040 }
 4041 
 4042 /**
 4043  * probe_csv:
 4044  * @fname: name of CSV file.
 4045  * @varnames: location to receive variable names.
 4046  * @nvars: location to receive number of variables (columns).
 4047  * @opt: on input, may contain any extra options to pass to
 4048  * real_import_csv(); on return, OPT_H (indicating that the
 4049  * CSV file has no header) may be added if it seems to be
 4050  * required (no header).
 4051  *
 4052  * Open a Comma-Separated Values data file and read enough to
 4053  * determine the variable names.
 4054  *
 4055  * Returns: 0 on successful completion, non-zero otherwise.
 4056  */
 4057 
 4058 int probe_csv (const char *fname, char ***varnames,
 4059            int *nvars, gretlopt *opt)
 4060 {
 4061     csvprobe probe = {0};
 4062     int err;
 4063 
 4064     err = real_import_csv(fname, NULL, NULL, NULL, NULL,
 4065               &probe, NULL, *opt, NULL);
 4066 
 4067     if (!err) {
 4068     int rerun = 0;
 4069 
 4070     err = probe_varnames_check(probe.dset, *opt, &rerun);
 4071 
 4072     if (err || rerun) {
 4073         destroy_dataset(probe.dset);
 4074         probe.dset = NULL;
 4075     }
 4076 
 4077     if (!err && rerun) {
 4078         /* try again with --no-header flag */
 4079         *opt |= OPT_H;
 4080         err = real_import_csv(fname, NULL, NULL, NULL, NULL,
 4081                   &probe, NULL, *opt, NULL);
 4082     }
 4083 
 4084     if (!err) {
 4085         /* steal the varname array */
 4086         *varnames = probe.dset->varname;
 4087         *nvars = probe.dset->v;
 4088         probe.dset->varname = NULL;
 4089     }
 4090 
 4091     destroy_dataset(probe.dset);
 4092     }
 4093 
 4094     return err;
 4095 }
 4096 
 4097 int csv_open_needs_matrix (gretlopt opt)
 4098 {
 4099     int ret = 0;
 4100 
 4101     if (opt & OPT_M) {
 4102     /* --rowmask=matrix */
 4103     ret = 1;
 4104     } else if (opt & OPT_F) {
 4105     /* --fixed-cols=whatever */
 4106     const char *s = get_optval_string(OPEN, OPT_F);
 4107 
 4108     ret = get_matrix_by_name(s) != NULL;
 4109     }
 4110 
 4111     return ret;
 4112 }
 4113 
 4114 typedef double keynum;
 4115 
 4116 /* below: apparatus to implement the "join" command */
 4117 
 4118 struct jr_row_ {
 4119     int n_keys;     /* number of keys (needed for qsort callback) */
 4120     keynum keyval;  /* primary key value */
 4121     keynum keyval2; /* secondary key value, if applicable */
 4122     int micro;      /* high-frequency "key", if any */
 4123     int dset_row;   /* associated row in the RHS or outer dataset */
 4124     double aux;     /* auxiliary value */
 4125 };
 4126 
 4127 typedef struct jr_row_ jr_row;
 4128 
 4129 struct obskey_ {
 4130     char *timefmt; /* time format, as in strptime */
 4131     int keycol;    /* the column holding the outer time-key */
 4132     int m_means_q; /* "monthly means quarterly" */
 4133     int numdates;  /* flag for conversion from numeric to string */
 4134     int native;    /* native time-series info */
 4135 };
 4136 
 4137 typedef struct obskey_ obskey;
 4138 
 4139 struct joiner_ {
 4140     int n_rows;     /* number of rows in data table */
 4141     int n_keys;     /* number of keys used (0, 1 or 2) */
 4142     int n_unique;   /* number of unique primary key values on right */
 4143     jr_row *rows;   /* array of table rows */
 4144     keynum *keys;   /* array of unique (primary) key values as 64-bit ints */
 4145     int *key_freq;  /* counts of occurrences of (primary) key values */
 4146     int *key_row;   /* record of starting row in joiner table for primary keys */
 4147     int *str_keys;  /* flags for string comparison of key(s) */
 4148     const int *l_keyno; /* list of key columns in left-hand dataset */
 4149     const int *r_keyno; /* list of key columns in right-hand dataset */
 4150     AggrType aggr;      /* aggregation method for 1:n joining */
 4151     int seqval;         /* sequence number for aggregation */
 4152     int auxcol;         /* auxiliary data column for aggregation */
 4153     int midas_m;        /* midas frequency ratio */
 4154     int midas_pd;       /* frequency of outer dataset */
 4155     obskey *auto_keys;  /* struct to hold info on obs-based key(s) */
 4156     DATASET *l_dset;    /* the left-hand or inner dataset */
 4157     DATASET *r_dset;    /* the right-hand or outer temporary dataset */
 4158 };
 4159 
 4160 typedef struct joiner_ joiner;
 4161 
 4162 struct jr_filter_ {
 4163     const char *expr;  /* expression to be run through "genr" */
 4164     const double *val; /* (series) result of evaluating @expr */
 4165     char *vname1;      /* first right-hand variable name */
 4166     char *vname2;      /* second right-hand variable name */
 4167     char *vname3;      /* third right-hand variable name */
 4168 };
 4169 
 4170 typedef struct jr_filter_ jr_filter;
 4171 
 4172 static int expand_jspec (joinspec *jspec, int addvars);
 4173 
 4174 static void jr_filter_destroy (jr_filter *f)
 4175 {
 4176     if (f != NULL) {
 4177     free(f->vname1);
 4178     free(f->vname2);
 4179     free(f->vname3);
 4180     free(f);
 4181     }
 4182 }
 4183 
 4184 static void joiner_destroy (joiner *jr)
 4185 {
 4186     if (jr != NULL) {
 4187     free(jr->rows);
 4188     free(jr->keys);
 4189     free(jr->key_freq);
 4190     free(jr->key_row);
 4191     free(jr);
 4192     }
 4193 }
 4194 
 4195 static joiner *joiner_new (int nrows)
 4196 {
 4197     joiner *jr = malloc(sizeof *jr);
 4198 
 4199     if (jr != NULL) {
 4200     jr->rows = calloc(nrows, sizeof *jr->rows);
 4201     if (jr->rows == NULL) {
 4202         free(jr);
 4203         jr = NULL;
 4204     }
 4205     }
 4206 
 4207     if (jr != NULL) {
 4208     jr->n_rows = nrows;
 4209     jr->n_unique = 0;
 4210     jr->keys = NULL;
 4211     jr->key_freq = NULL;
 4212     jr->key_row = NULL;
 4213     jr->l_keyno = NULL;
 4214     jr->r_keyno = NULL;
 4215     }
 4216 
 4217     return jr;
 4218 }
 4219 
 4220 static int real_set_outer_auto_keys (joiner *jr, const char *s,
 4221                      int j, struct tm *tp)
 4222 {
 4223     int err = 0;
 4224 
 4225     if (calendar_data(jr->l_dset)) {
 4226     int y, m, d, eday;
 4227 
 4228     y = tp->tm_year + 1900;
 4229     m = tp->tm_mon + 1;
 4230     d = tp->tm_mday;
 4231     eday = epoch_day_from_ymd(y, m, d);
 4232     if (eday < 0) {
 4233         if (s != NULL) {
 4234         gretl_errmsg_sprintf("'%s' is not a valid date", s);
 4235         }
 4236         err = E_DATA;
 4237     } else {
 4238         jr->rows[j].n_keys = 1;
 4239         jr->rows[j].keyval = eday;
 4240         jr->rows[j].keyval2 = 0;
 4241         jr->rows[j].micro = 0;
 4242     }
 4243     } else {
 4244     int major = tp->tm_year + 1900;
 4245     int minor = tp->tm_mon + 1;
 4246     int micro = 0;
 4247 
 4248     if (jr->auto_keys->m_means_q) {
 4249         /* using the gretl-specific "%q" conversion */
 4250         if (minor > 4) {
 4251         gretl_errmsg_sprintf("'%s' is not a valid date", s);
 4252         err = E_DATA;
 4253         }
 4254     } else if (jr->l_dset->pd == 4) {
 4255         /* map from month on right to quarter on left, but
 4256            preserve the month info in case we need it
 4257         */
 4258         micro = minor;
 4259         minor = (int) ceil(minor / 3.0);
 4260     }
 4261     if (!err && micro == 0) {
 4262         micro = tp->tm_mday;
 4263     }
 4264     if (!err) {
 4265         jr->rows[j].n_keys = 2;
 4266         jr->rows[j].keyval = major;
 4267         jr->rows[j].keyval2 = minor;
 4268         jr->rows[j].micro = micro;
 4269     }
 4270     }
 4271 
 4272     return err;
 4273 }
 4274 
 4275 static int set_time_format (obskey *auto_keys, const char *fmt)
 4276 {
 4277     if (auto_keys->timefmt != NULL) {
 4278     free(auto_keys->timefmt);
 4279     }
 4280     auto_keys->timefmt = gretl_strdup(fmt);
 4281     return auto_keys->timefmt == NULL ? E_ALLOC : 0;
 4282 }
 4283 
 4284 /* convert a numerical value to string for use with strptime */
 4285 
 4286 static int numdate_to_string (char *targ, double x)
 4287 {
 4288     if (na(x)) {
 4289     return E_MISSDATA;
 4290     } else {
 4291     sprintf(targ, "%.16g", x);
 4292     return 0;
 4293     }
 4294 }
 4295 
 4296 /* Parse a string from row @i of the outer dataset and set the
 4297    key(s) on row @j of the joiner struct. The indices @i and @j may
 4298    not be equal if a filter is being used. Note: we don't come
 4299    here if the outer time-key column is subject to "tconvert"
 4300    treatment; in that case we use read_iso_basic instead.
 4301 */
 4302 
 4303 static int read_outer_auto_keys (joiner *jr, int j, int i)
 4304 {
 4305     char *tfmt = jr->auto_keys->timefmt;
 4306     int numdates = jr->auto_keys->numdates;
 4307     int tcol = jr->auto_keys->keycol;
 4308     int pd = jr->l_dset->pd;
 4309     struct tm t = {0};
 4310     char sconv[32];
 4311     const char *s;
 4312     char *test;
 4313     int s_src = 0;
 4314     int err = 0;
 4315 
 4316     if (tcol >= 0) {
 4317     /* using a specified column */
 4318     if (numdates) {
 4319         /* column is numeric, conversion needed */
 4320         numdate_to_string(sconv, jr->r_dset->Z[tcol][i]);
 4321         s = sconv;
 4322         s_src = 1;
 4323     } else {
 4324         /* column is string-valued, OK */
 4325         s = series_get_string_for_obs(jr->r_dset, tcol, i);
 4326         s_src = 2;
 4327     }
 4328     } else if (jr->auto_keys->native) {
 4329     /* using native time-series info on right */
 4330     ntolabel_8601(sconv, i, jr->r_dset);
 4331     s = sconv;
 4332     s_src = 1;
 4333     } else {
 4334     /* using first-column observation strings */
 4335     s = jr->r_dset->S[i];
 4336     s_src = 3;
 4337     }
 4338 
 4339     /* note: with strptime, a NULL return means that an error
 4340        occurred while a non-NULL and non-empty return string
 4341        means a trailing portion of the input was not
 4342        processed.
 4343     */
 4344     test = strptime(s, tfmt, &t);
 4345 
 4346     if (test == NULL || *test != '\0') {
 4347     err = E_DATA;
 4348     if (j == 0 && test != NULL && (pd == 12 || pd == 4 || pd == 1)) {
 4349         /* If we're looking at the first row of the filtered data,
 4350            allow for the possibility that we got "excess
 4351            precision", i.e. a daily date string when the left-hand
 4352            dataset is monthly, quarterly or annual.
 4353         */
 4354         char *chk = strptime(s, "%Y-%m-%d", &t);
 4355 
 4356         if (chk != NULL && *chk == '\0') {
 4357         set_time_format(jr->auto_keys, "%Y-%m-%d");
 4358         err = 0; /* we might be OK, cancel the error for now */
 4359         }
 4360     }
 4361     if (err) {
 4362         gretl_errmsg_sprintf("'%s' does not match the format '%s'", s, tfmt);
 4363         fprintf(stderr, "time-format match error in read_outer_auto_keys:\n"
 4364             " remainder = '%s' (source = %s)\n", test ? test : "null",
 4365             s_src < 3 ? "specified time column" : "first-column strings");
 4366     }
 4367     }
 4368 
 4369     if (!err) {
 4370     err = real_set_outer_auto_keys(jr, s, j, &t);
 4371     }
 4372 
 4373     return err;
 4374 }
 4375 
 4376 static int read_iso_basic (joiner *jr, int j, int i)
 4377 {
 4378     int tcol = jr->auto_keys->keycol;
 4379     double x;
 4380     int err = 0;
 4381 
 4382     x = jr->r_dset->Z[tcol][i];
 4383 
 4384     if (na(x)) {
 4385     err = E_MISSDATA;
 4386     } else {
 4387     int y = (int) floor(x / 10000);
 4388     int m = (int) floor((x - 10000*y) / 100);
 4389     int d = (int) (x - 10000*y - 100*m);
 4390     guint32 ed = epoch_day_from_ymd(y, m, d);
 4391 
 4392     if (ed <= 0) {
 4393         gretl_errmsg_sprintf("'%.8g' is not a valid date", x);
 4394         err = E_DATA;
 4395     } else if (calendar_data(jr->l_dset)) {
 4396         /* note: no need to go via struct tm */
 4397         jr->rows[j].n_keys = 1;
 4398         jr->rows[j].keyval = ed;
 4399         jr->rows[j].keyval2 = 0;
 4400         jr->rows[j].micro = 0;
 4401     } else {
 4402         struct tm t = {0};
 4403 
 4404         t.tm_year = y - 1900;
 4405         t.tm_mon = m - 1;
 4406         t.tm_mday = d;
 4407         err = real_set_outer_auto_keys(jr, NULL, j, &t);
 4408     }
 4409     }
 4410 
 4411     return err;
 4412 }
 4413 
 4414 /* Evaluate the filter expression provided by the user, and if it
 4415    works OK count the number of rows on which the filter returns
 4416    non-zero.  Flag an error if the filter gives NA on any row, since
 4417    it is then indeterminate.
 4418 */
 4419 
 4420 static int evaluate_filter (jr_filter *filter, DATASET *r_dset,
 4421                 int *nrows)
 4422 {
 4423     char *line;
 4424     int i, err = 0;
 4425 
 4426     line = gretl_strdup_printf("filtered__=%s", filter->expr);
 4427     if (line == NULL) {
 4428     err = E_ALLOC;
 4429     } else {
 4430     err = generate(line, r_dset, GRETL_TYPE_SERIES,
 4431                OPT_P | OPT_Q, NULL);
 4432     }
 4433 
 4434     if (!err) {
 4435     int v = r_dset->v - 1;
 4436 
 4437     filter->val = r_dset->Z[v];
 4438     *nrows = 0;
 4439 
 4440 #if CDEBUG > 1
 4441     fprintf(stderr, "filter genr: '%s':\n", line);
 4442     for (i=0; i<r_dset->n; i++) {
 4443         fprintf(stderr, " %d: %g\n", i+1, filter->val[i]);
 4444     }
 4445 #endif
 4446     for (i=0; i<r_dset->n; i++) {
 4447         if (na(filter->val[i])) {
 4448         gretl_errmsg_sprintf("join filter: indeterminate "
 4449                      "value on row %d", i+1);
 4450         err = E_MISSDATA;
 4451         break;
 4452         } else if (filter->val[i] != 0.0) {
 4453         *nrows += 1;
 4454         }
 4455     }
 4456     }
 4457 
 4458     free(line);
 4459 
 4460     return err;
 4461 }
 4462 
 4463 static keynum dtoll (double x, int *err)
 4464 {
 4465     if (na(x)) {
 4466     *err = E_DATA;
 4467     return -1;
 4468     } else {
 4469     return x;
 4470     }
 4471 }
 4472 
 4473 static keynum dtoll_full (double x, int key, int row, int *err)
 4474 {
 4475     if (na(x)) {
 4476     if (key == 2) {
 4477         gretl_errmsg_sprintf("%s: invalid secondary outer key value on row %d",
 4478                  "join", row);
 4479     } else {
 4480         gretl_errmsg_sprintf("%s: invalid (primary) outer key value on row %d",
 4481                  "join", row);
 4482     }
 4483     *err = E_DATA;
 4484     return -1;
 4485     } else {
 4486     return x;
 4487     }
 4488 }
 4489 
 4490 /* Determine whether or not row @i of the outer data satisfies the
 4491    filter criterion; return 1 if the condition is met, 0 otherwise.
 4492 */
 4493 
 4494 static int join_row_wanted (jr_filter *filter, int i)
 4495 {
 4496     int ret = filter->val[i] != 0;
 4497 
 4498 #if CDEBUG > 2
 4499     fprintf(stderr, "join filter: %s row %d\n",
 4500         ret ? "keeping" : "discarding", i);
 4501 #endif
 4502 
 4503     return ret;
 4504 }
 4505 
 4506 static DATASET *outer_dataset (joinspec *jspec)
 4507 {
 4508     if (jspec->c != NULL) {
 4509     return jspec->c->dset;
 4510     } else {
 4511     return jspec->dset;
 4512     }
 4513 }
 4514 
 4515 #define using_auto_keys(j) (j->auto_keys->timefmt != NULL)
 4516 
 4517 static joiner *build_joiner (joinspec *jspec,
 4518                  DATASET *l_dset,
 4519                  jr_filter *filter,
 4520                  AggrType aggr,
 4521                  int seqval,
 4522                  obskey *auto_keys,
 4523                  int *err)
 4524 {
 4525     joiner *jr = NULL;
 4526     DATASET *r_dset = outer_dataset(jspec);
 4527     int keycol  = jspec->colnums[JOIN_KEY];
 4528     int valcol  = jspec->colnums[JOIN_TARG];
 4529     int key2col = jspec->colnums[JOIN_KEY2];
 4530     int auxcol  = jspec->colnums[JOIN_AUX];
 4531     int i, nrows = r_dset->n;
 4532 
 4533 #if CDEBUG
 4534     fprintf(stderr, "joiner columns:\n"
 4535         "KEY=%d, VAL=%d, F1=%d, F2=%d, F3=%d, KEY2=%d, AUX=%d\n",
 4536         keycol, valcol, jspec->colnums[JOIN_F1],
 4537         jspec->colnums[JOIN_F2], jspec->colnums[JOIN_F3],
 4538         key2col, auxcol);
 4539 #endif
 4540 
 4541     if (filter != NULL) {
 4542     *err = evaluate_filter(filter, r_dset, &nrows);
 4543     if (*err) {
 4544         return NULL;
 4545     } else if (nrows == 0) {
 4546         gretl_warnmsg_set(_("No matching data after filtering"));
 4547         return NULL;
 4548     }
 4549     }
 4550 
 4551 #if CDEBUG
 4552     fprintf(stderr, "after filtering: dset->n = %d, nrows = %d\n",
 4553         r_dset->n, nrows);
 4554 #endif
 4555 
 4556     jr = joiner_new(nrows);
 4557 
 4558     if (jr == NULL) {
 4559     *err = E_ALLOC;
 4560     } else {
 4561     double **Z = r_dset->Z;
 4562     int use_iso_basic = 0;
 4563     int j = 0;
 4564 
 4565     jr->aggr = aggr;
 4566     jr->seqval = seqval;
 4567     jr->auxcol = auxcol;
 4568     jr->l_dset = l_dset;
 4569     jr->r_dset = r_dset;
 4570     jr->auto_keys = auto_keys;
 4571     jr->midas_m = 0;
 4572 
 4573     if (using_auto_keys(jr)) {
 4574         /* check for the case where the outer time-key
 4575            column is in the "tconvert" set: if so we
 4576            know it will be in YYYYMMDD format and we'll
 4577            give it special treatment
 4578         */
 4579         int tcol = jr->auto_keys->keycol;
 4580 
 4581         if (tcol > 0 && jr->auto_keys->numdates) {
 4582         if (column_is_timecol(jr->r_dset->varname[tcol])) {
 4583             use_iso_basic = 1;
 4584         }
 4585         }
 4586     }
 4587 
 4588     /* Now transcribe the data we want: we're pulling from the
 4589        full outer dataset and writing into the array of joiner row
 4590        structs. At this point we're applying the join filter (if
 4591        any) but are not doing any matching by key to the inner
 4592        dataset.
 4593     */
 4594 
 4595     for (i=0; i<r_dset->n && !*err; i++) {
 4596         if (filter != NULL && !join_row_wanted(filter, i)) {
 4597         continue;
 4598         }
 4599         /* the keys */
 4600         if (use_iso_basic) {
 4601         *err = read_iso_basic(jr, j, i);
 4602         } else if (using_auto_keys(jr)) {
 4603         *err = read_outer_auto_keys(jr, j, i);
 4604         } else if (keycol > 0) {
 4605         jr->rows[j].keyval = dtoll_full(Z[keycol][i], 1, i+1, err);
 4606         if (!*err && key2col > 0) {
 4607             /* double key */
 4608             jr->rows[j].n_keys = 2;
 4609             jr->rows[j].keyval2 = dtoll_full(Z[key2col][i], 2, i+1, err);
 4610         } else {
 4611             /* single key */
 4612             jr->rows[j].n_keys = 1;
 4613             jr->rows[j].keyval2 = 0;
 4614         }
 4615         } else {
 4616         /* no keys have been specified */
 4617         jr->rows[j].n_keys = 0;
 4618         jr->rows[j].keyval = 0;
 4619         jr->rows[j].keyval2 = 0;
 4620         }
 4621         /* "payload" data: record the dataset row */
 4622         jr->rows[j].dset_row = valcol > 0 ? i : -1;
 4623         /* the auxiliary data */
 4624         jr->rows[j].aux = auxcol > 0 ? Z[auxcol][i] : 0;
 4625         j++;
 4626     }
 4627     }
 4628 
 4629     return jr;
 4630 }
 4631 
 4632 /* qsort callback for sorting rows of the joiner struct */
 4633 
 4634 static int compare_jr_rows (const void *a, const void *b)
 4635 {
 4636     const jr_row *ra = a;
 4637     const jr_row *rb = b;
 4638     int ret;
 4639 
 4640     ret = (ra->keyval > rb->keyval) - (ra->keyval < rb->keyval);
 4641 
 4642     if (ret == 0 && ra->n_keys > 1) {
 4643     ret = (ra->keyval2 > rb->keyval2) - (ra->keyval2 < rb->keyval2);
 4644     }
 4645 
 4646     if (ret == 0) {
 4647     /* ensure stable sort */
 4648     ret = a - b > 0 ? 1 : -1;
 4649     }
 4650 
 4651     return ret;
 4652 }
 4653 
 4654 /* Sort the rows of the joiner struct, by either one or two keys, then
 4655    figure out how many unique (primary) key values we have and
 4656    construct (a) an array of frequency of occurrence of these values
 4657    and (b) an array which records the first row of the joiner on
 4658    which each of these values is found.
 4659 */
 4660 
 4661 static int joiner_sort (joiner *jr)
 4662 {
 4663     int matches = jr->n_rows;
 4664     int i, err = 0;
 4665 
 4666     /* If there are string keys, we begin by mapping from the string
 4667        indices on the right -- held in the keyval and/or keyval2
 4668        members of the each joiner row -- to the indices for the same
 4669        strings on the left. This enables us to avoid doing string
 4670        comparisons when running aggr_value() later; we can just
 4671        compare the indices of the strings. In addition, if on any
 4672        given row we get no match for the right-hand key string on the
 4673        left (signalled by a strmap value of -1) we can exploit this
 4674        information by shuffling such rows to the end of the joiner
 4675        rectangle and ignoring them when aggregating.
 4676     */
 4677 
 4678     if (jr->str_keys[0] || jr->str_keys[1]) {
 4679     series_table *stl, *str;
 4680     int *strmap;
 4681     int k, kmin, kmax, lkeyval, rkeyval;
 4682 
 4683     kmin = jr->str_keys[0] ? 1 : 2;
 4684     kmax = jr->str_keys[1] ? 2 : 1;
 4685 
 4686     for (k=kmin; k<=kmax; k++) {
 4687         stl = series_get_string_table(jr->l_dset, jr->l_keyno[k]);
 4688         str = series_get_string_table(jr->r_dset, jr->r_keyno[k]);
 4689         strmap = series_table_map(str, stl);
 4690 
 4691         if (strmap == NULL) {
 4692         err = E_ALLOC;
 4693         break;
 4694         }
 4695 
 4696         for (i=0; i<jr->n_rows; i++) {
 4697         if (k == 1) {
 4698             rkeyval = jr->rows[i].keyval;
 4699         } else if (jr->rows[i].keyval == INT_MAX) {
 4700             continue;
 4701         } else {
 4702             rkeyval = jr->rows[i].keyval2;
 4703         }
 4704         lkeyval = strmap[rkeyval];
 4705 #if CDEBUG > 1
 4706         fprintf(stderr, "k = %d, row %d, keyval: %d -> %d\n", k, i, rkeyval, lkeyval);
 4707 #endif
 4708         if (lkeyval > 0) {
 4709             if (k == 1) {
 4710             jr->rows[i].keyval = lkeyval;
 4711             } else {
 4712             jr->rows[i].keyval2 = lkeyval;
 4713             }
 4714         } else {
 4715             /* arrange for qsort to move row to end */
 4716             jr->rows[i].keyval = G_MAXDOUBLE;
 4717             matches--;
 4718         }
 4719         }
 4720 
 4721         free(strmap);
 4722     }
 4723     }
 4724 
 4725     if (err) {
 4726     return err;
 4727     }
 4728 
 4729     qsort(jr->rows, jr->n_rows, sizeof *jr->rows, compare_jr_rows);
 4730 
 4731     if (matches < jr->n_rows) {
 4732     jr->n_rows = matches;
 4733     }
 4734 
 4735     jr->n_unique = 1;
 4736     for (i=1; i<jr->n_rows; i++) {
 4737     if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
 4738         jr->n_unique += 1;
 4739     }
 4740     }
 4741 
 4742     jr->keys = malloc(jr->n_unique * sizeof *jr->keys);
 4743     jr->key_freq = malloc(jr->n_unique * sizeof *jr->key_freq);
 4744     jr->key_row = malloc(jr->n_unique * sizeof *jr->key_row);
 4745 
 4746     if (jr->keys == NULL || jr->key_freq == NULL || jr->key_row == NULL) {
 4747     err = E_ALLOC;
 4748     } else {
 4749     int j = 0, nj = 1;
 4750 
 4751     for (i=0; i<jr->n_unique; i++) {
 4752         jr->key_freq[i] = 0;
 4753     }
 4754 
 4755     jr->keys[0] = jr->rows[0].keyval;
 4756     jr->key_row[0] = 0;
 4757 
 4758     for (i=1; i<jr->n_rows; i++) {
 4759         if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
 4760         /* finalize info for key j */
 4761         jr->keys[j] = jr->rows[i-1].keyval;
 4762         jr->key_freq[j] = nj;
 4763         /* and initialize for next key */
 4764         nj = 1;
 4765         if (j < jr->n_unique - 1) {
 4766             jr->key_row[j+1] = i;
 4767         }
 4768         j++;
 4769         } else {
 4770         nj++;
 4771         }
 4772     }
 4773 
 4774     /* make sure the last row is right */
 4775     jr->keys[j] = jr->rows[i-1].keyval;
 4776     jr->key_freq[j] = nj;
 4777     }
 4778 
 4779     return err;
 4780 }
 4781 
 4782 #if CDEBUG > 1
 4783 
 4784 static void joiner_print (joiner *jr)
 4785 {
 4786     char **labels = NULL;
 4787     jr_row *row;
 4788     int i;
 4789 
 4790     if (jr->str_keys[0]) {
 4791     labels = series_get_string_vals(jr->l_dset, jr->l_keyno[1], NULL, 0);
 4792     }
 4793 
 4794     fprintf(stderr, "\njoiner: n_rows = %d\n", jr->n_rows);
 4795     for (i=0; i<jr->n_rows; i++) {
 4796     row = &jr->rows[i];
 4797     if (row->n_keys > 1) {
 4798         fprintf(stderr, " row %d: keyvals=(%g,%g)\n",
 4799             i, row->keyval, row->keyval2);
 4800     } else {
 4801         int k = lrint(row->keyval) - 1;
 4802 
 4803         if (jr->str_keys[0] && row->keyval >= 0) {
 4804         fprintf(stderr, " row %d: keyval=%g (%s)\n",
 4805             i, row->keyval, labels[k]);
 4806         } else {
 4807         fprintf(stderr, " row %d: keyval=%g\n",
 4808             i, row->keyval);
 4809         }
 4810     }
 4811     }
 4812 
 4813     if (jr->keys != NULL) {
 4814     fprintf(stderr, " for primary key: n_unique = %d\n", jr->n_unique);
 4815     for (i=0; i<jr->n_unique; i++) {
 4816         fprintf(stderr,"  key value %g: count = %d\n",
 4817             jr->keys[i], jr->key_freq[i]);
 4818     }
 4819     }
 4820 }
 4821 
 4822 static void print_outer_dataset (const DATASET *dset, const char *fname)
 4823 {
 4824     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
 4825 
 4826     pprintf(prn, "Data extracted from %s:\n", fname);
 4827     printdata(NULL, NULL, dset, OPT_O, prn);
 4828     gretl_print_destroy(prn);
 4829 }
 4830 
 4831 #endif
 4832 
 4833 static int seqval_out_of_bounds (joiner *jr, int seqmax)
 4834 {
 4835     if (jr->seqval < 0) {
 4836     /* counting down from last match */
 4837     return -jr->seqval > seqmax;
 4838     } else {
 4839     /* counting up from first match */
 4840     return jr->seqval > seqmax;
 4841     }
 4842 }
 4843 
 4844 /* Do a binary search for the left-hand key value @targ in the sorted
 4845    array of unique right-hand key values, @vals; return the position
 4846    among @vals at which @targ matches, or -1 for no match.
 4847 */
 4848 
 4849 static int binsearch (keynum targ, const keynum *vals, int n, int offset)
 4850 {
 4851     int m = n/2;
 4852 
 4853     if (fabs((targ) - (vals[m])) < 1.0e-7) {
 4854     return m + offset;
 4855     } else if (targ < vals[0] || targ > vals[n-1]) {
 4856     return -1;
 4857     } else if (targ < vals[m]) {
 4858     return binsearch(targ, vals, m, offset);
 4859     } else {
 4860     return binsearch(targ, vals + m, n - m, offset + m);
 4861     }
 4862 }
 4863 
 4864 /* In some cases we can figure out what aggr_value() should return
 4865    just based on the number of matches, @n, and the characteristics
 4866    of the joiner. If so, write the value into @x and return 1; if
 4867    not, return 0.
 4868 */
 4869 
 4870 static int aggr_val_determined (joiner *jr, int n, double *x, int *err)
 4871 {
 4872     if (jr->aggr == AGGR_COUNT) {
 4873     /* just return the number of matches */
 4874     *x = n;
 4875     return 1;
 4876     } else if (jr->aggr == AGGR_SEQ && seqval_out_of_bounds(jr, n)) {
 4877     /* out of bounds sequence index: return NA */
 4878     *x = NADBL;
 4879     return 1;
 4880     } else if (n > 1 && jr->aggr == AGGR_NONE) {
 4881     /* fail */
 4882 #if AGGDEBUG
 4883     fprintf(stderr, "aggr_val_determined(): got n=%d\n", n);
 4884 #endif
 4885     *err = E_DATA;
 4886     gretl_errmsg_set(_("You need to specify an aggregation "
 4887                "method for a 1:n join"));
 4888     *x = NADBL;
 4889     return 1;
 4890     } else {
 4891     /* not enough information so far */
 4892     return 0;
 4893     }
 4894 }
 4895 
 4896 /* get month-day index from @dset time-series info */
 4897 
 4898 static int midas_day_index (int t, DATASET *dset)
 4899 {
 4900     char obs[OBSLEN];
 4901     int y, m, d, idx = -1;
 4902 
 4903     ntolabel(obs, t, dset);
 4904     if (sscanf(obs, YMD_READ_FMT, &y, &m, &d) == 3) {
 4905     idx = month_day_index(y, m, d, dset->pd);
 4906     }
 4907 
 4908     return idx;
 4909 }
 4910 
 4911 #define midas_daily(j) (j->midas_m > 20)
 4912 
 4913 #define min_max_cond(x,y,a) ((a==AGGR_MAX && x>y) || (a==AGGR_MIN && x<y))
 4914 
 4915 /* aggr_value: here we're working on a given row of the left-hand
 4916    dataset. The values @key and (if applicable) @key2 are the
 4917    left-hand keys for this row. We count the key-matches on the
 4918    right and apply an aggregation procedure if the user specified
 4919    one. We return the value that should be entered for the imported
 4920    series on this row.
 4921 
 4922    Note: @xmatch and @auxmatch are workspace arrays allocated by
 4923    the caller.
 4924 */
 4925 
 4926 static double aggr_value (joiner *jr,
 4927               keynum key1,
 4928               keynum key2,
 4929               int v, int revseq,
 4930               double *xmatch,
 4931               double *auxmatch,
 4932               int *nomatch,
 4933               int *err)
 4934 {
 4935     double x, xa;
 4936     int imin, imax, pos;
 4937     int i, n, ntotal;
 4938 
 4939     /* find the position of the inner (primary) key in the
 4940        array of unique outer key values */
 4941     pos = binsearch(key1, jr->keys, jr->n_unique, 0);
 4942 
 4943 #if AGGDEBUG
 4944     if (pos < 0) {
 4945     fprintf(stderr, " key1 = %g: no match\n", key1);
 4946     } else {
 4947     fprintf(stderr, " key1 = %g: matched at position %d\n", key1, pos);
 4948     }
 4949 #endif
 4950 
 4951     if (pos < 0) {
 4952     /* (primary) inner key value not found */
 4953     *nomatch = 1;
 4954     return jr->aggr == AGGR_COUNT ? 0 : NADBL;
 4955     }
 4956 
 4957     /* how many matches at @pos? (must be at least 1 unless
 4958        something very bad has happened)
 4959     */
 4960     n = jr->key_freq[pos];
 4961 
 4962 #if AGGDEBUG
 4963     fprintf(stderr, "  number of primary matches = %d\n", n);
 4964 #endif
 4965 
 4966     if (jr->n_keys == 1) {
 4967     /* if there's just a single key, we can figure some
 4968        cases out already */
 4969     if (aggr_val_determined(jr, n, &x, err)) {
 4970         return x;
 4971     }
 4972     }
 4973 
 4974     if (jr->key_row[pos] < 0) {
 4975     /* "can't happen" */
 4976     return NADBL;
 4977     }
 4978 
 4979     /* set the range of rows for reading from the joiner rectangle */
 4980     imin = jr->key_row[pos];
 4981     imax = imin + n;
 4982 
 4983 #if AGGDEBUG
 4984     fprintf(stderr, "  aggregation row range: %d to %d\n", imin+1, imax);
 4985 #endif
 4986 
 4987     if (jr->aggr == AGGR_MIDAS) {
 4988     /* special case: MIDAS "spreading" */
 4989     int daily = dated_daily_data(jr->r_dset);
 4990     int gotit = 0;
 4991 
 4992     x = NADBL;
 4993 
 4994     for (i=imin; i<imax && !gotit; i++) {
 4995         /* loop across primary key matches */
 4996         jr_row *r = &jr->rows[i];
 4997 
 4998         if (jr->n_keys == 1 || key2 == r->keyval2) {
 4999         /* got secondary key match */
 5000         int sub, t = r->dset_row;
 5001 #if AGGDEBUG
 5002         fprintf(stderr, "  i=%d: 2-key match: %d,%d (revseq=%d)\n",
 5003             i, (int) key1, (int) key2, revseq);
 5004 #endif
 5005         if (daily) {
 5006             /* outer dataset has known daily structure */
 5007             sub = midas_day_index(t, jr->r_dset);
 5008             gotit = sub == revseq;
 5009         } else if (midas_daily(jr) && r->micro > 0) {
 5010             /* "other" daily data: r->micro holds day */
 5011             sub = month_day_index((int) key1, (int) key2,
 5012                       r->micro, jr->midas_pd);
 5013             gotit = sub == revseq;
 5014         } else {
 5015             if (r->micro > 0) {
 5016             /* if present, this is derived from the outer
 5017                time-key specification
 5018             */
 5019             sub = r->micro;
 5020             } else {
 5021             date_maj_min(t, jr->r_dset, NULL, &sub);
 5022             }
 5023             gotit = (sub - 1) % jr->midas_m + 1 == revseq;
 5024         }
 5025         if (gotit) {
 5026             x = jr->r_dset->Z[v][t];
 5027         }
 5028         }
 5029     }
 5030 
 5031     /* and we're done */
 5032     return x;
 5033     }
 5034 
 5035     /* We now fill out the array @xmatch with non-missing values
 5036        from the matching outer rows. If we have a secondary key
 5037        we screen for matches on that as we go.
 5038     */
 5039 
 5040     n = 0;      /* will now hold count of non-NA matches */
 5041     ntotal = 0; /* will ignore the OK/NA distinction */
 5042 
 5043     for (i=imin; i<imax; i++) {
 5044     jr_row *r = &jr->rows[i];
 5045 
 5046     if (jr->n_keys == 1 || key2 == r->keyval2) {
 5047         ntotal++;
 5048         x = jr->r_dset->Z[v][r->dset_row];
 5049         if (jr->auxcol) {
 5050         xa = r->aux;
 5051         if (!na(x) && na(xa)) {
 5052             /* we can't know the min/max of the aux var */
 5053             *err = E_MISSDATA;
 5054             return NADBL;
 5055         }
 5056         if (!na(xa)) {
 5057             auxmatch[n] = xa;
 5058             xmatch[n++] = x;
 5059         }
 5060         } else if (!na(x)) {
 5061         xmatch[n++] = x;
 5062         }
 5063     }
 5064     }
 5065 
 5066     if (jr->n_keys > 1) {
 5067     /* we've already checked this for the 1-key case */
 5068     if (aggr_val_determined(jr, n, &x, err)) {
 5069         return x;
 5070     }
 5071     }
 5072 
 5073     x = NADBL;
 5074 
 5075     if (n == 0) {
 5076     ; /* all matched observations are NA */
 5077     } else if (jr->aggr == AGGR_NONE) {
 5078     x = xmatch[0];
 5079     } else if (jr->aggr == AGGR_SEQ) {
 5080     int sval = jr->seqval;
 5081 
 5082     i = sval < 0 ? n + sval : sval - 1;
 5083     if (i >= 0 && i < n) {
 5084         x = xmatch[i];
 5085     }
 5086     } else if (jr->aggr == AGGR_MAX || jr->aggr == AGGR_MIN) {
 5087     if (jr->auxcol) {
 5088         /* using the max/min of an auxiliary var */
 5089         int idx = 0;
 5090 
 5091         x = auxmatch[0];
 5092         for (i=1; i<n; i++) {
 5093         if (min_max_cond(auxmatch[i], x, jr->aggr)) {
 5094             x = auxmatch[i];
 5095             idx = i;
 5096         }
 5097         }
 5098         x = xmatch[idx];
 5099     } else {
 5100         /* max/min of the actual data */
 5101         x = xmatch[0];
 5102         for (i=1; i<n; i++) {
 5103         if (min_max_cond(xmatch[i], x, jr->aggr)) {
 5104             x = xmatch[i];
 5105         }
 5106         }
 5107     }
 5108     } else if (jr->aggr == AGGR_SUM || jr->aggr == AGGR_AVG) {
 5109     x = 0.0;
 5110     for (i=0; i<n; i++) {
 5111         x += xmatch[i];
 5112     }
 5113     if (jr->aggr == AGGR_AVG) {
 5114         x /= n