"Fossies" - the Fresh Open Source Software Archive

Member "gretl-2020b/lib/src/csvdata.c" (16 Mar 2020, 163763 Bytes) of package /linux/misc/gretl-2020b.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "csvdata.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2020a_vs_2020b.

    1 /*
    2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
    3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
    4  *
    5  *  This program is free software: you can redistribute it and/or modify
    6  *  it under the terms of the GNU General Public License as published by
    7  *  the Free Software Foundation, either version 3 of the License, or
    8  *  (at your option) any later version.
    9  *
   10  *  This program is distributed in the hope that it will be useful,
   11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13  *  GNU General Public License for more details.
   14  *
   15  *  You should have received a copy of the GNU General Public License
   16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
   17  *
   18  */
   19 
   20 #include "libgretl.h"
   21 #include "gretl_string_table.h"
   22 #include "libset.h"
   23 #include "usermat.h"
   24 #include "uservar.h"
   25 #include "genparse.h"
   26 #include "gretl_xml.h"
   27 #include "gretl_midas.h"
   28 #include "matrix_extra.h"
   29 #include "gretl_www.h"
   30 #include "csvdata.h"
   31 
   32 #ifdef WIN32
   33 # include "gretl_win32.h"
   34 #endif
   35 
   36 #include <errno.h>
   37 
   38 #define CDEBUG 0    /* CSV reading in general */
   39 #define AGGDEBUG 0  /* aggregation in "join" */
   40 #define TDEBUG 0    /* handling of time keys in "join" */
   41 
   42 #define CSVSTRLEN 128
   43 
   44 enum {
   45     CSV_HAVEDATA = 1 << 0,
   46     CSV_GOTDELIM = 1 << 1,
   47     CSV_GOTTAB   = 1 << 2,
   48     CSV_GOTSEMI  = 1 << 3,
   49     CSV_BLANK1   = 1 << 4,
   50     CSV_OBS1     = 1 << 5,
   51     CSV_TRAIL    = 1 << 6,
   52     CSV_AUTONAME = 1 << 7,
   53     CSV_REVERSED = 1 << 8,
   54     CSV_DOTSUB   = 1 << 9,
   55     CSV_ALLCOLS  = 1 << 10,
   56     CSV_BOM      = 1 << 11,
   57     CSV_VERBOSE  = 1 << 12,
   58     CSV_THOUSEP  = 1 << 13,
   59     CSV_NOHEADER = 1 << 14,
   60     CSV_QUOTES   = 1 << 15,
   61     CSV_AS_MAT   = 1 << 16
   62 };
   63 
   64 enum {
   65     JOIN_KEY,
   66     JOIN_F1,
   67     JOIN_F2,
   68     JOIN_F3,
   69     JOIN_KEY2,
   70     JOIN_AUX,
   71     JOIN_TARG
   72 };
   73 
   74 typedef struct joinspec_ joinspec;
   75 typedef struct csvprobe_ csvprobe;
   76 typedef struct csvdata_ csvdata;
   77 
   78 struct joinspec_ {
   79     int ncols;
   80     const char **colnames;
   81     const char *mdsbase;
   82     int *colnums;
   83     int *timecols;
   84     csvdata *c;
   85     DATASET *dset;
   86     int wildcard;
   87     int auto_midas;
   88     int midas_pd;
   89     char **wildnames;
   90     char **mdsnames;
   91     char **tmpnames;
   92     int n_tmp;
   93 };
   94 
   95 struct csvprobe_ {
   96     DATASET *dset; /* more info might be wanted */
   97 };
   98 
   99 struct csvdata_ {
  100     int flags;
  101     char delim;
  102     char decpoint;
  103     char thousep;
  104     char qchar;
  105     int markerpd;
  106     int maxlinelen;
  107     int real_n;
  108     char *line;
  109     DATASET *dset;
  110     int ncols, nrows;
  111     long datapos;
  112     char str[CSVSTRLEN];
  113     char skipstr[8];
  114     int *codelist;
  115     char *descrip;
  116     const char *user_na;
  117     gretl_string_table *st;
  118     int *cols_list;
  119     int *width_list;
  120     const gretl_matrix *rowmask;
  121     int masklen;
  122     joinspec *jspec; /* info used for "join" command */
  123     csvprobe *probe; /* used in connection with "join" */
  124 };
  125 
  126 #define csv_has_trailing_comma(c) (c->flags & CSV_TRAIL)
  127 #define csv_has_obs_column(c)     (c->flags & CSV_OBS1)
  128 #define csv_has_blank_column(c)   (c->flags & CSV_BLANK1)
  129 #define csv_got_tab(c)            (c->flags & CSV_GOTTAB)
  130 #define csv_got_semi(c)           (c->flags & CSV_GOTSEMI)
  131 #define csv_got_delim(c)          (c->flags & CSV_GOTDELIM)
  132 #define csv_autoname(c)           (c->flags & CSV_AUTONAME)
  133 #define csv_skip_col_1(c)         (c->flags & (CSV_OBS1 | CSV_BLANK1))
  134 #define csv_have_data(c)          (c->flags & CSV_HAVEDATA)
  135 #define csv_data_reversed(c)      (c->flags & CSV_REVERSED)
  136 #define csv_do_dotsub(c)          (c->flags & CSV_DOTSUB)
  137 #define csv_all_cols(c)           (c->flags & CSV_ALLCOLS)
  138 #define csv_has_bom(c)            (c->flags & CSV_BOM)
  139 #define csv_is_verbose(c)         (c->flags & CSV_VERBOSE)
  140 #define csv_scrub_thousep(c)      (c->flags & CSV_THOUSEP)
  141 #define csv_no_header(c)          (c->flags & CSV_NOHEADER)
  142 #define csv_keep_quotes(c)        (c->flags & CSV_QUOTES)
  143 #define csv_as_matrix(c)          (c->flags & CSV_AS_MAT)
  144 
  145 #define csv_set_trailing_comma(c)   (c->flags |= CSV_TRAIL)
  146 #define csv_unset_trailing_comma(c) (c->flags &= ~CSV_TRAIL)
  147 #define csv_set_obs_column(c)       (c->flags |= CSV_OBS1)
  148 #define csv_set_blank_column(c)     (c->flags |= CSV_BLANK1)
  149 #define csv_set_got_tab(c)          (c->flags |= CSV_GOTTAB)
  150 #define csv_set_got_semi(c)         (c->flags |= CSV_GOTSEMI)
  151 #define csv_set_got_delim(c)        (c->flags |= CSV_GOTDELIM)
  152 #define csv_set_autoname(c)         (c->flags |= CSV_AUTONAME)
  153 #define csv_set_data_reversed(c)    (c->flags |= CSV_REVERSED)
  154 #define csv_set_dotsub(c)           (c->flags |= CSV_DOTSUB)
  155 #define csv_set_all_cols(c)         (c->flags |= CSV_ALLCOLS)
  156 #define csv_set_has_bom(c)          (c->flags |= CSV_BOM)
  157 #define csv_set_verbose(c)          (c->flags |= CSV_VERBOSE)
  158 #define csv_set_scrub_thousep(c)    (c->flags |= CSV_THOUSEP)
  159 #define csv_set_no_header(c)        (c->flags |= CSV_NOHEADER)
  160 #define csv_unset_keep_quotes(c)    (c->flags &= ~CSV_QUOTES)
  161 #define csv_set_as_matrix(c)        (c->flags |= CSV_AS_MAT)
  162 
  163 #define csv_skip_bad(c)        (*c->skipstr != '\0')
  164 #define csv_has_non_numeric(c) (c->st != NULL)
  165 
  166 #define fixed_format(c) (c->cols_list != NULL && c->width_list != NULL)
  167 #define cols_subset(c) (c->cols_list != NULL && c->width_list == NULL)
  168 #define rows_subset(c) (c->rowmask != NULL)
  169 
  170 #define joining(c) (c->jspec != NULL)
  171 #define probing(c) (c->probe != NULL)
  172 
  173 #define is_wildstr(s) (strchr(s, '*') || strchr(s, '?'))
  174 
  175 static int
  176 time_series_label_check (DATASET *dset, int reversed, char *skipstr,
  177              int convert_pd, PRN *prn);
  178 
  179 static int format_uses_quarterly (char *fmt);
  180 
  181 /* file-scope global */
  182 static char import_na[8];
  183 
  184 struct time_mapper {
  185     int ncols;         /* number of "timeconv" columns */
  186     char **colnames;   /* array of outer-dataset column names */
  187     char *tname;       /* the name of the "tkey", if among colnames, or NULL */
  188     char **fmt;        /* array of up to two time-format strings, or NULL */
  189     char m_means_q[2]; /* array of "monthly means quarterly" flags */
  190 };
  191 
  192 /* file-scope global */
  193 struct time_mapper tconv_map;
  194 
  195 enum {
  196     TCONV_FMT = 0,
  197     TKEY_FMT = 1
  198 };
  199 
  200 #define no_formats(map) (map.fmt == NULL)
  201 #define no_tkey_format(map) (map.tname == NULL)
  202 #define has_tconv_format(map) (map.fmt[TCONV_FMT] != NULL)
  203 #define is_tkey_variable(name, map) (strcmp(name, map.tname) == 0)
  204 
  205 static void timeconv_map_set (int ncols, char **colnames,
  206                   char *tname, char **fmt)
  207 {
  208     tconv_map.ncols = ncols;
  209     tconv_map.colnames = colnames;
  210     tconv_map.tname = tname;
  211     tconv_map.fmt = fmt;
  212 
  213     if (fmt != NULL) {
  214     if (fmt[TCONV_FMT] != NULL) {
  215         tconv_map.m_means_q[TCONV_FMT] =
  216         format_uses_quarterly(fmt[TCONV_FMT]);
  217     }
  218     if (fmt[TKEY_FMT] != NULL) {
  219         tconv_map.m_means_q[TKEY_FMT] =
  220         format_uses_quarterly(fmt[TKEY_FMT]);
  221     }
  222     }
  223 }
  224 
  225 static void timeconv_map_init (void)
  226 {
  227     timeconv_map_set(0, NULL, NULL, NULL);
  228 }
  229 
  230 static void timeconv_map_destroy (void)
  231 {
  232     if (tconv_map.colnames != NULL) {
  233     strings_array_free(tconv_map.colnames, tconv_map.ncols);
  234     }
  235     if (tconv_map.fmt != NULL) {
  236     strings_array_free(tconv_map.fmt, 2);
  237     }
  238     timeconv_map_init();
  239 }
  240 
  241 static int timecol_get_format (const DATASET *dset, int v,
  242                    char **pfmt, int *q)
  243 {
  244     if (no_formats(tconv_map)) {
  245     return 0;
  246     } else if (no_tkey_format(tconv_map)) {
  247     /* get the common "tconvert" format */
  248     *pfmt = tconv_map.fmt[TCONV_FMT];
  249     *q = tconv_map.m_means_q[TCONV_FMT];
  250     return 1;
  251     } else if (is_tkey_variable(dset->varname[v], tconv_map)) {
  252     /* get the tkey-specific format */
  253     *pfmt = tconv_map.fmt[TKEY_FMT];
  254     *q = tconv_map.m_means_q[TKEY_FMT];
  255     return 1;
  256     } else if (has_tconv_format(tconv_map)) {
  257     /* get the other one */
  258     *pfmt = tconv_map.fmt[TCONV_FMT];
  259     *q = tconv_map.m_means_q[TCONV_FMT];
  260     return 1;
  261     }
  262 
  263     return 0;
  264 }
  265 
  266 static int column_is_timecol (const char *colname)
  267 {
  268     int i, n = tconv_map.ncols;
  269 
  270     for (i=0; i<n; i++) {
  271     if (!strcmp(colname, tconv_map.colnames[i])) {
  272         return 1;
  273     }
  274     }
  275 
  276     return 0;
  277 }
  278 
  279 static void csvdata_free (csvdata *c)
  280 {
  281     if (c == NULL) {
  282     return;
  283     }
  284 
  285     if (c->descrip != NULL) {
  286     free(c->descrip);
  287     }
  288 
  289     if (c->st != NULL) {
  290     gretl_string_table_destroy(c->st);
  291     }
  292 
  293     if (c->codelist != NULL) {
  294     free(c->codelist);
  295     }
  296 
  297     if (c->line != NULL) {
  298     free(c->line);
  299     }
  300 
  301     if (c->cols_list != NULL) {
  302     free(c->cols_list);
  303     free(c->width_list);
  304     }
  305 
  306     destroy_dataset(c->dset);
  307 
  308     free(c);
  309 }
  310 
  311 static csvdata *csvdata_new (DATASET *dset)
  312 {
  313     csvdata *c = malloc(sizeof *c);
  314 
  315     if (c == NULL) {
  316     return NULL;
  317     }
  318 
  319     c->flags = CSV_QUOTES;
  320     c->delim = '\t';
  321     c->thousep = 0;
  322     c->qchar = 0;
  323     c->markerpd = -1;
  324     c->maxlinelen = 0;
  325     c->real_n = 0;
  326     c->line = NULL;
  327     c->dset = NULL;
  328     c->ncols = 0;
  329     c->nrows = 0;
  330     c->datapos = 0;
  331     *c->str = '\0';
  332     *c->skipstr = '\0';
  333     c->codelist = NULL;
  334     c->descrip = NULL;
  335     c->user_na = NULL;
  336     c->st = NULL;
  337     c->cols_list = NULL;
  338     c->width_list = NULL;
  339     c->rowmask = NULL;
  340     c->masklen = 0;
  341 
  342     if (strcmp(import_na, "default")) {
  343     c->user_na = import_na;
  344     }
  345 
  346     c->jspec = NULL;
  347     c->probe = NULL;
  348 
  349     c->dset = datainfo_new();
  350 
  351     if (c->dset == NULL) {
  352     free(c);
  353     c = NULL;
  354     } else {
  355     c->delim = get_data_export_delimiter();
  356     c->decpoint = get_data_export_decpoint();
  357     if (dset != NULL && dset->Z != NULL) {
  358         c->flags |= CSV_HAVEDATA;
  359     }
  360 #if CDEBUG
  361     fprintf(stderr, "csvdata_new: c->delim = '%c', c->decpoint = '%c'\n",
  362         c->delim, c->decpoint);
  363 #endif
  364     }
  365 
  366     return c;
  367 }
  368 
  369 static int *cols_list_from_matrix (const char *s, int *err)
  370 {
  371     gretl_matrix *m = get_matrix_by_name(s);
  372     int i, n = gretl_vector_get_length(m);
  373     int *list = NULL;
  374 
  375     if (n == 0) {
  376     *err = E_DATA;
  377     } else {
  378     list = gretl_list_new(n);
  379     if (list == NULL) {
  380         *err = E_ALLOC;
  381     } else {
  382         for (i=0; i<n; i++) {
  383         list[i+1] = gretl_vector_get(m, i);
  384         }
  385     }
  386     }
  387 
  388     return list;
  389 }
  390 
  391 /* The interpretation of the "cols" specification depends on
  392    @opt: if this includes OPT_L then it should provide a 1-based
  393    list of columns to be read; but if @opt includes OPT_F it
  394    should provide a fixed-format spec, consisting of pairs
  395    (start column, width).
  396 */
  397 
  398 static int csvdata_add_cols_list (csvdata *c, const char *s,
  399                   gretlopt opt)
  400 {
  401     int delimited = (opt & OPT_L);
  402     int *list, *clist = NULL, *wlist = NULL;
  403     int i, n, m = 0;
  404     int err = 0;
  405 
  406     if (get_matrix_by_name(s)) {
  407     list = cols_list_from_matrix(s, &err);
  408     } else {
  409     list = gretl_list_from_string(s, &err);
  410     }
  411 
  412     if (!err) {
  413     n = list[0];
  414     if (n == 0) {
  415         err = E_DATA;
  416     } else if (delimited) {
  417         m = n;
  418         clist = list;
  419     } else {
  420         /* fixed format: we need two lists */
  421         if (n % 2 != 0) {
  422         err = E_DATA;
  423         } else {
  424         m = n / 2;
  425         clist = gretl_list_new(m);
  426         wlist = gretl_list_new(m);
  427         if (clist == NULL || wlist == NULL) {
  428             err = E_ALLOC;
  429         } else {
  430             int j = 1;
  431 
  432             for (i=1; i<=n; i+=2, j++) {
  433             clist[j] = list[i];
  434             wlist[j] = list[i+1];
  435             }
  436         }
  437         }
  438     }
  439     }
  440 
  441     /* clist = column (start) list: must be a set of increasing
  442        positive integers; and wlist = respective column widths,
  443        must all be positive, if present
  444     */
  445 
  446     for (i=1; i<=m && !err; i++) {
  447     if (clist[i] <= 0 || (i > 1 && clist[i] <= clist[i-1])) {
  448         err = E_DATA;
  449     } else if (wlist != NULL && wlist[i] <= 0) {
  450         err = E_DATA;
  451     } else if (wlist != NULL && wlist[i] >= CSVSTRLEN) {
  452         fprintf(stderr, "Warning: field %d too wide (%d), truncating\n",
  453             i, wlist[i]);
  454         wlist[i] = CSVSTRLEN - 1;
  455     }
  456     }
  457 
  458     if (list != clist) {
  459     free(list);
  460     }
  461 
  462     if (!err) {
  463     c->cols_list = clist;
  464     c->width_list = wlist;
  465     } else {
  466     free(clist);
  467     free(wlist);
  468     if (err == E_DATA) {
  469         gretl_errmsg_set(_("Invalid column specification"));
  470     }
  471     }
  472 
  473     return err;
  474 }
  475 
  476 static int csvdata_add_row_mask (csvdata *c, const char *s)
  477 {
  478     int err = 0;
  479 
  480     c->rowmask = get_matrix_by_name(s);
  481     if (c->rowmask == NULL) {
  482     gretl_errmsg_sprintf(_("'%s': no such matrix"), s);
  483     err = E_DATA;
  484     } else {
  485     c->masklen = gretl_vector_get_length(c->rowmask);
  486     if (c->masklen == 0) {
  487         err = E_NONCONF;
  488     }
  489     }
  490 
  491     return err;
  492 }
  493 
  494 static int n_from_row_mask (csvdata *c)
  495 {
  496     int i, n = 0;
  497 
  498     for (i=0; i<c->masklen && i<=c->nrows; i++) {
  499     if (gretl_vector_get(c->rowmask, i) != 0) {
  500         n++;
  501     }
  502     }
  503 
  504     return n;
  505 }
  506 
  507 static int add_obs_marker (DATASET *dset, int n)
  508 {
  509     char **S = realloc(dset->S, n * sizeof *S);
  510     int err = 0;
  511 
  512     if (S == NULL) {
  513     err = E_ALLOC;
  514     } else {
  515     dset->S = S;
  516     dset->S[n-1] = malloc(OBSLEN);
  517     if (dset->S[n-1] == NULL) {
  518         err = E_ALLOC;
  519     } else {
  520         strcpy(dset->S[n-1], "NA");
  521     }
  522     }
  523 
  524     return err;
  525 }
  526 
  527 static int add_single_obs (DATASET *dset)
  528 {
  529     double *x;
  530     int i, err = 0;
  531 
  532     for (i=0; i<dset->v && !err; i++) {
  533     x = realloc(dset->Z[i], (dset->n + 1) * sizeof *x);
  534     if (x != NULL) {
  535         dset->Z[i] = x;
  536     } else {
  537         err = E_ALLOC;
  538     }
  539     }
  540 
  541     if (!err) {
  542     dset->n += 1;
  543     dset->Z[0][dset->n - 1] = 1.0;
  544     for (i=1; i<dset->v; i++) {
  545         dset->Z[i][dset->n - 1] = NADBL;
  546     }
  547     if (dset->S != NULL) {
  548         err = add_obs_marker(dset, dset->n);
  549     }
  550     }
  551 
  552     return err;
  553 }
  554 
  555 static int pad_weekly_data (DATASET *dset, int add)
  556 {
  557     int oldn = dset->n;
  558     int ttarg, offset = 0, skip = 0;
  559     int i, s, t, tc, err;
  560 
  561     err = dataset_add_observations(dset, add, OPT_A);
  562 
  563     if (!err) {
  564     for (t=0; t<oldn; t++) {
  565         tc = calendar_obs_number(dset->S[t], dset) - offset;
  566         if (tc != t) {
  567         skip = tc - t;
  568         fprintf(stderr, "Gap of size %d at original t = %d\n", skip, t);
  569         offset += skip;
  570         ttarg = oldn - 1 + offset;
  571         for (s=0; s<oldn-t+skip; s++) {
  572             for (i=1; i<dset->v; i++) {
  573             if (s < oldn - t) {
  574                 if (s == 0 || s == oldn-t-1) {
  575                 fprintf(stderr, "shifting obs %d to obs %d\n",
  576                     ttarg-skip, ttarg);
  577                 }
  578                 dset->Z[i][ttarg] = dset->Z[i][ttarg - skip];
  579             } else {
  580                 fprintf(stderr, "inserting NA at obs %d\n", ttarg);
  581                 dset->Z[i][ttarg] = NADBL;
  582             }
  583             }
  584             ttarg--;
  585         }
  586         }
  587     }
  588     }
  589 
  590     return err;
  591 }
  592 
  593 /* FIXME the following needs to be made more flexible? */
  594 
  595 static int csv_weekly_data (DATASET *dset)
  596 {
  597     char *lbl2 = dset->S[dset->n - 1];
  598     int ret = 1;
  599     int misscount = 0;
  600     int t, tc;
  601 
  602     for (t=0; t<dset->n; t++) {
  603     tc = calendar_obs_number(dset->S[t], dset) - misscount;
  604     if (tc != t) {
  605         misscount += tc - t;
  606     }
  607     }
  608 
  609     if (misscount > 0) {
  610     double missfrac = (double) misscount / dset->n;
  611 
  612     fprintf(stderr, "nobs = %d, misscount = %d (%.2f%%)\n",
  613         dset->n, misscount, 100.0 * missfrac);
  614     if (missfrac > 0.05) {
  615         ret = 0;
  616     } else {
  617         int Tc = calendar_obs_number(lbl2, dset) + 1;
  618         int altmiss = Tc - dset->n;
  619 
  620         fprintf(stderr, "check: Tc = %d, missing = %d\n", Tc, altmiss);
  621         if (altmiss != misscount) {
  622         ret = 0;
  623         } else if (dset->Z != NULL) {
  624         int err;
  625 
  626         fprintf(stderr, "OK, consistent\n");
  627         err = pad_weekly_data(dset, misscount);
  628         if (err) ret = 0;
  629         }
  630     }
  631     }
  632 
  633     return ret;
  634 }
  635 
  636 #define DAY_DEBUG 1
  637 
  638 static int check_daily_dates (DATASET *dset, int *pd,
  639                   int *reversed, PRN *prn)
  640 {
  641     int T = dset->n;
  642     char *lbl1 = dset->S[0];
  643     char *lbl2 = dset->S[T - 1];
  644     int fulln = 0, n, t, nbak;
  645     int alt_pd = 0;
  646     int oldpd = dset->pd;
  647     double oldsd0 = dset->sd0;
  648     guint32 ed1, ed2;
  649     int nmiss = 0, err = 0;
  650 
  651     *pd = 0;
  652 
  653     ed1 = get_epoch_day(lbl1);
  654     ed2 = get_epoch_day(lbl2);
  655     if (ed1 <= 0 || ed2 <= 0) {
  656     err = 1;
  657     }
  658 
  659 #if DAY_DEBUG
  660     fprintf(stderr, "check_daily_dates: '%s' -> %d, '%s' -> %d\n",
  661         lbl1, (int) ed1, lbl2, (int) ed2);
  662 #endif
  663 
  664     dset->pd = guess_daily_pd(dset);
  665     dset->structure = TIME_SERIES;
  666 
  667 #if DAY_DEBUG
  668     fprintf(stderr, "guessed at daily pd = %d\n", dset->pd);
  669 #endif
  670 
  671     if (!err) {
  672     if (ed2 < ed1) {
  673 #if DAY_DEBUG
  674         fprintf(stderr, "check_daily_dates: data are reversed?\n");
  675 #endif
  676         dset->sd0 = ed2;
  677         *reversed = 1;
  678     } else {
  679         dset->sd0 = ed1;
  680     }
  681     }
  682 
  683  recompute:
  684 
  685     alt_pd = 0;
  686     nbak = 0;
  687 
  688     if (!err) {
  689     guint32 n1 = (*reversed)? ed2 : ed1;
  690     guint32 n2 = (*reversed)? ed1 : ed2;
  691 
  692     fulln = n2 - n1 + 1;
  693 
  694     if (T > fulln) {
  695         err = 1;
  696     } else {
  697         nmiss = fulln - T;
  698         pprintf(prn, A_("Observations: %d; days in sample: %d\n"),
  699             T, fulln);
  700         if (nmiss > 300 * T) {
  701         pprintf(prn, A_("Probably annual data\n"));
  702         *pd = 1;
  703         } else if (nmiss > 50 * T) {
  704         pprintf(prn, A_("Probably quarterly data\n"));
  705         *pd = 4;
  706         } else if (nmiss > 20 * T) {
  707         pprintf(prn, A_("Probably monthly data\n"));
  708         *pd = 12;
  709         } else if (nmiss > 3 * T) {
  710         pprintf(prn, A_("Probably weekly data\n"));
  711         *pd = dset->pd = 52;
  712         } else {
  713         pprintf(prn, A_("Missing daily rows: %d\n"), nmiss);
  714         }
  715     }
  716     }
  717 
  718     nbak = 0;
  719 
  720     for (t=0; t<dset->n && !err; t++) {
  721     int wd, s = (*reversed)? (dset->n - 1 - t) : t;
  722 
  723     wd = weekday_from_date(dset->S[s]);
  724 
  725     if (dset->pd == 5 && (wd == 6 || wd == 0)) {
  726         /* Got Sat or Sun, can't be 5-day daily? */
  727         alt_pd = (wd == 6)? 6 : 7;
  728         pprintf(prn, "Found a Saturday (%s): re-trying with pd = %d\n",
  729             dset->S[s], alt_pd);
  730         break;
  731     } else if (dset->pd == 6 && wd == 0) {
  732         /* Got Sun, can't be 6-day daily? */
  733         alt_pd = 7;
  734         pprintf(prn, "Found a Sunday (%s): re-trying with pd = %d\n",
  735             dset->S[s], alt_pd);
  736         break;
  737     }
  738 
  739     n = calendar_obs_number(dset->S[s], dset);
  740     if (n < t) {
  741         pprintf(prn, "Daily dates error at t = %d:\n"
  742             "  calendar_obs_number() for '%s' = %d but t = %d\n",
  743             t, dset->S[s], n, t);
  744         err = 1;
  745     } else if (n > fulln - 1) {
  746         pprintf(prn, "Error: date '%s' out of bounds\n", dset->S[s]);
  747         err = 1;
  748     } else if (nbak > 0 && n == nbak) {
  749         pprintf(prn, "Error: date '%s' is repeated\n", dset->S[s]);
  750         err = 1;
  751     }
  752     nbak = n;
  753     }
  754 
  755     if (alt_pd > 0) {
  756     dset->pd = alt_pd;
  757     goto recompute;
  758     }
  759 
  760     if (err) {
  761     dset->pd = oldpd;
  762     dset->sd0 = oldsd0;
  763     dset->structure = CROSS_SECTION;
  764     } else {
  765     strcpy(dset->stobs, (*reversed)? lbl2 : lbl1);
  766     strcpy(dset->endobs, (*reversed)? lbl1 : lbl2);
  767     dset->t2 = dset->n - 1;
  768     if (nmiss > 0 && *pd == 0) {
  769         dset->markers = DAILY_DATE_STRINGS;
  770     }
  771     }
  772 
  773 #if DAY_DEBUG
  774     fprintf(stderr, "check_daily_dates: daily pd = %d, reversed = %d, err = %d\n",
  775         dset->pd, *reversed, err);
  776 #endif
  777 
  778     return (err)? -1 : dset->pd;
  779 }
  780 
  781 /* convert from daily date label to a lower frequency --
  782    annual, monthly or quarterly -- if @pd indicates this
  783    is required
  784 */
  785 
  786 static void convert_daily_label (char *targ, const char *src,
  787                  int pd)
  788 {
  789     int y, m, d;
  790 
  791     sscanf(src, YMD_READ_FMT, &y, &m, &d);
  792 
  793     if (pd == 1) {
  794     sprintf(targ, "%d", y);
  795     } else if (pd == 12) {
  796     sprintf(targ, "%d:%02d", y, m);
  797     } else if (pd == 4) {
  798     sprintf(targ, "%d:%d", y, m / 3 + (m % 3 != 0));
  799     }
  800 }
  801 
  802 /* There's a special case (ugh!) where observation strings are
  803    given as in monthly data, but the frequency is in fact
  804    quarterly, as in:
  805 
  806    1947.06
  807    1947.09
  808    1947.12
  809    1948.03
  810 
  811    we'll make a brave attempt to handle this.
  812 */
  813 
  814 #define fakequarter(m) (m==3 || m==6 || m==9 || m==12)
  815 
  816 static int consistent_qm_labels (DATASET *dset, int reversed,
  817                  int convert_pd, char *skipstr,
  818                  int *ppd, const char *fmt,
  819                  int *extra_zero, PRN *prn)
  820 {
  821     char bad[16], skip[8];
  822     char label[OBSLEN];
  823     int Ey; /* expected year */
  824     int Ep; /* expected sub-period */
  825     int t, s, yr, per;
  826     int pmin = 1;
  827     int pd, pd0;
  828     int ret = 1;
  829 
  830     pd = pd0 = *ppd;
  831 
  832  restart:
  833 
  834     s = reversed ? (dset->n - 1) : 0;
  835 
  836     if (convert_pd) {
  837     convert_daily_label(label, dset->S[s], pd);
  838     } else {
  839     strcpy(label, dset->S[s]);
  840     }
  841 
  842     if (sscanf(label, fmt, &yr, &per) != 2) {
  843     return 0;
  844     }
  845 
  846     for (t=1; t<dset->n; t++) {
  847     s = (reversed)? (dset->n - 1 - t) : t;
  848     Ey = (per == pd)? yr + 1 : yr;
  849     Ep = (per == pd)? pmin : per + pmin;
  850 
  851     if (convert_pd) {
  852         convert_daily_label(label, dset->S[s], pd);
  853     } else {
  854         strcpy(label, dset->S[s]);
  855     }
  856 
  857     if (sscanf(label, fmt, &yr, &per) != 2) {
  858         ret = 0;
  859     } else if (Ep == 1 && pd == pd0 && per == pd + 1
  860            && skipstr != NULL) {
  861         *skip = *bad = '\0';
  862         strncat(skip, label + 4, 7);
  863         strncat(bad, label, OBSLEN-1);
  864         pd = pd0 + 1;
  865         goto restart;
  866     } else if (per == Ep + 2 && pmin == 1 && fakequarter(per)) {
  867         *bad = '\0';
  868         strncat(bad, label, OBSLEN-1);
  869         pmin = 3;
  870         goto restart;
  871     } else if (pd == 12 && Ep == 5 && per == 1 && yr == Ey + 1) {
  872         /* apparently monthly but really quarterly? */
  873         pprintf(prn, "   \"%s\": quarterly date with spurious zero?\n", label);
  874         *extra_zero = 1;
  875         *ppd = pd0 = pd = 4;
  876         goto restart;
  877     } else if (yr != Ey || per != Ep) {
  878         ret = 0;
  879     }
  880 
  881     if (!ret) {
  882         pprintf(prn, "   %s: not a consistent date\n", label);
  883         break;
  884     }
  885     }
  886 
  887     if (ret) {
  888     if (pmin == 3) {
  889         pprintf(prn, "   \"%s\": quarterly data pretending to be monthly?\n",
  890             bad);
  891         *ppd = 4;
  892     } else if (pd == pd0 + 1) {
  893         pprintf(prn, "   \"%s\": BLS-type nonsense? Trying again\n",
  894             bad);
  895         strcpy(skipstr, skip);
  896     }
  897     }
  898 
  899     return ret;
  900 }
  901 
  902 static int consistent_year_labels (const DATASET *dset,
  903                    int reversed,
  904                    int convert_pd)
  905 {
  906     char label[OBSLEN];
  907     int s, t, yr, yprev;
  908     int ret = 1;
  909 
  910     s = (reversed)? (dset->n - 1) : 0;
  911     yprev = atoi(dset->S[s]);
  912 
  913     for (t=1; t<dset->n; t++) {
  914     s = reversed ? (dset->n - 1 - t) : t;
  915     if (convert_pd) {
  916         convert_daily_label(label, dset->S[s], 1);
  917         yr = atoi(label);
  918     } else {
  919         yr = atoi(dset->S[s]);
  920     }
  921     if (yr != yprev + 1) {
  922         ret = 0;
  923         break;
  924     }
  925     yprev = yr;
  926     }
  927 
  928     return ret;
  929 }
  930 
  931 /* check for all 1s in first column of dates: this may
  932    indicate start-of-period dates, day first */
  933 
  934 static int all_day_ones (DATASET *dset)
  935 {
  936     int t;
  937 
  938     for (t=1; t<dset->n; t++) {
  939     if (atoi(dset->S[t]) != 1) {
  940         return 0;
  941     } else if (t > 31) {
  942         /* "1" can't mean January */
  943         return 1;
  944     }
  945     }
  946 
  947     return 0;
  948 }
  949 
  950 enum date_orders {
  951     YYYYMMDD = 1,
  952     MMDDYYYY,
  953     DDMMYYYY
  954 };
  955 
  956 static int get_date_order (int f0, int fn, DATASET *dset)
  957 {
  958     if (f0 > 31 || fn > 31) {
  959     /* first field must be year */
  960     return YYYYMMDD;
  961     } else if (f0 > 12 || fn > 12) {
  962     /* first field must be day */
  963     return DDMMYYYY;
  964     } else if (f0 == 1 && fn == 1 && all_day_ones(dset)) {
  965     /* start-of-period dates, day first? */
  966     return DDMMYYYY;
  967     } else {
  968     /* could be wrong here */
  969     return MMDDYYYY;
  970     }
  971 }
  972 
  973 static void retransform_daily_dates (DATASET *dset)
  974 {
  975     int t, y, m, d;
  976 
  977     /* we apparently guessed wrongly at MMDDYYYY, so
  978        put the dates back as they were for another try,
  979        at DDMMYYYY.
  980     */
  981 
  982     for (t=0; t<dset->n; t++) {
  983     sscanf(dset->S[t], YMD_READ_FMT, &y, &d, &m);
  984     sprintf(dset->S[t], YMD_WRITE_FMT, d, m, y);
  985     }
  986 }
  987 
  988 static int transform_daily_dates (DATASET *dset, int dorder,
  989                   char sep)
  990 {
  991     char *label, fmt[16];
  992     int t, yr, mon, day;
  993     int n, err = 0;
  994 
  995     if (sep > 0) {
  996     sprintf(fmt, "%%d%c%%d%c%%d", sep, sep);
  997     } else {
  998     strcpy(fmt, "%4d%2d%2d");
  999     }
 1000 
 1001     for (t=0; t<dset->n && !err; t++) {
 1002     label = dset->S[t];
 1003     if (dorder == YYYYMMDD) {
 1004         n = sscanf(label, fmt, &yr, &mon, &day);
 1005     } else if (dorder == DDMMYYYY) {
 1006         n = sscanf(label, fmt, &day, &mon, &yr);
 1007     } else {
 1008         n = sscanf(label, fmt, &mon, &day, &yr);
 1009     }
 1010     if (n == 3) {
 1011         sprintf(label, YMD_WRITE_Y2_FMT, yr, mon, day);
 1012     } else {
 1013         err = 1;
 1014     }
 1015     }
 1016 
 1017     return err;
 1018 }
 1019 
 1020 void reverse_data (DATASET *dset, PRN *prn)
 1021 {
 1022     char tmp[OBSLEN];
 1023     double x;
 1024     int T = dset->n / 2;
 1025     int i, t, s;
 1026 
 1027     pprintf(prn, A_("reversing the data!\n"));
 1028 
 1029     for (t=0; t<T; t++) {
 1030     s = dset->n - 1 - t;
 1031     for (i=1; i<dset->v; i++) {
 1032         x = dset->Z[i][t];
 1033         dset->Z[i][t] = dset->Z[i][s];
 1034         dset->Z[i][s] = x;
 1035     }
 1036     if (dset->S != NULL) {
 1037         strcpy(tmp, dset->S[t]);
 1038         strcpy(dset->S[t], dset->S[s]);
 1039         strcpy(dset->S[s], tmp);
 1040     }
 1041     }
 1042 }
 1043 
 1044 static int csv_daily_date_check (DATASET *dset, int *reversed,
 1045                  char *skipstr, PRN *prn)
 1046 {
 1047     int d1[3], d2[3];
 1048     char s1 = 0, s2 = 0;
 1049     char *lbl1 = dset->S[0];
 1050     char *lbl2 = dset->S[dset->n - 1];
 1051     int dorder = 0;
 1052 
 1053     if ((sscanf(lbl1, "%d%c%d%c%d", &d1[0], &s1, &d1[1], &s2, &d1[2]) == 5 &&
 1054      sscanf(lbl2, "%d%c%d%c%d", &d2[0], &s1, &d2[1], &s2, &d2[2]) == 5 &&
 1055      s1 == s2 && ispunct(s1)) ||
 1056     (sscanf(lbl1, "%4d%2d%2d", &d1[0], &d1[1], &d1[2]) == 3 &&
 1057      sscanf(lbl2, "%4d%2d%2d", &d2[0], &d2[1], &d2[2]) == 3)) {
 1058     int mon1, day1;
 1059     int mon2, day2;
 1060     int pd, ret = 0;
 1061 
 1062     dorder = get_date_order(d1[0], d2[0], dset);
 1063 
 1064     tryagain:
 1065 
 1066     if (dorder == YYYYMMDD) {
 1067         pputs(prn, A_("Trying date order YYYYMMDD\n"));
 1068         mon1 = d1[1];
 1069         day1 = d1[2];
 1070         mon2 = d2[1];
 1071         day2 = d2[2];
 1072     } else if (dorder == DDMMYYYY) {
 1073         pputs(prn, A_("Trying date order DDMMYYYY\n"));
 1074         day1 = d1[0];
 1075         mon1 = d1[1];
 1076         day2 = d2[0];
 1077         mon2 = d2[1];
 1078     } else {
 1079         pputs(prn, A_("Trying date order MMDDYYYY\n"));
 1080         mon1 = d1[0];
 1081         day1 = d1[1];
 1082         mon2 = d2[0];
 1083         day2 = d2[1];
 1084     }
 1085 
 1086     if (mon1 > 0 && mon1 < 13 &&
 1087         mon2 > 0 && mon2 < 13 &&
 1088         day1 > 0 && day1 < 32 &&
 1089         day2 > 0 && day2 < 32) {
 1090         /* looks promising for calendar dates, but check
 1091            further if we don't have the canonical order
 1092            or separator
 1093         */
 1094         if (dorder != YYYYMMDD || s1 != '-') {
 1095         if (transform_daily_dates(dset, dorder, s1)) {
 1096             return -1;
 1097         }
 1098         s1 = '-';
 1099         }
 1100         pprintf(prn, A_("Could be %s - %s\n"), lbl1, lbl2);
 1101         ret = check_daily_dates(dset, &pd, reversed, prn);
 1102         if (ret >= 0 && pd > 0) {
 1103         if (pd == 52) {
 1104             if (csv_weekly_data(dset)) {
 1105             ret = 52;
 1106             } else if (dorder == MMDDYYYY) {
 1107             /* maybe we guessed wrong */
 1108             retransform_daily_dates(dset);
 1109             dorder = DDMMYYYY;
 1110             goto tryagain;
 1111             } else {
 1112             ret = -1;
 1113             }
 1114         } else {
 1115             int convert_pd = 0;
 1116 
 1117             if (pd == 1 || pd == 4 || pd == 12) {
 1118             convert_pd = pd;
 1119             }
 1120             ret = time_series_label_check(dset,
 1121                           *reversed,
 1122                           skipstr,
 1123                           convert_pd,
 1124                           prn);
 1125             if (ret < 0 && dorder == MMDDYYYY) {
 1126             retransform_daily_dates(dset);
 1127             dorder = DDMMYYYY;
 1128             goto tryagain;
 1129             }
 1130         }
 1131         }
 1132         return ret;
 1133     }
 1134     } else {
 1135     pprintf(prn, A_("'%s' and '%s': couldn't get dates\n"), lbl1, lbl2);
 1136     }
 1137 
 1138     return -1;
 1139 }
 1140 
 1141 static int pd_from_date_label (const char *lbl, char *year, char *subp,
 1142                    char *format, PRN *prn)
 1143 {
 1144     const char *subchars = ".:QqMmPp-";
 1145     int len = strlen(lbl);
 1146     int try, pd = -1;
 1147 
 1148     strncat(year, lbl, 4);
 1149     try = atoi(year);
 1150 
 1151     if (try > 0 && try < 3000) {
 1152     pprintf(prn, A_("   %s: probably a year... "), year);
 1153     } else {
 1154     pprintf(prn, A_("   %s: probably not a year\n"), year);
 1155     }
 1156 
 1157     if (len == 5) {
 1158     pputs(prn, A_("   but I can't make sense of the extra bit\n"));
 1159     } else if (len == 4) {
 1160     pputs(prn, A_("and just a year\n"));
 1161     pd = 1;
 1162     } else {
 1163     char sep = lbl[4];
 1164     char sub[3], *s = NULL;
 1165     int dashQ = 0;
 1166     int p;
 1167 
 1168     if (strchr(subchars, sep)) {
 1169         *sub = '\0';
 1170         strncat(sub, lbl + 5, 2);
 1171         s = sub;
 1172         if (len == 6 || (len == 7 && (sep == 'q' || sep == 'Q'))) {
 1173         if (len == 7) s++;
 1174         p = atoi(s);
 1175         if (p > 0 && p < 5) {
 1176             pprintf(prn, A_("quarter %s?\n"), s);
 1177             pd = 4;
 1178         } else {
 1179             pprintf(prn, "quarter %d: not possible\n", p);
 1180         }
 1181         } else if (len == 7) {
 1182         if (*s == 'Q') {
 1183             /* YYYY-Qn? This is supported by SDMX */
 1184             dashQ = 1;
 1185             s++;
 1186         }
 1187         p = atoi(s);
 1188         if (dashQ) {
 1189             if (p > 0 && p < 5) {
 1190             pprintf(prn, A_("quarter %d?\n"), p);
 1191             pd = 4;
 1192             } else {
 1193             pprintf(prn, "quarter %d: not possible\n", p);
 1194             }
 1195         } else {
 1196             if (p > 0 && p < 13) {
 1197             pprintf(prn, A_("month %s?\n"), s);
 1198             pd = 12;
 1199             } else {
 1200             pprintf(prn, "month %d: not possible\n", p);
 1201             }
 1202         }
 1203         }
 1204         strcpy(subp, s);
 1205         if (format != NULL && (pd == 4 || pd == 12)) {
 1206         if (dashQ) {
 1207             sprintf(format, "%%d%cQ%%d", sep);
 1208         } else {
 1209             sprintf(format, "%%d%c%%d", sep);
 1210         }
 1211         }
 1212     }
 1213     }
 1214 
 1215     return pd;
 1216 }
 1217 
 1218 static int time_series_label_check (DATASET *dset, int reversed,
 1219                     char *skipstr, int convert_pd,
 1220                     PRN *prn)
 1221 {
 1222     char year[5], sub[3];
 1223     char format[8] = {0};
 1224     char *lbl1 = dset->S[0];
 1225     char *lbl2 = dset->S[dset->n - 1];
 1226     char *label;
 1227     int pd = -1;
 1228 
 1229     *year = *sub = '\0';
 1230     label = reversed ? lbl2 : lbl1;
 1231 
 1232     if (convert_pd) {
 1233     char altobs[OBSLEN];
 1234 
 1235     convert_daily_label(altobs, label, convert_pd);
 1236     pd = pd_from_date_label(altobs, year, sub, format, prn);
 1237     } else {
 1238     pd = pd_from_date_label(label, year, sub, format, prn);
 1239     }
 1240 
 1241     if (pd == 1) {
 1242     if (consistent_year_labels(dset, reversed, convert_pd)) {
 1243         dset->pd = pd;
 1244         strcpy(dset->stobs, year);
 1245         dset->sd0 = atof(dset->stobs);
 1246         strcpy(dset->endobs, lbl2);
 1247     } else {
 1248         pputs(prn, A_("   but the dates are not complete and consistent\n"));
 1249         pd = -1;
 1250     }
 1251     } else if (pd == 4 || pd == 12) {
 1252     int savepd = pd;
 1253     int extra_zero = 0;
 1254 
 1255     if (consistent_qm_labels(dset, reversed, convert_pd,
 1256                  skipstr, &pd, format,
 1257                  &extra_zero, prn)) {
 1258         dset->pd = pd;
 1259         if (savepd == 12 && pd == 4) {
 1260         /* we switched the interpretation from
 1261            monthly to quarterly */
 1262         int s;
 1263 
 1264         if (extra_zero) {
 1265             /* e.g. 1960Q1 written as 1960:01 */
 1266             s = atoi(sub + 1);
 1267         } else {
 1268             /* e.g. 1960Q1 written as 1960:03 */
 1269             s = atoi(sub) / 3;
 1270         }
 1271         sprintf(dset->stobs, "%s:%d", year, s);
 1272         } else {
 1273         sprintf(dset->stobs, "%s:%s", year, sub);
 1274         }
 1275         dset->sd0 = obs_str_to_double(dset->stobs);
 1276         ntodate(dset->endobs, dset->n - 1, dset);
 1277     } else {
 1278         pputs(prn, A_("   but the dates are not complete and consistent\n"));
 1279         pd = -1;
 1280     }
 1281     }
 1282 
 1283     return pd;
 1284 }
 1285 
 1286 static int dates_maybe_reversed (const char *s1,
 1287                  const char *s2,
 1288                  PRN *prn)
 1289 {
 1290     char d1[5], d2[5];
 1291     int ret = 0;
 1292 
 1293     *d1 = *d2 = '\0';
 1294 
 1295     strncat(d1, s1, 4);
 1296     strncat(d2, s2, 4);
 1297 
 1298     ret = atoi(d1) > atoi(d2);
 1299 
 1300     if (ret) {
 1301     pputs(prn, A_("   dates are reversed?\n"));
 1302     }
 1303 
 1304     return ret;
 1305 }
 1306 
 1307 /* e.g. "M1 1957", "M12 2009" */
 1308 
 1309 static int fix_IFS_data_labels (DATASET *dset)
 1310 {
 1311     char *s1 = dset->S[0];
 1312     char *s2 = dset->S[dset->n - 1];
 1313     int ret = 0;
 1314 
 1315     if ((*s1 == 'M' || *s1 == 'Q') && *s2 == *s1) {
 1316     int n1 = strlen(s1);
 1317     int n2 = strlen(s2);
 1318 
 1319     if ((n1 == 7 || n1 == 8) && (n2 == 7 || n2 == 8) &&
 1320         isdigit(s1[1]) && isdigit(s2[1])) {
 1321         int pmax = (*s1 == 'M')? 12 : 4;
 1322         char c, tmp[8], *s;
 1323         int y, p, pbak = 0;
 1324         int i, n, doit = 1;
 1325 
 1326         for (i=0; i<dset->n; i++) {
 1327         s = dset->S[i];
 1328         n = strlen(s);
 1329         if (n != 7 && n != 8) {
 1330             doit = 0;
 1331             break;
 1332         }
 1333         n = sscanf(s, "%c%d %d", &c, &p, &y);
 1334         if (n != 3 || c != *s1) {
 1335             doit = 0;
 1336             break;
 1337         }
 1338         if (y < 1800 || y > 2500 || p <= 0 || p > pmax) {
 1339             doit = 0;
 1340             break;
 1341         }
 1342         if (i > 0 && p != pbak + 1 && p != 1) {
 1343             doit = 0;
 1344             break;
 1345         }
 1346         pbak = p;
 1347         }
 1348 
 1349         if (doit) {
 1350         for (i=0; i<dset->n; i++) {
 1351             s = dset->S[i];
 1352             sscanf(s, "%c%d %d", &c, &p, &y);
 1353             if (pmax == 12) {
 1354             sprintf(tmp, "%d:%02d", y, p);
 1355             } else {
 1356             sprintf(tmp, "%d:%d", y, p);
 1357             }
 1358             if (strlen(tmp) > strlen(s)) {
 1359             free(s);
 1360             dset->S[i] = gretl_strdup(tmp);
 1361             } else {
 1362             strcpy(s, tmp);
 1363             }
 1364         }
 1365         ret = 1;
 1366         }
 1367     }
 1368     }
 1369 
 1370     return ret;
 1371 }
 1372 
 1373 static int month_number (char *s)
 1374 {
 1375     const char *mo[] = {
 1376     "jan", "feb", "mar", "apr",
 1377     "may", "jun", "jul", "aug",
 1378     "sep", "oct", "nov", "dec"
 1379     };
 1380     int i;
 1381 
 1382     gretl_lower(s);
 1383 
 1384     for (i=0; i<12; i++) {
 1385     if (!strcmp(s, mo[i])) {
 1386         return i+1;
 1387     }
 1388     }
 1389 
 1390     return 0;
 1391 }
 1392 
 1393 /* e.g. "Jan-1980", for monthly or quarterly data */
 1394 
 1395 static int fix_mon_year_labels (DATASET *dset)
 1396 {
 1397     char *s1 = dset->S[0];
 1398     char *s2 = dset->S[dset->n - 1];
 1399     char m1[4] = {0};
 1400     char m2[4] = {0};
 1401     int yr1 = 0, yr2 = 0;
 1402     int ret = 0;
 1403 
 1404     if (strlen(s1) == 8 && strlen(s2) == 8 &&
 1405     s1[3] == '-' && s2[3] == '-') {
 1406     yr1 = atoi(s1 + 4);
 1407     yr2 = atoi(s2 + 4);
 1408     strncat(m1, s1, 3);
 1409     strncat(m2, s2, 3);
 1410     }
 1411 
 1412     if (yr1 > 999 && yr1 < 3000 && yr2 > 999 && yr2 < 3000 &&
 1413     month_number(m1) && month_number(m2)) {
 1414     int i, p, pbak = 0;
 1415     int dt, pd = 0;
 1416     char *s;
 1417 
 1418     for (i=0; i<dset->n; i++) {
 1419         s = dset->S[i];
 1420         if (strlen(s) != 8 || s[3] != '-') {
 1421         pd = 0;
 1422         break;
 1423         }
 1424         yr1 = atoi(s + 4);
 1425         *m1 = '\0';
 1426         strncat(m1, s, 3);
 1427         if (yr1 < 1000 || yr1 >= 3000 ||
 1428         (p = month_number(m1)) < 1) {
 1429         pd = 0;
 1430         break;
 1431         }
 1432         if (i > 0) {
 1433         dt = p - pbak;
 1434         if (dt != 1 && dt != 3 && p != 1) {
 1435             pd = 0;
 1436             break;
 1437         }
 1438         if (pd == 0 && dt > 0) {
 1439             pd = (dt == 1)? 12 : 4;
 1440         }
 1441         }
 1442         pbak = p;
 1443     }
 1444 
 1445     if (pd > 0) {
 1446         for (i=0; i<dset->n; i++) {
 1447         s = dset->S[i];
 1448         yr1 = atoi(s + 4);
 1449         *m1 = '\0';
 1450         strncat(m1, s, 3);
 1451         p = month_number(m1);
 1452         if (pd == 12) {
 1453             sprintf(dset->S[i], "%d:%02d", yr1, p);
 1454         } else {
 1455             sprintf(dset->S[i], "%d:%g", yr1, ceil((3+p)/4.0));
 1456         }
 1457         }
 1458         ret = 1;
 1459     }
 1460     }
 1461 
 1462     return ret;
 1463 }
 1464 
 1465 /* Attempt to parse CSV row labels as dates.  Return -1 if this
 1466    doesn't work out, or 0 if the labels seem to be just integer
 1467    observation numbers, else return the inferred data frequency.
 1468 */
 1469 
 1470 int test_markers_for_dates (DATASET *dset, int *reversed,
 1471                 char *skipstr, PRN *prn)
 1472 {
 1473     char endobs[OBSLEN];
 1474     int n = dset->n;
 1475     char *lbl1 = dset->S[0];
 1476     char *lbl2 = dset->S[n - 1];
 1477     int len1 = strlen(lbl1);
 1478     int len2 = strlen(lbl2);
 1479     int pd = -1;
 1480 
 1481     if (skipstr != NULL && *skipstr != '\0') {
 1482     return time_series_label_check(dset, *reversed, skipstr, 0, prn);
 1483     }
 1484 
 1485     pprintf(prn, A_("   first row label \"%s\", last label \"%s\"\n"),
 1486         lbl1, lbl2);
 1487 
 1488     /* are the labels (probably) just 1, 2, 3 etc.? */
 1489     sprintf(endobs, "%d", n);
 1490     if (!strcmp(lbl1, "1") && !strcmp(lbl2, endobs)) {
 1491     return 0;
 1492     }
 1493 
 1494     if (fix_IFS_data_labels(dset) || fix_mon_year_labels(dset)) {
 1495     lbl1 = dset->S[0];
 1496     lbl2 = dset->S[n - 1];
 1497     len1 = strlen(lbl1);
 1498     }
 1499 
 1500     /* labels are of different lengths? */
 1501     if (len1 != len2) {
 1502     if (abs(len1 - len2) > 1) {
 1503         return -1;
 1504     } else if (len2 > len1) {
 1505         len1 = len2;
 1506     }
 1507     }
 1508 
 1509     pputs(prn, A_("trying to parse row labels as dates...\n"));
 1510 
 1511     if (len1 == 8 || len1 == 10) {
 1512     /* daily data? */
 1513     pd = csv_daily_date_check(dset, reversed, skipstr, prn);
 1514     } else if (len1 >= 4) {
 1515     /* annual, quarterly, monthly? */
 1516     if (isdigit((unsigned char) lbl1[0]) &&
 1517         isdigit((unsigned char) lbl1[1]) &&
 1518         isdigit((unsigned char) lbl1[2]) &&
 1519         isdigit((unsigned char) lbl1[3])) {
 1520         *reversed = dates_maybe_reversed(lbl1, lbl2, prn);
 1521         pd = time_series_label_check(dset, *reversed, skipstr, 0, prn);
 1522     } else {
 1523         pputs(prn, A_("   definitely not a four-digit year\n"));
 1524     }
 1525     }
 1526 
 1527     if (pd <= 0 && *reversed) {
 1528     /* give up the "reversed" notion if we didn't get
 1529        a workable time-series interpretation */
 1530     *reversed = 0;
 1531     }
 1532 
 1533     return pd;
 1534 }
 1535 
 1536 static int utf8_ok (FILE *fp, int pos)
 1537 {
 1538     long mark = ftell(fp);
 1539     int len = pos + 9;
 1540     char *test = malloc(len + 1);
 1541     int i, ret = 0;
 1542 
 1543     fseek(fp, mark - pos - 1, SEEK_SET);
 1544 
 1545     for (i=0; i<len; i++) {
 1546     test[i] = fgetc(fp);
 1547     }
 1548     test[i] = '\0';
 1549 
 1550     if (g_utf8_validate(test, -1, NULL)) {
 1551     ret = 1;
 1552     } else {
 1553     GError *gerr = NULL;
 1554     gsize wrote = 0;
 1555     gchar *tr;
 1556 
 1557     /* try for iso-8859? */
 1558     tr = g_convert(test, -1, "UTF-8", "ISO-8859-15",
 1559                NULL, &wrote, &gerr);
 1560     if (gerr != NULL) {
 1561         g_error_free(gerr);
 1562     } else {
 1563         g_free(tr);
 1564         ret = 1;
 1565     }
 1566     }
 1567 
 1568     free(test);
 1569 
 1570     fseek(fp, mark, SEEK_SET);
 1571 
 1572     return ret;
 1573 }
 1574 
 1575 enum {
 1576     UTF_8 = 1,
 1577     UTF_16,
 1578     UTF_32
 1579 };
 1580 
 1581 /* If we got a UTF-16 or UTF-32 BOM, try recoding to
 1582    UTF-8 before parsing data. We write the recoded text
 1583    to a temporary file in the user's "dotdir" (and
 1584    then delete that file once we're done).
 1585 */
 1586 
 1587 static int csv_recode_input (FILE **fpp,
 1588                  const char *fname,
 1589                  gchar **pfname,
 1590                  int ucode,
 1591                  PRN *prn)
 1592 {
 1593     const gchar *from_set =
 1594     (ucode == UTF_32)? "UTF-32" : "UTF-16";
 1595     gchar *altname = NULL;
 1596     int err = 0;
 1597 
 1598     /* the current stream is not useable as is,
 1599        so shut it down
 1600     */
 1601     fclose(*fpp);
 1602     *fpp = NULL;
 1603 
 1604     /* we'll recode to a temp file in dotdir */
 1605     altname = g_strdup_printf("%srecode_tmp.u8", gretl_dotdir());
 1606 
 1607     err = gretl_recode_file(fname, altname,
 1608                 from_set, "UTF-8",
 1609                 prn);
 1610 
 1611     if (!err) {
 1612     /* try reattaching the stream */
 1613     *fpp = gretl_fopen(altname, "rb");
 1614     if (*fpp == NULL) {
 1615         gretl_remove(altname);
 1616         err = E_FOPEN;
 1617     } else {
 1618         pputs(prn, "switched to recoded input\n");
 1619         *pfname = altname;
 1620         altname = NULL;
 1621     }
 1622     }
 1623 
 1624     g_free(altname);
 1625 
 1626     return err;
 1627 }
 1628 
 1629 /* Check the first 4 bytes of "CSV" input for a Byte Order
 1630    Mark. If we find the UTF-8 BOM (typically written by
 1631    Microsoft tools), simply record the fact so that we can
 1632    skip it on reading. But if we find a BOM indicating a
 1633    16-bit or 32-bit unicode encoding, flag this by returning
 1634    a non-zero @ucode value; in that case we'll attempt a
 1635    full recording of the input (via GLib) before we start
 1636    reading data.
 1637 */
 1638 
 1639 static int csv_unicode_check (FILE *fp, csvdata *c, PRN *prn)
 1640 {
 1641     unsigned char b[4];
 1642     int n = fread(b, 1, 4, fp);
 1643     int ucode = 0;
 1644 
 1645     if (n == 4) {
 1646     if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
 1647         pputs(prn, "got UTF-8 BOM\n");
 1648         ucode = UTF_8;
 1649     } else if (b[0] == 0xFE && b[1] == 0xFF) {
 1650         pputs(prn, "got UTF-16BE, will try recoding\n");
 1651         ucode = UTF_16;
 1652     } else if (b[0] == 0xFF && b[1] == 0xFE) {
 1653         if (b[2] == 0 && b[3] == 0) {
 1654         pputs(prn, "got UTF-32LE, will try recoding\n");
 1655         ucode = UTF_32;
 1656         } else {
 1657         pputs(prn, "got UTF-16LE, will try recoding\n");
 1658         ucode = UTF_16;
 1659         }
 1660     } else if (b[0] == 0 && b[1] == 0 &&
 1661            b[0] == 0xFE && b[1] == 0xFF) {
 1662         pputs(prn, "got UTF-32BE, will try recoding\n");
 1663         ucode = UTF_32;
 1664     }
 1665     }
 1666 
 1667     if (ucode == UTF_8) {
 1668     csv_set_has_bom(c);
 1669     fseek(fp, 3, SEEK_SET);
 1670     ucode = 0;
 1671     } else {
 1672     rewind(fp);
 1673     }
 1674 
 1675     return ucode;
 1676 }
 1677 
 1678 /* The function below checks for the maximum line length in the given
 1679    file.  It also checks for extraneous binary data (the file is
 1680    supposed to be plain text), and checks whether the 'delim'
 1681    character is present in the file, on a non-comment line (where
 1682    a comment line is one that starts with '#').
 1683 
 1684    In addition, we check whether the file has a trailing comma on every
 1685    line, and for the numbers of double- and single-quote characters
 1686    to try to determine which, if either, is used to indicate quoted
 1687    fields in the input.
 1688 */
 1689 
 1690 static int csv_max_line_length (FILE *fp, csvdata *cdata, PRN *prn)
 1691 {
 1692     int c, c1, cbak = 0, cc = 0;
 1693     int comment = 0, maxlinelen = 0;
 1694     int max_ldquo = 0, max_lsquo = 0;
 1695     int ldquo = 0, lsquo = 0;
 1696     int ndquo = 0, nsquo = 0;
 1697     int crlf = 0, lines = 0;
 1698 
 1699     csv_set_trailing_comma(cdata); /* just provisionally */
 1700 
 1701     while ((c = fgetc(fp)) != EOF) {
 1702     if (c == 0x0d) {
 1703         /* CR */
 1704         c1 = fgetc(fp);
 1705         if (c1 == EOF) {
 1706         break;
 1707         } else if (c1 == 0x0a) {
 1708         /* CR + LF -> LF */
 1709         crlf = 1;
 1710         c = c1;
 1711         } else {
 1712         /* Mac-style: CR not followed by LF */
 1713         c = 0x0a;
 1714         ungetc(c1, fp);
 1715         }
 1716     }
 1717     if (c == 0x0a) {
 1718         if (cc > maxlinelen) {
 1719         maxlinelen = cc;
 1720         }
 1721         cc = 0;
 1722         if (cbak != 0 && cbak != ',') {
 1723         csv_unset_trailing_comma(cdata);
 1724         }
 1725         lines++;
 1726         if (ldquo > max_ldquo) {
 1727         max_ldquo = ldquo;
 1728         }
 1729         if (lsquo > max_lsquo) {
 1730         max_lsquo = lsquo;
 1731         }
 1732         ldquo = lsquo = 0;
 1733         continue;
 1734     }
 1735     cbak = c;
 1736     if (!isspace((unsigned char) c) && !isprint((unsigned char) c) &&
 1737         !(c == CTRLZ) && !utf8_ok(fp, cc)) {
 1738         pprintf(prn, A_("Binary data (%d) encountered (line %d:%d): "
 1739                 "this is not a valid text file\n"),
 1740             c, lines + 1, cc + 1);
 1741         return -1;
 1742     }
 1743     if (cc == 0) {
 1744         comment = (c == '#');
 1745     }
 1746     if (!comment) {
 1747         if (c == '\t') {
 1748         /* let's ignore trailing tabs in this heuristic */
 1749         c1 = fgetc(fp);
 1750         if (c1 != 0x0d && c1 != 0x0a) {
 1751             csv_set_got_tab(cdata);
 1752         }
 1753         ungetc(c1, fp);
 1754         }
 1755         if (c == ';') {
 1756         csv_set_got_semi(cdata);
 1757         }
 1758         if (c == cdata->delim) {
 1759         csv_set_got_delim(cdata);
 1760         } else if (c == '"') {
 1761         ldquo++;
 1762         ndquo++;
 1763         } else if (c == '\'') {
 1764         lsquo++;
 1765         nsquo++;
 1766         }
 1767     }
 1768     cc++;
 1769     }
 1770 
 1771     if (maxlinelen == 0) {
 1772     pputs(prn, A_("Data file is empty\n"));
 1773     } else if (csv_has_trailing_comma(cdata)) {
 1774     pputs(prn, A_("Data file has trailing commas\n"));
 1775     }
 1776 
 1777     if (ndquo > 0 || nsquo > 0) {
 1778     /* candidates for quotation character? */
 1779     int cands[2] = {0};
 1780 
 1781     if (ndquo > 0) {
 1782         pprintf(prn, A_("Found %d double-quotes, max %d per line\n"),
 1783             ndquo, max_ldquo);
 1784     }
 1785     if (nsquo > 0) {
 1786         pprintf(prn, A_("Found %d single-quotes, max %d per line\n"),
 1787             nsquo, max_lsquo);
 1788     }
 1789     if (max_ldquo > 0 && max_ldquo % 2 == 0) {
 1790         /* double-quote is a candidate */
 1791         cands[0] = 1;
 1792     }
 1793     if (max_lsquo > 0 && max_lsquo % 2 == 0) {
 1794         /* single-quote is a candidate */
 1795         cands[1] = 1;
 1796     }
 1797     if (cands[0] && cands[1]) {
 1798         /* hmm, rule one out: prefer the more numerous */
 1799         if (nsquo > ndquo) {
 1800         cands[0] = 0;
 1801         } else {
 1802         cands[1] = 0;
 1803         }
 1804     }
 1805     if (cands[0]) {
 1806         pputs(prn, A_("Assuming double-quote is the relevant "
 1807               "quotation character\n"));
 1808         cdata->qchar = '"';
 1809     } else if (cands[1]) {
 1810         pputs(prn, A_("Assuming single-quote is the relevant "
 1811               "quotation character\n"));
 1812         cdata->qchar = '\'';
 1813     }
 1814     }
 1815 
 1816     if (maxlinelen > 0) {
 1817     /* allow for newline and null terminator */
 1818     maxlinelen += 2 + crlf;
 1819     }
 1820 
 1821     return maxlinelen;
 1822 }
 1823 
 1824 #define nonspace_delim(d) (d != ',' && d != ';' && d != '\t')
 1825 
 1826 static int count_csv_fields (csvdata *c)
 1827 {
 1828     const char *s = c->line;
 1829     int inquote = 0;
 1830     int cbak, nf = 0;
 1831 
 1832     if (*s == c->delim && *s == ' ') {
 1833     s++;
 1834     }
 1835 
 1836     while (*s) {
 1837     if (csv_keep_quotes(c) && *s == c->qchar) {
 1838         inquote = !inquote;
 1839     } else if (!inquote && *s == c->delim) {
 1840         nf++;
 1841     }
 1842     cbak = *s;
 1843     s++;
 1844     /* Problem: (when) should a trailing delimiter be read as an
 1845        implicit NA?  For now we'll so treat it if the delimiter
 1846        is not plain space.
 1847     */
 1848     if (*s == '\0' && cbak == c->delim && nonspace_delim(c->delim)) {
 1849         nf--;
 1850     }
 1851     }
 1852 
 1853     return nf + 1;
 1854 }
 1855 
 1856 static void purge_quoted_commas (char *s)
 1857 {
 1858     int inquote = 0;
 1859 
 1860     while (*s) {
 1861     if (*s == '"') {
 1862         inquote = !inquote;
 1863     } else if (inquote && *s == ',') {
 1864         *s = ' ';
 1865     }
 1866     s++;
 1867     }
 1868 }
 1869 
 1870 static void purge_unquoted_spaces (char *s)
 1871 {
 1872     int inquote = 0;
 1873 
 1874     while (*s) {
 1875     if (*s == '"') {
 1876         inquote = !inquote;
 1877     } else if (!inquote && *s == ' ') {
 1878         shift_string_left(s, 1);
 1879     }
 1880     s++;
 1881     }
 1882 }
 1883 
 1884 static void compress_csv_line (csvdata *c, int nospace)
 1885 {
 1886     int n = strlen(c->line);
 1887     char *p = c->line + n - 1;
 1888 
 1889     if (*p == 0x0a) {
 1890     *p = '\0';
 1891     p--;
 1892     }
 1893 
 1894     if (*p == 0x0d) {
 1895     *p = '\0';
 1896     }
 1897 
 1898     if (!csv_keep_quotes(c) && c->delim == ',') {
 1899     purge_quoted_commas(c->line);
 1900     }
 1901 
 1902     if (c->delim != ' ') {
 1903     if (nospace) {
 1904         purge_unquoted_spaces(c->line);
 1905     }
 1906     } else {
 1907     compress_spaces(c->line);
 1908     }
 1909 
 1910     if (!csv_keep_quotes(c)) {
 1911         gretl_delchar('"', c->line);
 1912     }
 1913 
 1914     if (csv_has_trailing_comma(c)) {
 1915     /* chop trailing comma */
 1916     n = strlen(c->line);
 1917     if (n > 0) {
 1918         c->line[n-1] = '\0';
 1919     }
 1920     }
 1921 }
 1922 
 1923 int import_obs_label (const char *s)
 1924 {
 1925     char tmp[VNAMELEN];
 1926 
 1927     if (s == NULL) {
 1928     return 1;
 1929     }
 1930 
 1931     if (!strcmp(s, "\"\"") || !strcmp(s, "''")) {
 1932     return 1;
 1933     }
 1934 
 1935     if (*s == '"' || *s == '\'') s++;
 1936 
 1937     if (*s == '\0') {
 1938     return 1;
 1939     }
 1940 
 1941     if (strlen(s) > VNAMELEN - 1) {
 1942     return 0;
 1943     }
 1944 
 1945     *tmp = '\0';
 1946     strncat(tmp, s, VNAMELEN - 1);
 1947     gretl_lower(tmp);
 1948 
 1949     return (!strcmp(tmp, "obs") ||
 1950         !strcmp(tmp, "date") ||
 1951         !strcmp(tmp, "year") ||
 1952         !strcmp(tmp, "period") ||
 1953         !strcmp(tmp, "observation") ||
 1954         !strcmp(tmp, "observation_date"));
 1955 }
 1956 
 1957 static int join_wants_col_zero (csvdata *c, const char *s)
 1958 {
 1959     const char *colname;
 1960     int i;
 1961 
 1962     if (*s == '\0') {
 1963     return 0;
 1964     }
 1965 
 1966     for (i=0; i<c->jspec->ncols; i++) {
 1967     colname = c->jspec->colnames[i];
 1968     if (colname != NULL && !strcmp(s, colname)) {
 1969         return 1;
 1970     }
 1971     }
 1972 
 1973     return 0;
 1974 }
 1975 
 1976 static void check_first_field (const char *line, csvdata *c, PRN *prn)
 1977 {
 1978     const char *s;
 1979 
 1980  tryagain:
 1981     s = line;
 1982 
 1983     if (c->delim != ' ' && *s == c->delim) {
 1984     csv_set_blank_column(c);
 1985     } else {
 1986     char field1[OBSLEN];
 1987     int i = 0;
 1988 
 1989     if (c->delim == ' ' && *s == ' ') {
 1990         s++;
 1991     }
 1992 
 1993     while (*s && i < sizeof field1) {
 1994         if (*s == c->delim) {
 1995         break;
 1996         } else if (*s == '\t') {
 1997         /* presence of a tab must indicate tab-separation? */
 1998         c->delim = '\t';
 1999         goto tryagain;
 2000         }
 2001         field1[i++] = *s++;
 2002     }
 2003 
 2004     field1[i] = '\0';
 2005     iso_to_ascii(field1);
 2006 
 2007     if (joining(c) && join_wants_col_zero(c, field1)) {
 2008         return;
 2009     } else if (csv_all_cols(c)) {
 2010         /* open/append wants all columns as data */
 2011         return;
 2012     }
 2013 
 2014     pprintf(prn, A_("   first field: '%s'\n"), field1);
 2015 
 2016     if (import_obs_label(field1)) {
 2017         pputs(prn, A_("   seems to be observation label\n"));
 2018         csv_set_obs_column(c);
 2019     }
 2020     }
 2021 }
 2022 
 2023 void import_na_init (void)
 2024 {
 2025     const char *s = get_csv_na_read_string();
 2026 
 2027     strcpy(import_na, s);
 2028 }
 2029 
 2030 /* Returns 1 if the string @s should be counted representing
 2031    an NA or missing value, 0 otherwise. If there is a user-set
 2032    "csv_read_na" value this is used for comparison, otherwise
 2033    a set of default values is consulted.
 2034 */
 2035 
 2036 int import_na_string (const char *s)
 2037 {
 2038     if (*import_na != '\0' && strcmp(import_na, "default")) {
 2039     /* the user has set a specific "NA" string, so
 2040        respect it */
 2041     return !strcmp(s, import_na);
 2042     } else {
 2043     /* consult a list of common representations of NA */
 2044     const char *defaults[] = {
 2045         "NA",
 2046         "N.A.",
 2047         "n.a.",
 2048         "na",
 2049         "n/a",
 2050         "N/A",
 2051         "#N/A",
 2052         "NaN",
 2053         ".NaN",
 2054         ".",
 2055         "..",
 2056         "-999",
 2057         "-9999",
 2058         "-",
 2059         NULL
 2060     };
 2061     int i;
 2062 
 2063     for (i=0; defaults[i] != NULL; i++) {
 2064         if (!strcmp(s, defaults[i])) {
 2065         return 1;
 2066         }
 2067     }
 2068     }
 2069 
 2070     return 0;
 2071 }
 2072 
 2073 static int csv_missval (const char *str, int i, int t,
 2074             int *miss_shown, PRN *prn)
 2075 {
 2076     int miss = 0;
 2077 
 2078     if (*str == '\0') {
 2079     if (miss_shown != NULL) {
 2080         if (t < 80 || *miss_shown < i) {
 2081         pprintf(prn, A_("   the cell for variable %d, obs %d "
 2082                 "is empty: treating as missing value\n"),
 2083             i, t);
 2084         *miss_shown += 1;
 2085         }
 2086     }
 2087     miss = 1;
 2088     }
 2089 
 2090     if (import_na_string(str)) {
 2091     if (miss_shown != NULL) {
 2092         if (t < 80 || *miss_shown < i) {
 2093         pprintf(prn, A_("   warning: missing value for variable "
 2094                 "%d, obs %d\n"), i, t);
 2095         *miss_shown += 1;
 2096         }
 2097     }
 2098     miss = 1;
 2099     }
 2100 
 2101     return miss;
 2102 }
 2103 
 2104 /* In the case where we think we've found thousands
 2105    separators in numerical input, provisionally mark
 2106    all "non-numeric" values as NAs; we do this prior
 2107    to a second pass through the data.
 2108 */
 2109 
 2110 static void revise_non_numeric_values (csvdata *c)
 2111 {
 2112     int i, t;
 2113 
 2114     for (i=1; i<c->dset->v; i++) {
 2115     for (t=0; t<c->dset->n; t++) {
 2116         if (c->dset->Z[i][t] == NON_NUMERIC) {
 2117         c->dset->Z[i][t] = NADBL;
 2118         }
 2119     }
 2120     }
 2121 }
 2122 
 2123 int non_numeric_check (DATASET *dset, int **plist,
 2124                gretl_string_table **pst,
 2125                PRN *prn)
 2126 {
 2127     int *list = NULL;
 2128     int i, j, t, nn = 0;
 2129     int err = 0;
 2130 
 2131 #if CDEBUG > 1
 2132     fprintf(stderr, "non_numeric_check: testing %d series\n", dset->v - 1);
 2133 #endif
 2134 
 2135     if (pst == NULL) {
 2136     /* not interested in string-valued series/columns */
 2137     for (i=1; i<dset->v; i++) {
 2138         for (t=0; t<dset->n; t++) {
 2139         if (dset->Z[i][t] == NON_NUMERIC) {
 2140             dset->Z[i][t] = NADBL;
 2141         }
 2142         }
 2143     }
 2144     return 0;
 2145     }
 2146 
 2147     for (i=1; i<dset->v; i++) {
 2148     for (t=0; t<dset->n; t++) {
 2149         if (dset->Z[i][t] == NON_NUMERIC) {
 2150         nn++;
 2151         break;
 2152         }
 2153     }
 2154     }
 2155 
 2156 #if CDEBUG > 1
 2157     fprintf(stderr, " found %d candidate series\n", nn);
 2158 #endif
 2159 
 2160     if (nn == 0) {
 2161     return 0; /* nothing to be done */
 2162     }
 2163 
 2164     list = gretl_list_new(nn);
 2165     if (list == NULL) {
 2166     return E_ALLOC;
 2167     }
 2168 
 2169     j = 1;
 2170     for (i=1; i<dset->v; i++) {
 2171     for (t=0; t<dset->n; t++) {
 2172         if (dset->Z[i][t] == NON_NUMERIC) {
 2173         list[j++] = i;
 2174         break;
 2175         }
 2176     }
 2177     }
 2178 
 2179 #if CDEBUG > 1
 2180     printlist(list, "non-numeric vars list");
 2181 #endif
 2182 
 2183     for (i=1; i<=list[0]; i++) {
 2184     /* check each member of @list */
 2185     double nnfrac;
 2186     int nnon = 0;
 2187     int nok = 0;
 2188     int tn = 0;
 2189     int v = list[i];
 2190 
 2191     series_set_flag(dset, v, VAR_DISCRETE);
 2192 
 2193     for (t=0; t<dset->n; t++) {
 2194         if (dset->Z[v][t] == NON_NUMERIC) {
 2195         if (tn == 0) {
 2196             /* record the first non-numeric obs */
 2197             tn = t + 1;
 2198         }
 2199         nnon++;
 2200         } else if (!na(dset->Z[v][t])) {
 2201         nok++;
 2202         }
 2203     }
 2204 
 2205     nnfrac = (nok == 0)? 1.0 : (double) nnon / (nnon + nok);
 2206     pprintf(prn, A_("variable %d (%s): non-numeric values = %d "
 2207             "(%.2f percent)\n"), v, dset->varname[v],
 2208         nnon, 100 * nnfrac);
 2209     if ((nnon < 2 && dset->n > 2) || nnfrac < 0.01) {
 2210         /* if we got just a few non-numeric values, we'll assume
 2211            that the data file is broken
 2212         */
 2213         pprintf(prn, A_("ERROR: variable %d (%s), observation %d, "
 2214                 "non-numeric value\n"),
 2215             v, dset->varname[v], tn);
 2216         err = E_DATA;
 2217     }
 2218     }
 2219 
 2220     if (!err) {
 2221     pputs(prn, _("allocating string table\n"));
 2222     *pst = gretl_string_table_new(list);
 2223     if (*pst == NULL) {
 2224         err = E_ALLOC;
 2225     }
 2226     }
 2227 
 2228     if (err) {
 2229     free(list);
 2230     } else {
 2231     *plist = list;
 2232     }
 2233 
 2234     return err;
 2235 }
 2236 
 2237 static int csv_non_numeric_check (csvdata *c, PRN *prn)
 2238 {
 2239     gretl_string_table *st = NULL;
 2240     int *nlist = NULL;
 2241     int err = 0;
 2242 
 2243     if (csv_as_matrix(c)) {
 2244     err = non_numeric_check(c->dset, &nlist, NULL, prn);
 2245     } else {
 2246     err = non_numeric_check(c->dset, &nlist, &st, prn);
 2247     }
 2248 
 2249     if (!err) {
 2250     c->codelist = nlist;
 2251     c->st = st;
 2252     }
 2253 
 2254     return err;
 2255 }
 2256 
 2257 /* Handle the case in "join" where the user specified some time
 2258    columns for conversion to numeric and also gave a specific format
 2259    for the conversion.
 2260 */
 2261 
 2262 static double special_time_val (const char *s, const char *fmt,
 2263                 int m_means_q)
 2264 {
 2265     struct tm t = {0};
 2266     char *test;
 2267 
 2268     test = strptime(s, fmt, &t);
 2269 
 2270     if (test == NULL || *test != '\0') {
 2271     /* conversion didn't work right */
 2272     return NADBL;
 2273     } else {
 2274     int y, m, d;
 2275 
 2276     y = t.tm_year + 1900;
 2277     m = t.tm_mon + 1;
 2278     d = t.tm_mday;
 2279 
 2280     if (m_means_q) {
 2281         /* convert to 1st month of quarter */
 2282         if (m == 2) m = 4;
 2283         else if (m == 3) m = 7;
 2284         else if (m == 4) m = 10;
 2285         else if (m != 1) {
 2286         return NADBL;
 2287         }
 2288     }
 2289 
 2290     if (d == 0) d = 1;
 2291 
 2292     return 10000*y + 100*m + d;
 2293     }
 2294 }
 2295 
 2296 static int char_count (char c, const char *s)
 2297 {
 2298     int n = 0;
 2299 
 2300     while (*s) {
 2301     if (*s == c) n++;
 2302     s++;
 2303     }
 2304 
 2305     return n;
 2306 }
 2307 
 2308 /* Follow-up check for the case where we think we might
 2309    have found a thousands separator: each occurrence of
 2310    the putative separator must be followed by exactly 3
 2311    digits: we set c->thousep to an invalid value if this
 2312    is not the case.
 2313 */
 2314 
 2315 static void validate_thousep (csvdata *c, const char *s)
 2316 {
 2317     int nd;
 2318 
 2319     while (*s) {
 2320     if (*s == c->thousep) {
 2321         nd = 0;
 2322         s++;
 2323         while (*s) {
 2324         if (isdigit(*s)) {
 2325             nd++;
 2326             s++;
 2327         } else {
 2328             break;
 2329         }
 2330         }
 2331         if (nd != 3) {
 2332         /* nope! */
 2333 #if CDEBUG
 2334         fprintf(stderr, "validate_thousep: no: '%c' is followed by %d digits\n",
 2335             c->thousep, nd);
 2336 #endif
 2337         c->thousep = -1;
 2338         break;
 2339         }
 2340     } else {
 2341         s++;
 2342     }
 2343     }
 2344 }
 2345 
 2346 /* Initial heuristic for detecting a thousands separator,
 2347    where the string @s has been determined to contain
 2348    nothing but digits, dot and comma (allowing for a leading
 2349    minus).
 2350 
 2351    1) If the string contains both comma and dot, whichever
 2352    character appears to the left cannot be the decimal
 2353    separator and may be a thousands separator.
 2354 
 2355    2) If more than one comma appears in the string, comma
 2356    cannot be the decimal character and might be a thousands
 2357    separator; mutatis mutandis for dot.
 2358 */
 2359 
 2360 static void test_for_thousands_sep (csvdata *c, const char *s)
 2361 {
 2362     const char *p1 = strrchr(s, '.');
 2363     const char *p2 = strrchr(s, ',');
 2364     char thousep = 0;
 2365 
 2366     if (p1 != NULL && p2 != NULL) {
 2367     thousep = (p2 - p1 > 0)? '.' : ',';
 2368     } else if (p1 != NULL && char_count('.', s) > 0) {
 2369     thousep = '.';
 2370     } else if (p2 != NULL && char_count(',', s) > 0) {
 2371     thousep = ',';
 2372     }
 2373 
 2374     if (c->thousep > 0) {
 2375     if (thousep != 0 && thousep != c->thousep) {
 2376         /* no consistent interpretation exists */
 2377         c->thousep = -1; /* invalid */
 2378     }
 2379     } else if (thousep != 0) {
 2380     /* we have a candidate for testing */
 2381     char *test, tmp[CSVSTRLEN];
 2382 
 2383     strcpy(tmp, s);
 2384     gretl_delchar(thousep, tmp);
 2385     if (thousep == '.' && get_local_decpoint() == '.') {
 2386         gretl_charsub(tmp, ',', '.');
 2387     }
 2388     errno = 0;
 2389     strtod(tmp, &test);
 2390     if (*test == '\0' && errno == 0) {
 2391         c->thousep = thousep;
 2392     }
 2393     }
 2394 
 2395     if (c->thousep && thousep != 0) {
 2396     validate_thousep(c, s);
 2397     }
 2398 }
 2399 
 2400 static int all_digits_and_seps (const char *s)
 2401 {
 2402     const char *test = "0123456789.,";
 2403 
 2404     if (*s == '-') s++;
 2405 
 2406     return strspn(s, test) == strlen(s);
 2407 }
 2408 
 2409 static double eval_non_numeric (csvdata *c, int i, const char *s)
 2410 {
 2411     double x = NON_NUMERIC;
 2412 
 2413     if (series_get_flags(c->dset, i) & VAR_TIMECOL) {
 2414     char *fmt = NULL;
 2415     int mq = 0;
 2416 
 2417     if (timecol_get_format(c->dset, i, &fmt, &mq)) {
 2418         /* the user gave a specific format for this */
 2419         x = special_time_val(s, fmt, mq);
 2420     } else {
 2421         /* default: ISO 8601 extended */
 2422         int y, m, d, n;
 2423 
 2424         n = sscanf(s, "%d-%d-%d", &y, &m, &d);
 2425         if (n == 3) {
 2426         x = 10000*y + 100*m + d;
 2427         } else {
 2428         x = NADBL;
 2429         }
 2430     }
 2431     } else if (c->thousep >= 0 && !csv_scrub_thousep(c)) {
 2432     /* Here we consider the possibility although @s does not
 2433        validate as numeric according to the C library, it is by
 2434        intent numeric but includes one or more thousands
 2435        separators.
 2436 
 2437        The condition c->thousep >= 0 requires that we haven't
 2438        already ruled out this interpretation due to inconsistency,
 2439        and !csv_scrub_thousep(c) requires that we're not on a
 2440        second pass through the data.
 2441     */
 2442     if (all_digits_and_seps(s)) {
 2443         test_for_thousands_sep(c, s);
 2444     }
 2445     }
 2446 
 2447     return x;
 2448 }
 2449 
 2450 static int converted_ok (const char *s, char *test, double x)
 2451 {
 2452     if (*test != '\0') {
 2453     if (errno) perror(s);
 2454     return 0; /* definitely not OK */
 2455     } else if (errno == ERANGE && fabs(x) > 0 && fabs(x) < 0.001) {
 2456     return 1; /* subnormal, but we'll let that pass */
 2457     } else if (errno) {
 2458     perror(s);
 2459     return 0;
 2460     } else {
 2461     return 1;
 2462     }
 2463 }
 2464 
 2465 static char *csv_unquote (char *s)
 2466 {
 2467     if (s[0] == '"') {
 2468     int i, n = strlen(s);
 2469 
 2470     if (n > 1 && s[n-1] == '"') {
 2471         for (i=0; i<n-2; i++) {
 2472         s[i] = s[i+1];
 2473         }
 2474         s[i] = '\0';
 2475     }
 2476     }
 2477     return s;
 2478 }
 2479 
 2480 static double csv_atof (csvdata *c, int i)
 2481 {
 2482     char tmp[CSVSTRLEN], clean[CSVSTRLEN];
 2483     double x = NON_NUMERIC;
 2484     const char *s = c->str;
 2485     char *test;
 2486 
 2487     if (csv_scrub_thousep(c) && strchr(s, c->thousep) &&
 2488     all_digits_and_seps(s)) {
 2489     /* second pass through the data: pre-process fields
 2490        that we reckon include thousands separators
 2491     */
 2492     strcpy(clean, s);
 2493     gretl_delchar(c->thousep, clean);
 2494     s = clean;
 2495     }
 2496 
 2497     if (c->decpoint == '.' || !csv_do_dotsub(c) || strchr(s, ',') == NULL) {
 2498     /* either we're currently set to the correct locale,
 2499        or there's no problematic decimal point in @s
 2500     */
 2501     errno = 0;
 2502     x = strtod(s, &test);
 2503     if (converted_ok(s, test, x)) {
 2504         return x; /* handled */
 2505     }
 2506     } else if (csv_do_dotsub(c)) {
 2507     /* in C numeric locale: substitute dot for comma */
 2508     strcpy(tmp, s);
 2509     gretl_charsub(tmp, ',', '.');
 2510     errno = 0;
 2511     x = strtod(tmp, &test);
 2512     if (converted_ok(s, test, x)) {
 2513         return x; /* handled */
 2514     }
 2515     }
 2516 
 2517     if (c->decpoint == '.' && strchr(s, ',') != NULL) {
 2518     /* try remediation for decimal comma? */
 2519     strcpy(tmp, s);
 2520     gretl_charsub(tmp, ',', '.');
 2521     errno = 0;
 2522     x = strtod(tmp, &test);
 2523     if (converted_ok(s, test, x)) {
 2524         return x; /* handled */
 2525     }
 2526     }
 2527 
 2528     /* fallback */
 2529     /* revised 2020-02-13 to use csv_unquote */
 2530     return eval_non_numeric(c, i, csv_unquote(c->str));
 2531 }
 2532 
 2533 static int process_csv_obs (csvdata *c, int i, int t, int *miss_shown,
 2534                 PRN *prn)
 2535 {
 2536     int err = 0;
 2537 
 2538     if (c->st != NULL) {
 2539     /* second round, handling string-valued variables */
 2540     if (in_gretl_list(c->codelist, i)) {
 2541         double zit = c->dset->Z[i][t];
 2542         int ix;
 2543 
 2544         if (na(zit) && *c->str != '\0' && c->user_na == NULL) {
 2545         /* by default (no user_na) only blanks count as NAs */
 2546         zit = NON_NUMERIC;
 2547         }
 2548         if (!na(zit)) {
 2549         ix = gretl_string_table_index(c->st, c->str, i, 0, prn);
 2550         if (ix > 0) {
 2551             c->dset->Z[i][t] = (double) ix;
 2552         } else {
 2553             err = E_DATA;
 2554         }
 2555         }
 2556     }
 2557     } else if (csv_missval(c->str, i, t+1, miss_shown, prn)) {
 2558     c->dset->Z[i][t] = NADBL;
 2559     } else {
 2560     gretl_strstrip(c->str);
 2561     c->dset->Z[i][t] = csv_atof(c, i);
 2562     }
 2563 
 2564     return err;
 2565 }
 2566 
 2567 /* Emulation of fgets(), designed to handle any sort of line
 2568    termination (unix, DOS, Mac or even an unholy mixture).
 2569    Line-endings are converted to LF (0x0a).
 2570 */
 2571 
 2572 static char *csv_fgets (csvdata *cdata, FILE *fp)
 2573 {
 2574     char *s = cdata->line;
 2575     int n = cdata->maxlinelen;
 2576     int i, c1, c = 0;
 2577 
 2578     for (i=0; i<n-1 && c!=0x0a; i++) {
 2579     c = fgetc(fp);
 2580     if (c == EOF) {
 2581         if (i == 0) {
 2582         /* signal end of read */
 2583         return NULL;
 2584         } else {
 2585         break;
 2586         }
 2587     } else if (c == 0x0d) {
 2588         /* CR: convert to LF and peek at next char: if it's
 2589            LF swallow it, otherwise put it back */
 2590         c = 0x0a;
 2591         c1 = fgetc(fp);
 2592         if (c1 != 0x0a) {
 2593         ungetc(c1, fp);
 2594         }
 2595     }
 2596     s[i] = c;
 2597     }
 2598 
 2599     s[i] = '\0';
 2600 
 2601     return s;
 2602 }
 2603 
 2604 /* pick up any comments following the data block in a CSV file */
 2605 
 2606 static char *get_csv_descrip (csvdata *c, FILE *fp)
 2607 {
 2608     char *line = c->line;
 2609     char *desc = NULL;
 2610     size_t llen, totlen;
 2611 
 2612     while (csv_fgets(c, fp)) {
 2613     tailstrip(line);
 2614     llen = strlen(line);
 2615     if (desc == NULL) {
 2616         totlen = llen + 4;
 2617         desc = malloc(totlen);
 2618         if (desc == NULL) {
 2619         return NULL;
 2620         }
 2621         sprintf(desc, "%s\n", line);
 2622     } else {
 2623         char *tmp;
 2624 
 2625         totlen = strlen(desc) + llen + 4;
 2626         tmp = realloc(desc, totlen);
 2627         if (tmp == NULL) {
 2628         free(desc);
 2629         return NULL;
 2630         }
 2631         desc = tmp;
 2632         strcat(desc, line);
 2633         strcat(desc, "\n");
 2634     }
 2635     }
 2636 
 2637     if (desc != NULL && string_is_blank(desc)) {
 2638     free(desc);
 2639     desc = NULL;
 2640     }
 2641 
 2642     return desc;
 2643 }
 2644 
 2645 static const char *
 2646 csv_msg = N_("\nPlease note:\n"
 2647          "- The first row of the CSV file should contain the "
 2648          "names of the variables.\n"
 2649          "- The first column may optionally contain date "
 2650          "strings or other 'markers':\n  in that case its row 1 entry "
 2651          "should be blank, or should say 'obs' or 'date'.\n"
 2652          "- The remainder of the file must be a rectangular "
 2653          "array of data.\n");
 2654 
 2655 /* Here we check whether we get a consistent reading on
 2656    the number of fields per line in the CSV file
 2657 */
 2658 
 2659 static int csv_fields_check (FILE *fp, csvdata *c, PRN *prn)
 2660 {
 2661     int gotdata = 0;
 2662     int chkcols = 0;
 2663     int err = 0;
 2664 
 2665     c->ncols = c->nrows = 0;
 2666 
 2667     if (csv_has_bom(c)) {
 2668     fseek(fp, 3, SEEK_SET);
 2669     }
 2670 
 2671     while (csv_fgets(c, fp) && !err) {
 2672 
 2673     /* skip comment lines */
 2674     if (*c->line == '#') {
 2675         continue;
 2676     }
 2677 
 2678     /* skip blank lines -- but finish if the blank comes after data */
 2679     if (string_is_blank(c->line)) {
 2680         if (gotdata) {
 2681         if (!csv_have_data(c)) {
 2682             c->descrip = get_csv_descrip(c, fp);
 2683         }
 2684         break;
 2685         } else {
 2686         continue;
 2687         }
 2688     }
 2689 
 2690     c->nrows += 1;
 2691 
 2692     if (fixed_format(c)) {
 2693         tailstrip(c->line);
 2694         gotdata = 1;
 2695         chkcols = strlen(c->line);
 2696         if (chkcols < c->cols_list[c->cols_list[0]]) {
 2697         gretl_errmsg_set(_("Invalid column specification"));
 2698         err = E_DATA;
 2699         break;
 2700         } else {
 2701         continue;
 2702         }
 2703     }
 2704 
 2705     compress_csv_line(c, 1);
 2706 
 2707     if (!gotdata) {
 2708         /* scrutinize the first "real" line */
 2709         check_first_field(c->line, c, prn);
 2710         gotdata = 1;
 2711     }
 2712 
 2713     chkcols = count_csv_fields(c);
 2714     if (c->ncols == 0) {
 2715         c->ncols = chkcols;
 2716         pprintf(prn, A_("   number of columns = %d\n"), c->ncols);
 2717     } else if (chkcols != c->ncols) {
 2718         pprintf(prn, A_("   ...but row %d has %d fields: aborting\n"),
 2719             c->nrows, chkcols);
 2720         err = E_DATA;
 2721     } else if (cols_subset(c)) {
 2722         int datacols = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;
 2723 
 2724         if (c->cols_list[c->cols_list[0]] > datacols) {
 2725         gretl_errmsg_set(_("Invalid column specification"));
 2726         err = E_DATA;
 2727         }
 2728     }
 2729     }
 2730 
 2731     if (!err && fixed_format(c)) {
 2732     c->ncols = c->cols_list[0];
 2733     }
 2734 
 2735     return err;
 2736 }
 2737 
 2738 static void strip_illegals (char *s)
 2739 {
 2740     char name[VNAMELEN] = {0};
 2741     int i, j = 0;
 2742 
 2743     for (i=0; s[i] != '\0'; i++) {
 2744     if (isalnum(s[i]) || s[i] == '_') {
 2745         name[j++] = s[i];
 2746     }
 2747     }
 2748 
 2749     name[j] = '\0';
 2750     strcpy(s, name);
 2751 }
 2752 
 2753 static int intercept_nan_as_name (const char *s)
 2754 {
 2755     if (strlen(s) == 3) {
 2756     char screen[4];
 2757 
 2758     strcpy(screen, s);
 2759     gretl_lower(screen);
 2760     if (!strcmp(screen, "nan")) {
 2761         return 1;
 2762     }
 2763     }
 2764 
 2765     return 0;
 2766 }
 2767 
 2768 static int csv_is_numeric (const char *s, csvdata *c)
 2769 {
 2770     int ret = 0;
 2771 
 2772     if (c->decpoint == '.') {
 2773     ret = numeric_string(s);
 2774     } else {
 2775     /* decimal comma in force */
 2776     char *tmp = gretl_strdup(s);
 2777 
 2778     gretl_charsub(tmp, ',', '.');
 2779     ret = numeric_string(tmp);
 2780     free(tmp);
 2781     }
 2782 
 2783     return ret;
 2784 }
 2785 
 2786 static int process_csv_varname (csvdata *c, int j, int *numcount,
 2787                 PRN *prn)
 2788 {
 2789     char *vname = c->dset->varname[j];
 2790     char *src = c->str;
 2791     int err = 0;
 2792 
 2793     *vname = '\0';
 2794 
 2795     if (intercept_nan_as_name(src)) {
 2796     gretl_errmsg_sprintf(_("If '%s' is intended as the name of a variable, "
 2797                    "please change it --\nstrings of this sort usually "
 2798                    "mean 'not a number'."), src);
 2799     err = E_DATA;
 2800     } else if (*src == '\0') {
 2801     fprintf(stderr, "variable name %d is missing\n", j);
 2802     sprintf(vname, "v%d", j);
 2803     } else if (csv_is_numeric(src, c)) {
 2804     *numcount += 1;
 2805     } else {
 2806     const char *s = src;
 2807 
 2808     while (*s && !isalpha(*s)) s++;
 2809     if (*s == '\0') {
 2810         fprintf(stderr, "variable name %d (%s) is garbage\n", j, src);
 2811         sprintf(vname, "v%d", j);
 2812     } else {
 2813         strncat(vname, s, VNAMELEN - 1);
 2814     }
 2815     iso_to_ascii(vname);
 2816     strip_illegals(vname);
 2817     if (check_varname(vname)) {
 2818         errmsg(1, prn);
 2819         err = E_DATA;
 2820     }
 2821     }
 2822 
 2823     return err;
 2824 }
 2825 
 2826 static int csv_reconfigure_for_markers (DATASET *dset)
 2827 {
 2828     int err = dataset_allocate_obs_markers(dset);
 2829 
 2830     if (!err) {
 2831     err = dataset_drop_last_variables(dset, 1);
 2832     }
 2833 
 2834     return err;
 2835 }
 2836 
 2837 static int skip_data_column (csvdata *c, int k)
 2838 {
 2839     int col = csv_skip_col_1(c) ? k : k + 1;
 2840 
 2841     if (!in_gretl_list(c->cols_list, col)) {
 2842     return 1;
 2843     } else {
 2844     return 0;
 2845     }
 2846 }
 2847 
 2848 /* special fix-up for column names in the context of "join":
 2849    the algorithm here is also used in the userspace fixname()
 2850    function
 2851 */
 2852 
 2853 void normalize_join_colname (char *targ, const char *src,
 2854                  int underscore, int k)
 2855 {
 2856     const char *letters = "abcdefghijklmnopqrstuvwxyz"
 2857     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 2858     int i = 0;
 2859 
 2860     /* skip any leading non-letters */
 2861     src += strcspn(src, letters);
 2862 
 2863     while (*src && i < VNAMELEN - 1) {
 2864     if (strspn(src, letters) > 0 || isdigit(*src) || *src == '_') {
 2865         /* transcribe valid characters */
 2866         targ[i++] = *src;
 2867     } else if (*src == ' ' || underscore) {
 2868         /* convert space to underscore */
 2869         if (i > 0 && targ[i-1] == '_') {
 2870         ; /* skip */
 2871         } else {
 2872         targ[i++] = '_';
 2873         }
 2874     }
 2875     src++;
 2876     }
 2877 
 2878     if (i > 0) {
 2879     targ[i] = '\0';
 2880     } else if (k <= 0) {
 2881     strcpy(targ, "col[n]");
 2882     } else {
 2883     sprintf(targ, "col%d", k);
 2884     }
 2885 }
 2886 
 2887 static int update_join_cols_list (csvdata *c, int k)
 2888 {
 2889     int *test;
 2890     int err = 0;
 2891 
 2892     test = gretl_list_append_term(&c->cols_list, k);
 2893     if (test == NULL) {
 2894     err = E_ALLOC;
 2895     }
 2896 
 2897 #if CDEBUG
 2898     printlist(c->cols_list, "c->cols_list for join");
 2899 #endif
 2900 
 2901     return err;
 2902 }
 2903 
 2904 /* handle_join_varname: the index @k contains the column number
 2905    relative to the entire CSV file, while @pj points to j, the column
 2906    number relative to the reduced dataset that will be constructed by
 2907    selection of columns from the file.
 2908 
 2909    Here we're examining a column heading read from file (c->str) to
 2910    see whether it matches any of the column-names required for an
 2911    ongoing join operation (held in c->jspec->colnames). If so, we
 2912    write the index j into the appropriate slot in c->jspec->colnums
 2913    (which starts off filled with zeros), so the joiner will know where
 2914    to find the required data. (The j value is bound to be at least 1
 2915    since column 0 is reserved to the constant.)
 2916 
 2917    In some cases a given named column may perform more than one role in
 2918    a join operation -- for example, it may serve as an element in a
 2919    filter and also as the auxiliary variable in an "aggregation"
 2920    method. To allow for this we don't stop scanning at the first match
 2921    of c->str with a required column name.
 2922 
 2923    The call to update_join_cols_list() uses the index @k to record the
 2924    overall column position of "wanted data", for use by the CSV
 2925    reader.
 2926 */
 2927 
 2928 static int handle_join_varname (csvdata *c, int k, int *pj)
 2929 {
 2930     const char *colname;
 2931     char okname[VNAMELEN];
 2932     int matched = 0;
 2933     int i, j = *pj;
 2934 
 2935     if (!csv_skip_col_1(c)) {
 2936     k++;
 2937     }
 2938 
 2939     if (csv_no_header(c)) {
 2940     sprintf(okname, "col%d", k);
 2941     } else {
 2942     /* convert to valid gretl identifier */
 2943     normalize_join_colname(okname, c->str, 0, k);
 2944     }
 2945 
 2946 #if CDEBUG
 2947     fprintf(stderr, "handle_join_varname: looking at '%s' (%s)\n", c->str, okname);
 2948 #endif
 2949 
 2950     for (i=0; i<c->jspec->ncols; i++) {
 2951     /* find "wanted name" i */
 2952     colname = c->jspec->colnames[i];
 2953     if (colname == NULL || c->jspec->colnums[i] > 0) {
 2954         /* name not wanted, or already found */
 2955         continue;
 2956     }
 2957     if (!strcmp(okname, colname)) {
 2958 #if CDEBUG
 2959         fprintf(stderr, " target %d matched at CSV col %d, j=%d\n", i, k, j);
 2960 #endif
 2961         c->jspec->colnums[i] = j;
 2962         if (!matched) {
 2963         matched = 1;
 2964         strcpy(c->dset->varname[j], okname);
 2965         update_join_cols_list(c, k);
 2966         *pj += 1;
 2967         if (in_gretl_list(c->jspec->timecols, i)) {
 2968             series_set_flag(c->dset, j, VAR_TIMECOL);
 2969         }
 2970         }
 2971     }
 2972     }
 2973 
 2974     return 0;
 2975 }
 2976 
 2977 #define starts_number(c) (isdigit((unsigned char) c) || c == '-' || \
 2978                           c == '+' || c == '.')
 2979 
 2980 #define obs_labels_no_varnames(o,c,n)  (!o && c->v > 3 && n == c->v - 2)
 2981 
 2982 static int csv_varname_scan (csvdata *c, FILE *fp, PRN *prn, PRN *mprn)
 2983 {
 2984     char *p;
 2985     int obscol = csv_has_obs_column(c);
 2986     int i, j, k, numcount;
 2987     int err = 0;
 2988 
 2989     if (!csv_no_header(c)) {
 2990     pputs(mprn, A_("scanning for variable names...\n"));
 2991     }
 2992 
 2993     if (csv_has_bom(c)) {
 2994     fseek(fp, 3, SEEK_SET);
 2995     }
 2996 
 2997     while (csv_fgets(c, fp)) {
 2998     if (*c->line == '#' || string_is_blank(c->line)) {
 2999         continue;
 3000     } else {
 3001         break;
 3002     }
 3003     }
 3004 
 3005     c->datapos = ftell(fp);
 3006 
 3007     compress_csv_line(c, 1);
 3008 
 3009     p = c->line;
 3010     if (c->delim == ' ' && *p == ' ') p++;
 3011     iso_to_ascii(p);
 3012 
 3013     if (strlen(p) > 118) {
 3014     pprintf(mprn, A_("   line: %.115s...\n"), p);
 3015     } else {
 3016     pprintf(mprn, A_("   line: %s\n"), p);
 3017     }
 3018 
 3019     numcount = 0;
 3020     j = 1; /* for the constant */
 3021 
 3022     for (k=0; k<c->ncols && !err; k++) {
 3023     i = 0;
 3024     while (*p && *p != c->delim) {
 3025         if (i < CSVSTRLEN - 1) {
 3026         c->str[i++] = *p;
 3027         }
 3028         p++;
 3029     }
 3030     c->str[i] = '\0';
 3031     if (*p == c->delim) p++;
 3032 
 3033     if (k == 0 && csv_skip_col_1(c)) {
 3034         ; /* no-op */
 3035     } else if (!joining(c) && cols_subset(c) && skip_data_column(c, k)) {
 3036         ; /* no-op */
 3037     } else {
 3038         if (joining(c)) {
 3039         handle_join_varname(c, k, &j);
 3040         } else if (probing(c) && csv_no_header(c)) {
 3041         sprintf(c->dset->varname[j], "col%d", j);
 3042         j++;
 3043         } else {
 3044         err = process_csv_varname(c, j, &numcount, prn);
 3045         j++;
 3046         }
 3047     }
 3048     if (j == c->dset->v) {
 3049 #if CDEBUG
 3050         fprintf(stderr, "breaking on j = %d (k = %d)\n", j, k);
 3051 #endif
 3052         break;
 3053     }
 3054     }
 3055 
 3056     if (!err && joining(c) && c->cols_list == NULL) {
 3057     /* no relevant columns were found */
 3058     gretl_errmsg_set("No relevant columns were found");
 3059     err = E_UNKVAR;
 3060     }
 3061 
 3062     if (err) {
 3063     return err;
 3064     }
 3065 
 3066     if (csv_no_header(c) || numcount == c->dset->v - 1 ||
 3067     obs_labels_no_varnames(obscol, c->dset, numcount)) {
 3068     if (!csv_no_header(c)) {
 3069         pputs(prn, A_("it seems there are no variable names\n"));
 3070         /* then we undercounted the observations by one? */
 3071         if (!rows_subset(c)) {
 3072         err = add_single_obs(c->dset);
 3073         }
 3074     }
 3075     if (!err) {
 3076         /* set up to handle the "no varnames" case */
 3077         csv_set_autoname(c);
 3078         c->datapos = csv_has_bom(c) ? 3 : 0;
 3079         if (!csv_all_cols(c)) {
 3080         if (obs_labels_no_varnames(obscol, c->dset, numcount)) {
 3081             err = csv_reconfigure_for_markers(c->dset);
 3082             if (!err) {
 3083             csv_set_obs_column(c);
 3084             }
 3085         }
 3086         }
 3087     }
 3088     } else if (numcount > 0) {
 3089     for (i=1; i<c->dset->v; i++) {
 3090         if (check_varname(c->dset->varname[i])) {
 3091         errmsg(1, prn);
 3092         break;
 3093         }
 3094     }
 3095     fprintf(stderr, "numcount = %d\n", numcount);
 3096     err = E_DATA;
 3097     }
 3098 
 3099     return err;
 3100 }
 3101 
 3102 static int row_not_wanted (csvdata *c, int t)
 3103 {
 3104     if (c->rowmask != NULL) {
 3105     if (t >= c->masklen) {
 3106         return 1;
 3107     } else if (gretl_vector_get(c->rowmask, t) == 0) {
 3108         return 1;
 3109     }
 3110     }
 3111 
 3112     return 0;
 3113 }
 3114 
 3115 /* read numerical data when we've been given a fixed column-reading
 3116    specification */
 3117 
 3118 static int fixed_format_read (csvdata *c, FILE *fp, PRN *prn)
 3119 {
 3120     char *p;
 3121     int miss_shown = 0;
 3122     int *missp = NULL;
 3123     int t = 0, s = 0;
 3124     int i, k, n, m;
 3125     int err = 0;
 3126 
 3127     c->real_n = c->dset->n;
 3128 
 3129     if (csv_has_bom(c)) {
 3130     fseek(fp, 3, SEEK_SET);
 3131     }
 3132 
 3133     if (csv_is_verbose(c)) {
 3134     missp = &miss_shown;
 3135     }
 3136 
 3137     while (csv_fgets(c, fp) && !err) {
 3138     tailstrip(c->line);
 3139     if (*c->line == '#' || string_is_blank(c->line)) {
 3140         continue;
 3141     }
 3142     if (row_not_wanted(c, s)) {
 3143         s++;
 3144         continue;
 3145     }
 3146     m = strlen(c->line);
 3147     for (i=1; i<=c->ncols && !err; i++) {
 3148         k = c->cols_list[i];
 3149         n = c->width_list[i];
 3150         if (k + n - 1 > m) {
 3151         /* attempting to read out of bounds */
 3152         fprintf(stderr, "row %d, column %d: start=%d, width=%d, "
 3153             "but line length = %d\n", t+1, i, k, n, m);
 3154         err = E_DATA;
 3155         break;
 3156         }
 3157         p = c->line + k - 1;
 3158         *c->str = '\0';
 3159         strncat(c->str, p, n);
 3160         /* Added 2016-11-16: allow trailing blanks in a field
 3161            of specified width. This is required for handling
 3162            US CPS data.
 3163         */
 3164         tailstrip(c->str);
 3165         if (csv_missval(c->str, i, t+1, missp, prn)) {
 3166         c->dset->Z[i][t] = NADBL;
 3167         } else {
 3168         c->dset->Z[i][t] = csv_atof(c, i);
 3169         if (c->dset->Z[i][t] == NON_NUMERIC) {
 3170             gretl_errmsg_sprintf(_("At row %d, column %d:\n"), t+1, k);
 3171             gretl_errmsg_sprintf(_("'%s' -- no numeric conversion performed!"),
 3172                      c->str);
 3173             err = E_DATA;
 3174         }
 3175         }
 3176     }
 3177     s++;
 3178     if (++t == c->dset->n) {
 3179         break;
 3180     }
 3181     }
 3182 
 3183     if (err == E_DATA) {
 3184     gretl_errmsg_set(_("Invalid column specification"));
 3185     }
 3186 
 3187     return err;
 3188 }
 3189 
 3190 #define XML1_OK(u) ((u>=0x0020 && u<=0xD7FF) || \
 3191             (u>=0xE000 && u<=0xFFFD))
 3192 
 3193 /* Check that an observation label contains only
 3194    valid UTF-8, and moreover that every character
 3195    is valid in XML 1.0. If not, try recoding from
 3196    ISO 8859.
 3197 */
 3198 
 3199 static int maybe_fix_csv_string (gchar *s)
 3200 {
 3201     int err = 0;
 3202 
 3203     if (!g_utf8_validate(s, -1, NULL)) {
 3204     GError *gerr = NULL;
 3205     gsize wrote = 0;
 3206     gchar *tr;
 3207 
 3208     /* try for iso-8859? */
 3209     tr = g_convert(s, -1, "UTF-8", "ISO-8859-15",
 3210                NULL, &wrote, &gerr);
 3211     if (gerr != NULL) {
 3212         gretl_errmsg_set(gerr->message);
 3213         g_error_free(gerr);
 3214         err = E_DATA;
 3215     } else {
 3216         *s = '\0';
 3217         gretl_utf8_strncat(s, tr, CSVSTRLEN-1);
 3218         g_free(tr);
 3219     }
 3220     }
 3221 
 3222     if (!err) {
 3223     int i, n = g_utf8_strlen(s, -1);
 3224     gunichar u;
 3225 
 3226     for (i=0; i<n; i++) {
 3227         u = g_utf8_get_char(s);
 3228         if (!XML1_OK(u)) {
 3229         return 0;
 3230         }
 3231         s = g_utf8_next_char(s);
 3232     }
 3233     }
 3234 
 3235     return err;
 3236 }
 3237 
 3238 static void transcribe_obs_label (csvdata *c, int t)
 3239 {
 3240     char *s = c->str;
 3241     char c0 = *s;
 3242     int n = strlen(s);
 3243 
 3244     /* skip a leading quote, and unquote fully
 3245        if a matching trailing quote is found
 3246     */
 3247 
 3248     if (c0 == '"' || c0 == '\'') {
 3249     if (s[n-1] == c0) {
 3250         s[n-1] = '\0';
 3251         n--;
 3252     }
 3253     s++;
 3254     n--;
 3255     /* and once more, with feeling... */
 3256     if (s[0] == '\'') {
 3257         s++;
 3258         n--;
 3259     }
 3260     }
 3261 
 3262     if (n > OBSLEN - 1) {
 3263     n = OBSLEN - 1;
 3264     }
 3265 
 3266     c->dset->S[t][0] = '\0';
 3267     gretl_utf8_strncat(c->dset->S[t], s, n);
 3268 }
 3269 
 3270 static int real_read_labels_and_data (csvdata *c, FILE *fp, PRN *prn)
 3271 {
 3272     char *p;
 3273     int miss_shown = 0;
 3274     int *missp = NULL;
 3275     int truncated = 0;
 3276     int t = 0, s = 0;
 3277     int i, j, k;
 3278     int err = 0;
 3279 
 3280     if (csv_is_verbose(c)) {
 3281     missp = &miss_shown;
 3282     }
 3283 
 3284     c->real_n = c->dset->n;
 3285 
 3286     while (csv_fgets(c, fp) && !err) {
 3287     int inquote = 0;
 3288 
 3289     if (*c->line == '#' || string_is_blank(c->line)) {
 3290         continue;
 3291     } else if (*c->skipstr != '\0' && strstr(c->line, c->skipstr)) {
 3292         c->real_n -= 1;
 3293         continue;
 3294     } else if (row_not_wanted(c, s)) {
 3295         s++;
 3296         continue;
 3297     }
 3298 
 3299     compress_csv_line(c, 0);
 3300     p = c->line;
 3301 
 3302     if (c->delim == ' ') {
 3303         if (*p == ' ') p++;
 3304     } else {
 3305         p += strspn(p, " ");
 3306     }
 3307 
 3308     j = 1;
 3309     for (k=0; k<c->ncols && !err; k++) {
 3310         i = 0;
 3311         while (*p) {
 3312         if (csv_keep_quotes(c) && *p == c->qchar) {
 3313             inquote = !inquote;
 3314         } else if (!inquote && *p == c->delim) {
 3315             break;
 3316         }
 3317         if (i < CSVSTRLEN - 1) {
 3318             c->str[i++] = *p;
 3319         } else {
 3320             truncated++;
 3321         }
 3322         p++;
 3323         }
 3324         c->str[i] = '\0';
 3325         err = maybe_fix_csv_string(c->str);
 3326         if (!err) {
 3327         if (k == 0 && csv_skip_col_1(c) && c->dset->S != NULL) {
 3328             transcribe_obs_label(c, t);
 3329         } else if (cols_subset(c) && skip_data_column(c, k)) {
 3330             ; /* no-op */
 3331         } else {
 3332             err = process_csv_obs(c, j++, t, missp, prn);
 3333         }
 3334         }
 3335         if (!err) {
 3336         /* prep for next column */
 3337         if (*p == c->delim) {
 3338             p++;
 3339         }
 3340         if (c->delim != ' ') {
 3341             p += strspn(p, " ");
 3342         }
 3343         }
 3344     }
 3345 
 3346     s++;
 3347     if (++t == c->dset->n) {
 3348         break;
 3349     }
 3350     }
 3351 
 3352     if (truncated) {
 3353     pprintf(prn, A_("warning: %d labels were truncated.\n"), truncated);
 3354     }
 3355 
 3356     if (!err && c->real_n < c->dset->n) {
 3357     int drop = c->dset->n - c->real_n;
 3358 
 3359     err = dataset_drop_observations(c->dset, drop);
 3360     }
 3361 
 3362     return err;
 3363 }
 3364 
 3365 /* When reading a CSV file, should we attempt to parse observation
 3366    strings as dates (and impose time-series structure on the data
 3367    if this is successful)? In general, yes, but maybe not if we're
 3368    reading the data in the context of a "join" operation, since
 3369    in this case automatic detection may collide with time-key
 3370    information supplied by the user. Current status: we'll skip
 3371    the auto-dating stuff when joining unless (a) it's a MIDAS
 3372    join (mixed frequencies) and the user has _not_ supplied any
 3373    time key specification.
 3374 */
 3375 
 3376 static int csv_skip_dates (csvdata *c)
 3377 {
 3378     if (c->jspec != NULL) {
 3379     /* with --aggr=spread (MIDAS) we'll need dates info,
 3380        unless the user have a time key spec
 3381     */
 3382     return c->jspec->auto_midas == 0;
 3383     } else {
 3384     return 0;
 3385     }
 3386 }
 3387 
 3388 static int csv_read_data (csvdata *c, FILE *fp, PRN *prn, PRN *mprn)
 3389 {
 3390     int reversed = csv_data_reversed(c);
 3391     int err;
 3392 
 3393     if (mprn != NULL) {
 3394     if (csv_all_cols(c)) {
 3395         pputs(mprn, A_("scanning for data...\n"));
 3396     } else {
 3397         pputs(mprn, A_("scanning for row labels and data...\n"));
 3398     }
 3399     }
 3400 
 3401     fseek(fp, c->datapos, SEEK_SET);
 3402 
 3403     err = real_read_labels_and_data(c, fp, prn);
 3404 
 3405     if (!err && csv_skip_col_1(c) && !rows_subset(c) && !csv_skip_dates(c)) {
 3406     c->markerpd = test_markers_for_dates(c->dset, &reversed,
 3407                          c->skipstr, prn);
 3408     if (reversed) {
 3409         csv_set_data_reversed(c);
 3410     }
 3411     }
 3412 
 3413     return err;
 3414 }
 3415 
 3416 static void print_csv_parsing_header (const char *fname, PRN *prn)
 3417 {
 3418     if (!g_utf8_validate(fname, -1, NULL)) {
 3419     gchar *trfname = g_locale_to_utf8(fname, -1, NULL, NULL, NULL);
 3420 
 3421     pprintf(prn, "%s %s...\n", A_("parsing"), trfname);
 3422     g_free(trfname);
 3423     } else {
 3424     pprintf(prn, "%s %s...\n", A_("parsing"), fname);
 3425     }
 3426 }
 3427 
 3428 static int join_unique_columns (csvdata *c)
 3429 {
 3430     const char **cnames = c->jspec->colnames;
 3431     char *counted;
 3432     int i, j, ncols = 0;
 3433 
 3434     counted = calloc(c->jspec->ncols, 1);
 3435 
 3436     for (i=0; i<c->jspec->ncols; i++) {
 3437     if (cnames[i] != NULL && counted[i] == 0) {
 3438         counted[i] = 1;
 3439         /* mark any duplicates as counted too */
 3440         for (j=i+1; j<c->jspec->ncols; j++) {
 3441         if (cnames[j] != NULL && !strcmp(cnames[i], cnames[j])) {
 3442             counted[j] = 1;
 3443         }
 3444         }
 3445 #if CDEBUG
 3446         fprintf(stderr, "join_unique_columns: '%s'\n", cnames[i]);
 3447 #endif
 3448         ncols++;
 3449     }
 3450     }
 3451 
 3452     free(counted);
 3453 
 3454     return ncols;
 3455 }
 3456 
 3457 static int csv_set_dataset_dimensions (csvdata *c)
 3458 {
 3459     int err = 0;
 3460 
 3461     c->dset->v = 0;
 3462 
 3463     if (rows_subset(c)) {
 3464     c->dset->n = n_from_row_mask(c);
 3465     }
 3466 
 3467     if (fixed_format(c)) {
 3468     if (c->dset->n == 0) {
 3469         c->dset->n = c->nrows;
 3470     }
 3471     c->dset->v = c->ncols + 1;
 3472     } else {
 3473     int cols_wanted, cols_present;
 3474 
 3475     if (c->dset->n == 0) {
 3476         if (csv_no_header(c)) {
 3477         c->dset->n = c->nrows;
 3478         } else {
 3479         /* allow for varnames row */
 3480         c->dset->n = c->nrows - 1;
 3481         }
 3482     }
 3483 
 3484     cols_present = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;
 3485 
 3486     if (joining(c)) {
 3487         cols_wanted = join_unique_columns(c);
 3488     } else if (cols_subset(c)) {
 3489         cols_wanted = c->cols_list[0];
 3490     } else {
 3491         cols_wanted = cols_present;
 3492     }
 3493 
 3494     if (cols_wanted > cols_present) {
 3495         gretl_errmsg_set(_("Invalid column specification"));
 3496         err = E_DATA;
 3497     } else {
 3498         /* allow for the constant */
 3499         c->dset->v = cols_wanted + 1;
 3500     }
 3501     }
 3502 
 3503     if (probing(c)) {
 3504     /* don't allocate tons of space for data that
 3505        we won't read right now */
 3506     c->dset->n = 1;
 3507     }
 3508 
 3509 #if CDEBUG
 3510     if (joining(c)) {
 3511     fprintf(stderr, "csv dataset dimensions: v=%d, n=%d\n",
 3512         c->dset->v, c->dset->n);
 3513     }
 3514 #endif
 3515 
 3516     return err;
 3517 }
 3518 
 3519 /*
 3520  * real_import_csv:
 3521  * @fname: name of CSV file.
 3522  * @dset: dataset struct.
 3523  * @cols: column specification.
 3524  * @rows: row specification.
 3525  * @join: specification pertaining to "join" command.
 3526  * @probe: also pertains to "join" (via GUI).
 3527  * @pm: location of matrix to accept the data or NULL.
 3528  * @opt: use OPT_N to force interpretation of data colums containing
 3529  * strings as coded (non-numeric) values and not errors; use OPT_H
 3530  * to indicate absence of a header row; use OPT_A to indicate that
 3531  * all columns should be read as data series (i.e. do not try to
 3532  * interpret the first column as observation labels); for use of
 3533  * OPT_T see the help text for the "append" command.
 3534  * @prn: gretl printing struct (or NULL).
 3535  *
 3536  * Open a Comma-Separated Values data file and read the data into
 3537  * the current work space.
 3538  *
 3539  * Returns: 0 on successful completion, non-zero otherwise.
 3540  */
 3541 
 3542 static int real_import_csv (const char *fname,
 3543                 DATASET *dset,
 3544                 const char *cols,
 3545                 const char *rows,
 3546                 joinspec *join,
 3547                 csvprobe *probe,
 3548                 gretl_matrix **pm,
 3549                 gretlopt opt,
 3550                 PRN *prn)
 3551 {
 3552     csvdata *c = NULL;
 3553     FILE *fp = NULL;
 3554     PRN *mprn = NULL;
 3555     gchar *altname = NULL;
 3556     int recode = 0;
 3557     int popit = 0;
 3558     int i, err = 0;
 3559 
 3560     import_na_init();
 3561 
 3562     if (prn != NULL) {
 3563     set_alt_gettext_mode(prn);
 3564     }
 3565 
 3566     if (gretl_messages_on()) {
 3567     mprn = prn;
 3568     }
 3569 
 3570     fp = gretl_fopen(fname, "rb");
 3571     if (fp == NULL) {
 3572     pprintf(prn, A_("Couldn't open %s\n"), fname);
 3573     err = E_FOPEN;
 3574     goto csv_bailout;
 3575     }
 3576 
 3577     c = csvdata_new(dset);
 3578     if (c == NULL) {
 3579     err = E_ALLOC;
 3580     goto csv_bailout;
 3581     }
 3582 
 3583     recode = csv_unicode_check(fp, c, prn);
 3584     if (recode) {
 3585     err = csv_recode_input(&fp, fname, &altname, recode, prn);
 3586     if (err) {
 3587         goto csv_bailout;
 3588     }
 3589     }
 3590 
 3591     if (cols != NULL) {
 3592     err = csvdata_add_cols_list(c, cols, opt);
 3593     if (err) {
 3594         goto csv_bailout;
 3595     } else if (fixed_format(c)) {
 3596         pprintf(mprn, A_("using fixed column format\n"));
 3597     }
 3598     }
 3599 
 3600     if (rows != NULL) {
 3601     err = csvdata_add_row_mask(c, rows);
 3602     if (err) {
 3603         goto csv_bailout;
 3604     }
 3605     }
 3606 
 3607     if (opt & OPT_H) {
 3608     csv_set_no_header(c);
 3609     }
 3610 
 3611     if (join != NULL) {
 3612     c->jspec = join;
 3613     c->flags |= CSV_HAVEDATA;
 3614     } else if (probe != NULL) {
 3615     c->probe = probe;
 3616     c->flags |= CSV_HAVEDATA;
 3617     } else {
 3618     if (pm != NULL) {
 3619         csv_set_as_matrix(c);
 3620     }
 3621         if (opt & OPT_A) {
 3622         csv_set_all_cols(c);
 3623         }
 3624         if (opt & OPT_V) {
 3625             csv_set_verbose(c);
 3626         }
 3627     }
 3628 
 3629     if (opt & OPT_I) {
 3630     csv_unset_keep_quotes(c);
 3631     }
 3632 
 3633     if (mprn != NULL) {
 3634     print_csv_parsing_header(fname, mprn);
 3635     }
 3636 
 3637     /* get line length, also check for binary data, etc. */
 3638     c->maxlinelen = csv_max_line_length(fp, c, prn);
 3639     if (c->maxlinelen <= 0) {
 3640     err = E_DATA;
 3641     goto csv_bailout;
 3642     }
 3643 
 3644     if (csv_as_matrix(c) && csv_got_semi(c)) {
 3645     if (c->delim == ',' && csv_got_delim(c)) {
 3646         c->decpoint = ',';
 3647     }
 3648     c->delim = ';';
 3649     } else if (!fixed_format(c) && !csv_got_delim(c)) {
 3650     /* set default delimiter */
 3651     if (csv_got_tab(c)) {
 3652         c->delim = '\t';
 3653     } else if (csv_got_semi(c)) {
 3654         c->delim = ';';
 3655     } else {
 3656         c->delim = ' ';
 3657     }
 3658     }
 3659 
 3660 #if CDEBUG
 3661     fprintf(stderr, "fixed_format? %s; got_delim (%c)? %s; got_tab? %s; ",
 3662         fixed_format(c) ? "yes" : "no", c->delim,
 3663         csv_got_delim(c) ? "yes" : "no",
 3664         csv_got_tab(c)? "yes" : "no");
 3665     fprintf(stderr, "decpoint '%c'\n", c->decpoint);
 3666 #endif
 3667 
 3668     /* buffer to hold lines */
 3669     c->line = malloc(c->maxlinelen);
 3670     if (c->line == NULL) {
 3671     err = E_ALLOC;
 3672     goto csv_bailout;
 3673     }
 3674 
 3675  alt_delim:
 3676 
 3677     if (mprn != NULL) {
 3678     if (!fixed_format(c)) {
 3679         pprintf(mprn, A_("using delimiter '%c'\n"), c->delim);
 3680     }
 3681     pprintf(mprn, A_("   longest line: %d characters\n"), c->maxlinelen - 1);
 3682     }
 3683 
 3684     if (csv_has_trailing_comma(c) && c->delim != ',') {
 3685     csv_unset_trailing_comma(c);
 3686     }
 3687 
 3688     rewind(fp);
 3689 
 3690     /* read lines, check for consistency in number of fields */
 3691     err = csv_fields_check(fp, c, mprn);
 3692     if (err && !fixed_format(c)) {
 3693     if (c->delim != ';' && csv_got_semi(c)) {
 3694         c->delim = ';';
 3695         err = 0;
 3696         goto alt_delim;
 3697     }
 3698     pputs(prn, A_(csv_msg));
 3699     goto csv_bailout;
 3700     }
 3701 
 3702     err = csv_set_dataset_dimensions(c);
 3703     if (err) {
 3704     err = E_DATA;
 3705     goto csv_bailout;
 3706     }
 3707 
 3708     pprintf(mprn, A_("   number of variables: %d\n"), c->dset->v - 1);
 3709     pprintf(mprn, A_("   number of non-blank lines: %d\n"), c->nrows);
 3710 
 3711     if (c->dset->n == 0) {
 3712     pputs(prn, A_("Invalid data file\n"));
 3713     err = E_DATA;
 3714     goto csv_bailout;
 3715     }
 3716 
 3717     /* initialize CSV dataset */
 3718     err = start_new_Z(c->dset, 0);
 3719     if (!err && csv_skip_col_1(c)) {
 3720     err = dataset_allocate_obs_markers(c->dset);
 3721     }
 3722 
 3723     if (err) {
 3724     goto csv_bailout;
 3725     }
 3726 
 3727     /* second pass */
 3728 
 3729     rewind(fp);
 3730 
 3731     if (fixed_format(c)) {
 3732     err = fixed_format_read(c, fp, prn);
 3733     if (err) {
 3734         goto csv_bailout;
 3735     } else {
 3736         csv_set_autoname(c);
 3737         goto csv_continue;
 3738     }
 3739     }
 3740 
 3741     err = csv_varname_scan(c, fp, prn, mprn);
 3742     if (err || probing(c)) {
 3743     goto csv_bailout;
 3744     }
 3745 
 3746     if (c->decpoint == '.' && get_local_decpoint() == ',') {
 3747     /* we're in a locale that uses decimal comma:
 3748        switch to the C locale */
 3749     gretl_push_c_numeric_locale();
 3750     popit = 1;
 3751     } else if (c->decpoint == ',' && get_local_decpoint() == '.') {
 3752     /* dotsub: define this if we're in a '.' locale and
 3753        we've figured that the decimal character is ',' in
 3754        the file we're reading
 3755     */
 3756     csv_set_dotsub(c);
 3757     }
 3758 
 3759     err = csv_read_data(c, fp, prn, mprn);
 3760 
 3761     if (!err) {
 3762     /* try again, under certain conditions */
 3763     if (csv_skip_bad(c)) {
 3764         err = csv_read_data(c, fp, prn, NULL);
 3765     } else if (c->thousep > 0) {
 3766         pprintf(mprn, A_("WARNING: it seems '%c' is being used "
 3767                  "as thousands separator\n"), c->thousep);
 3768         c->decpoint = (c->thousep == '.')? ',' : '.';
 3769         if (c->decpoint == ',') {
 3770         if (get_local_decpoint() == '.') {
 3771             csv_set_dotsub(c);
 3772         } else if (popit) {
 3773             gretl_pop_c_numeric_locale();
 3774             popit = 0;
 3775         }
 3776         }
 3777         revise_non_numeric_values(c);
 3778         csv_set_scrub_thousep(c);
 3779         err = csv_read_data(c, fp, prn, NULL);
 3780     }
 3781     }
 3782 
 3783     if (!err && !probing(c)) {
 3784     err = csv_non_numeric_check(c, prn);
 3785     if (!err && csv_has_non_numeric(c)) {
 3786         /* try once more */
 3787         err = csv_read_data(c, fp, prn, NULL);
 3788     }
 3789     }
 3790 
 3791     if (popit) {
 3792     gretl_pop_c_numeric_locale();
 3793     }
 3794 
 3795     if (err) {
 3796     goto csv_bailout;
 3797     }
 3798 
 3799     if (csv_data_reversed(c)) {
 3800     reverse_data(c->dset, mprn);
 3801     }
 3802 
 3803  csv_continue:
 3804 
 3805     c->dset->t1 = 0;
 3806     c->dset->t2 = c->dset->n - 1;
 3807 
 3808     if (c->markerpd > 0) {
 3809     pputs(mprn, A_("taking date information from row labels\n\n"));
 3810     if (csv_skip_bad(c)) {
 3811         pprintf(prn, "WARNING: Check your data! gretl has stripped out "
 3812             "what appear to be\nextraneous lines in a %s dataset: "
 3813             "this may not be right.\n\n",
 3814             (c->dset->pd == 4)? "quarterly" : "monthly");
 3815     }
 3816     } else {
 3817     pputs(mprn, A_("treating these as undated data\n\n"));
 3818     dataset_obs_info_default(c->dset);
 3819     }
 3820 
 3821     if (c->dset->pd != 1 || strcmp(c->dset->stobs, "1")) {
 3822         c->dset->structure = TIME_SERIES;
 3823     }
 3824 
 3825     if (c->st != NULL) {
 3826     err = gretl_string_table_validate(c->st);
 3827     if (err) {
 3828         pputs(prn, A_("Failed to interpret the data as numeric\n"));
 3829         goto csv_bailout;
 3830     } else if (joining(c)) {
 3831         gretl_string_table_save(c->st, c->dset);
 3832     } else {
 3833         gretl_string_table_print(c->st, c->dset, fname, prn);
 3834     }
 3835     }
 3836 
 3837     if (csv_as_matrix(c)) {
 3838     /* FIXME placement of this */
 3839     if (csv_autoname(c)) {
 3840         strings_array_free(c->dset->varname, c->dset->v);
 3841         c->dset->varname = NULL;
 3842     }
 3843     *pm = gretl_matrix_data_subset(NULL, c->dset, -1, -1,
 3844                        M_MISSING_OK, &err);
 3845     goto csv_bailout;
 3846     }
 3847 
 3848     /* If there were observation labels and they were not interpretable
 3849        as dates, and they weren't simply "1, 2, 3, ...", then they
 3850        should probably be preserved; otherwise discard them.
 3851     */
 3852     if (c->dset->S != NULL && c->markerpd >= 0 &&
 3853     c->dset->markers != DAILY_DATE_STRINGS) {
 3854     dataset_destroy_obs_markers(c->dset);
 3855     }
 3856 
 3857     if (csv_autoname(c)) {
 3858     /* no variable names were found */
 3859     for (i=1; i<c->dset->v; i++) {
 3860         sprintf(c->dset->varname[i], "v%d", i);
 3861     }
 3862     } else {
 3863 #if CDEBUG
 3864     int ii;
 3865 
 3866     for (ii=0; ii<c->dset->v; ii++) {
 3867         fprintf(stderr, " c->dset->varname[%d] = '%s'\n", ii, c->dset->varname[ii]);
 3868     }
 3869 #endif
 3870     if (fix_varname_duplicates(c->dset)) {
 3871         pputs(prn, A_("warning: some variable names were duplicated\n"));
 3872     }
 3873     }
 3874 
 3875     if (!joining(c) && !probing(c)) {
 3876     int newdata = (dset->Z == NULL);
 3877 
 3878     /* not doing a special "join" operation */
 3879     err = merge_or_replace_data(dset, &c->dset, get_merge_opts(opt), prn);
 3880 
 3881     if (!err && newdata && c->descrip != NULL) {
 3882         dset->descrip = c->descrip;
 3883         c->descrip = NULL;
 3884     }
 3885 
 3886     if (!err && newdata) {
 3887         dataset_add_import_info(dset, fname, GRETL_CSV);
 3888     }
 3889     }
 3890 
 3891  csv_bailout:
 3892 
 3893     if (fp != NULL) {
 3894     fclose(fp);
 3895     }
 3896 
 3897     if (!err && c->jspec != NULL) {
 3898     c->jspec->c = c;
 3899     } else if (!err && c->probe != NULL) {
 3900     c->probe->dset = c->dset;
 3901     c->dset = NULL;
 3902     csvdata_free(c);
 3903     } else {
 3904     csvdata_free(c);
 3905     }
 3906 
 3907     if (altname != NULL) {
 3908     gretl_remove(altname);
 3909     g_free(altname);
 3910     }
 3911 
 3912     if (err == E_ALLOC) {
 3913     pputs(prn, A_("Out of memory\n"));
 3914     }
 3915 
 3916     return err;
 3917 }
 3918 
 3919 /**
 3920  * import_csv:
 3921  * @fname: name of CSV file.
 3922  * @dset: dataset struct.
 3923  * @opt: use OPT_N to force interpretation of data colums containing
 3924  * strings as coded (non-numeric) values and not errors; for use of
 3925  * OPT_T see the help for "append".
 3926  * @prn: gretl printing struct (or NULL).
 3927  *
 3928  * Open a Comma-Separated Values data file and read the data into
 3929  * the current work space.
 3930  *
 3931  * Returns: 0 on successful completion, non-zero otherwise.
 3932  */
 3933 
 3934 int import_csv (const char *fname, DATASET *dset,
 3935         gretlopt opt, PRN *prn)
 3936 {
 3937     const char *cols = NULL;
 3938     const char *rows = NULL;
 3939     int ci, err;
 3940 
 3941     err = incompatible_options(opt, OPT_F | OPT_L);
 3942     if (err) {
 3943     /* --cols and --fixed-cols */
 3944     return err;
 3945     }
 3946 
 3947     ci = (dset != NULL && dset->v > 0)? APPEND : OPEN;
 3948 
 3949     if (opt & OPT_F) {
 3950     /* we should have a "--fixed-cols=XXX" specification */
 3951     cols = get_optval_string(ci, OPT_F);
 3952     if (cols == NULL || *cols == '\0') {
 3953         return E_PARSE;
 3954     }
 3955     } else if (opt & OPT_L) {
 3956     /* should have a "--cols=XXX" specification */
 3957     cols = get_optval_string(ci, OPT_L);
 3958     if (cols == NULL || *cols == '\0') {
 3959         return E_PARSE;
 3960     }
 3961     }
 3962 
 3963     if (opt & OPT_M) {
 3964     /* we should have a "--rowmask=XXX" specification */
 3965     rows = get_optval_string(ci, OPT_M);
 3966     if (rows == NULL || *rows == '\0') {
 3967         return E_PARSE;
 3968     }
 3969     }
 3970 
 3971     return real_import_csv(fname, dset, cols, rows,
 3972                NULL, NULL, NULL, opt, prn);
 3973 }
 3974 
 3975 gretl_matrix *import_csv_as_matrix (const char *fname, int *err)
 3976 {
 3977 #if CDEBUG
 3978     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
 3979 #else
 3980     PRN *prn = NULL;
 3981 #endif
 3982     gretl_matrix *m = NULL;
 3983     char csvname[MAXLEN] = {0};
 3984     gretlopt opt = OPT_A; /* --all-cols */
 3985     int http = 0;
 3986 
 3987     *err = try_http(fname, csvname, &http);
 3988 
 3989     if (!*err && http) {
 3990     *err = real_import_csv(csvname, NULL, NULL, NULL,
 3991                    NULL, NULL, &m, opt, prn);
 3992     } else if (!*err) {
 3993     *err = real_import_csv(fname, NULL, NULL, NULL,
 3994                    NULL, NULL, &m, opt, prn);
 3995     }
 3996 
 3997     gretl_print_destroy(prn);
 3998 
 3999     return m;
 4000 }
 4001 
 4002 static int probe_varnames_check (DATASET *dset, gretlopt opt,
 4003                  int *rerun)
 4004 {
 4005     int missnames = 0;
 4006     int i, err = 0;
 4007 
 4008     for (i=1; i<dset->v; i++) {
 4009     if (dset->varname[i][0] == '\0') {
 4010         missnames = 1;
 4011         break;
 4012     }
 4013     }
 4014 
 4015     if (missnames) {
 4016     if (opt & OPT_H) {
 4017         gretl_errmsg_set("Couldn't find all variable names");
 4018         err = E_DATA;
 4019     } else {
 4020         *rerun = 1;
 4021     }
 4022     }
 4023 
 4024     return err;
 4025 }
 4026 
 4027 /**
 4028  * probe_csv:
 4029  * @fname: name of CSV file.
 4030  * @varnames: location to receive variable names.
 4031  * @nvars: location to receive number of variables (columns).
 4032  * @opt: on input, may contain any extra options to pass to
 4033  * real_import_csv(); on return, OPT_H (indicating that the
 4034  * CSV file has no header) may be added if it seems to be
 4035  * required (no header).
 4036  *
 4037  * Open a Comma-Separated Values data file and read enough to
 4038  * determine the variable names.
 4039  *
 4040  * Returns: 0 on successful completion, non-zero otherwise.
 4041  */
 4042 
 4043 int probe_csv (const char *fname, char ***varnames,
 4044            int *nvars, gretlopt *opt)
 4045 {
 4046     csvprobe probe = {0};
 4047     int err;
 4048 
 4049     err = real_import_csv(fname, NULL, NULL, NULL, NULL,
 4050               &probe, NULL, *opt, NULL);
 4051 
 4052     if (!err) {
 4053     int rerun = 0;
 4054 
 4055     err = probe_varnames_check(probe.dset, *opt, &rerun);
 4056 
 4057     if (err || rerun) {
 4058         destroy_dataset(probe.dset);
 4059         probe.dset = NULL;
 4060     }
 4061 
 4062     if (!err && rerun) {
 4063         /* try again with --no-header flag */
 4064         *opt |= OPT_H;
 4065         err = real_import_csv(fname, NULL, NULL, NULL, NULL,
 4066                   &probe, NULL, *opt, NULL);
 4067     }
 4068 
 4069     if (!err) {
 4070         /* steal the varname array */
 4071         *varnames = probe.dset->varname;
 4072         *nvars = probe.dset->v;
 4073         probe.dset->varname = NULL;
 4074     }
 4075 
 4076     destroy_dataset(probe.dset);
 4077     }
 4078 
 4079     return err;
 4080 }
 4081 
 4082 int csv_open_needs_matrix (gretlopt opt)
 4083 {
 4084     int ret = 0;
 4085 
 4086     if (opt & OPT_M) {
 4087     /* --rowmask=matrix */
 4088     ret = 1;
 4089     } else if (opt & OPT_F) {
 4090     /* --fixed-cols=whatever */
 4091     const char *s = get_optval_string(OPEN, OPT_F);
 4092 
 4093     ret = get_matrix_by_name(s) != NULL;
 4094     }
 4095 
 4096     return ret;
 4097 }
 4098 
 4099 typedef double keynum;
 4100 #define KEYNUM_FMT "%g"
 4101 
 4102 /* below: apparatus to implement the "join" command */
 4103 
 4104 struct jr_row_ {
 4105     int n_keys;     /* number of keys (needed for qsort callback) */
 4106     keynum keyval;  /* primary key value */
 4107     keynum keyval2; /* secondary key value, if applicable */
 4108     int micro;      /* high-frequency "key", if any */
 4109     int dset_row;   /* associated row in the RHS or outer dataset */
 4110     double aux;     /* auxiliary value */
 4111 };
 4112 
 4113 typedef struct jr_row_ jr_row;
 4114 
 4115 struct obskey_ {
 4116     char *timefmt; /* time format, as in strptime */
 4117     int keycol;    /* the column holding the outer time-key */
 4118     int m_means_q; /* "monthly means quarterly" */
 4119     int numdates;  /* flag for conversion from numeric to string */
 4120     int native;    /* native time-series info */
 4121 };
 4122 
 4123 typedef struct obskey_ obskey;
 4124 
 4125 struct joiner_ {
 4126     int n_rows;     /* number of rows in data table */
 4127     int n_keys;     /* number of keys used (0, 1 or 2) */
 4128     int n_unique;   /* number of unique primary key values on right */
 4129     jr_row *rows;   /* array of table rows */
 4130     keynum *keys;   /* array of unique (primary) key values as 64-bit ints */
 4131     int *key_freq;  /* counts of occurrences of (primary) key values */
 4132     int *key_row;   /* record of starting row in joiner table for primary keys */
 4133     int *str_keys;  /* flags for string comparison of key(s) */
 4134     const int *l_keyno; /* list of key columns in left-hand dataset */
 4135     const int *r_keyno; /* list of key columns in right-hand dataset */
 4136     AggrType aggr;      /* aggregation method for 1:n joining */
 4137     int seqval;         /* sequence number for aggregation */
 4138     int auxcol;         /* auxiliary data column for aggregation */
 4139     int midas_m;        /* midas frequency ratio */
 4140     int midas_pd;       /* frequency of outer dataset */
 4141     obskey *auto_keys;  /* struct to hold info on obs-based key(s) */
 4142     DATASET *l_dset;    /* the left-hand or inner dataset */
 4143     DATASET *r_dset;    /* the right-hand or outer temporary dataset */
 4144 };
 4145 
 4146 typedef struct joiner_ joiner;
 4147 
 4148 struct jr_filter_ {
 4149     const char *expr;  /* expression to be run through "genr" */
 4150     const double *val; /* (series) result of evaluating @expr */
 4151     char *vname1;      /* first right-hand variable name */
 4152     char *vname2;      /* second right-hand variable name */
 4153     char *vname3;      /* third right-hand variable name */
 4154 };
 4155 
 4156 typedef struct jr_filter_ jr_filter;
 4157 
 4158 static int expand_jspec (joinspec *jspec, int addvars);
 4159 
 4160 static void jr_filter_destroy (jr_filter *f)
 4161 {
 4162     if (f != NULL) {
 4163     free(f->vname1);
 4164     free(f->vname2);
 4165     free(f->vname3);
 4166     free(f);
 4167     }
 4168 }
 4169 
 4170 static void joiner_destroy (joiner *jr)
 4171 {
 4172     if (jr != NULL) {
 4173     free(jr->rows);
 4174     free(jr->keys);
 4175     free(jr->key_freq);
 4176     free(jr->key_row);
 4177     free(jr);
 4178     }
 4179 }
 4180 
 4181 static joiner *joiner_new (int nrows)
 4182 {
 4183     joiner *jr = malloc(sizeof *jr);
 4184 
 4185     if (jr != NULL) {
 4186     jr->rows = calloc(nrows, sizeof *jr->rows);
 4187     if (jr->rows == NULL) {
 4188         free(jr);
 4189         jr = NULL;
 4190     }
 4191     }
 4192 
 4193     if (jr != NULL) {
 4194     jr->n_rows = nrows;
 4195     jr->n_unique = 0;
 4196     jr->keys = NULL;
 4197     jr->key_freq = NULL;
 4198     jr->key_row = NULL;
 4199     jr->l_keyno = NULL;
 4200     jr->r_keyno = NULL;
 4201     }
 4202 
 4203     return jr;
 4204 }
 4205 
 4206 static int real_set_outer_auto_keys (joiner *jr, const char *s,
 4207                      int j, struct tm *tp)
 4208 {
 4209     int err = 0;
 4210 
 4211     if (calendar_data(jr->l_dset)) {
 4212     int y, m, d, eday;
 4213 
 4214     y = tp->tm_year + 1900;
 4215     m = tp->tm_mon + 1;
 4216     d = tp->tm_mday;
 4217     eday = epoch_day_from_ymd(y, m, d);
 4218     if (eday < 0) {
 4219         if (s != NULL) {
 4220         gretl_errmsg_sprintf("'%s' is not a valid date", s);
 4221         }
 4222         err = E_DATA;
 4223     } else {
 4224         jr->rows[j].n_keys = 1;
 4225         jr->rows[j].keyval = eday;
 4226         jr->rows[j].keyval2 = 0;
 4227         jr->rows[j].micro = 0;
 4228     }
 4229     } else {
 4230     int major = tp->tm_year + 1900;
 4231     int minor = tp->tm_mon + 1;
 4232     int micro = 0;
 4233 
 4234     if (jr->auto_keys->m_means_q) {
 4235         /* using the gretl-specific "%q" conversion */
 4236         if (minor > 4) {
 4237         gretl_errmsg_sprintf("'%s' is not a valid date", s);
 4238         err = E_DATA;
 4239         }
 4240     } else if (jr->l_dset->pd == 4) {
 4241         /* map from month on right to quarter on left, but
 4242            preserve the month info in case we need it
 4243         */
 4244         micro = minor;
 4245         minor = (int) ceil(minor / 3.0);
 4246     }
 4247     if (!err && micro == 0) {
 4248         micro = tp->tm_mday;
 4249     }
 4250     if (!err) {
 4251         jr->rows[j].n_keys = 2;
 4252         jr->rows[j].keyval = major;
 4253         jr->rows[j].keyval2 = minor;
 4254         jr->rows[j].micro = micro;
 4255     }
 4256     }
 4257 
 4258     return err;
 4259 }
 4260 
 4261 static int set_time_format (obskey *auto_keys, const char *fmt)
 4262 {
 4263     if (auto_keys->timefmt != NULL) {
 4264     free(auto_keys->timefmt);
 4265     }
 4266     auto_keys->timefmt = gretl_strdup(fmt);
 4267     return auto_keys->timefmt == NULL ? E_ALLOC : 0;
 4268 }
 4269 
 4270 /* convert a numerical value to string for use with strptime */
 4271 
 4272 static int numdate_to_string (char *targ, double x)
 4273 {
 4274     if (na(x)) {
 4275     return E_MISSDATA;
 4276     } else {
 4277     sprintf(targ, "%.16g", x);
 4278     return 0;
 4279     }
 4280 }
 4281 
 4282 /* Parse a string from row @i of the outer dataset and set the
 4283    key(s) on row @j of the joiner struct. The indices @i and @j may
 4284    not be equal if a filter is being used. Note: we don't come
 4285    here if the outer time-key column is subject to "tconvert"
 4286    treatment; in that case we use read_iso_basic instead.
 4287 */
 4288 
 4289 static int read_outer_auto_keys (joiner *jr, int j, int i)
 4290 {
 4291     char *tfmt = jr->auto_keys->timefmt;
 4292     int numdates = jr->auto_keys->numdates;
 4293     int tcol = jr->auto_keys->keycol;
 4294     int pd = jr->l_dset->pd;
 4295     struct tm t = {0};
 4296     char sconv[32];
 4297     const char *s;
 4298     char *test;
 4299     int s_src = 0;
 4300     int err = 0;
 4301 
 4302     if (tcol >= 0) {
 4303     /* using a specified column */
 4304     if (numdates) {
 4305         /* column is numeric, conversion needed */
 4306         numdate_to_string(sconv, jr->r_dset->Z[tcol][i]);
 4307         s = sconv;
 4308         s_src = 1;
 4309     } else {
 4310         /* column is string-valued, OK */
 4311         s = series_get_string_for_obs(jr->r_dset, tcol, i);
 4312         s_src = 2;
 4313     }
 4314     } else if (jr->auto_keys->native) {
 4315     /* using native time-series info on right */
 4316     ntodate_8601(sconv, i, jr->r_dset);
 4317     s = sconv;
 4318     s_src = 1;
 4319     } else {
 4320     /* using first-column observation strings */
 4321     s = jr->r_dset->S[i];
 4322     s_src = 3;
 4323     }
 4324 
 4325     /* note: with strptime, a NULL return means that an error
 4326        occurred while a non-NULL and non-empty return string
 4327        means a trailing portion of the input was not
 4328        processed.
 4329     */
 4330     test = strptime(s, tfmt, &t);
 4331 
 4332     if (test == NULL || *test != '\0') {
 4333     err = E_DATA;
 4334     if (j == 0 && test != NULL && (pd == 12 || pd == 4 || pd == 1)) {
 4335         /* If we're looking at the first row of the filtered data,
 4336            allow for the possibility that we got "excess
 4337            precision", i.e. a daily date string when the left-hand
 4338            dataset is monthly, quarterly or annual.
 4339         */
 4340         char *chk = strptime(s, "%Y-%m-%d", &t);
 4341 
 4342         if (chk != NULL && *chk == '\0') {
 4343         set_time_format(jr->auto_keys, "%Y-%m-%d");
 4344         err = 0; /* we might be OK, cancel the error for now */
 4345         }
 4346     }
 4347     if (err) {
 4348         gretl_errmsg_sprintf("'%s' does not match the format '%s'", s, tfmt);
 4349         fprintf(stderr, "time-format match error in read_outer_auto_keys:\n"
 4350             " remainder = '%s' (source = %s)\n", test ? test : "null",
 4351             s_src < 3 ? "specified time column" : "first-column strings");
 4352     }
 4353     }
 4354 
 4355     if (!err) {
 4356     err = real_set_outer_auto_keys(jr, s, j, &t);
 4357     }
 4358 
 4359     return err;
 4360 }
 4361 
 4362 static int read_iso_basic (joiner *jr, int j, int i)
 4363 {
 4364     int tcol = jr->auto_keys->keycol;
 4365     double x;
 4366     int err = 0;
 4367 
 4368     x = jr->r_dset->Z[tcol][i];
 4369 
 4370     if (na(x)) {
 4371     err = E_MISSDATA;
 4372     } else {
 4373     int y = (int) floor(x / 10000);
 4374     int m = (int) floor((x - 10000*y) / 100);
 4375     int d = (int) (x - 10000*y - 100*m);
 4376     guint32 ed = epoch_day_from_ymd(y, m, d);
 4377 
 4378     if (ed <= 0) {
 4379         gretl_errmsg_sprintf("'%.8g' is not a valid date", x);
 4380         err = E_DATA;
 4381     } else if (calendar_data(jr->l_dset)) {
 4382         /* note: no need to go via struct tm */
 4383         jr->rows[j].n_keys = 1;
 4384         jr->rows[j].keyval = ed;
 4385         jr->rows[j].keyval2 = 0;
 4386         jr->rows[j].micro = 0;
 4387     } else {
 4388         struct tm t = {0};
 4389 
 4390         t.tm_year = y - 1900;
 4391         t.tm_mon = m - 1;
 4392         t.tm_mday = d;
 4393         err = real_set_outer_auto_keys(jr, NULL, j, &t);
 4394     }
 4395     }
 4396 
 4397     return err;
 4398 }
 4399 
 4400 /* Evaluate the filter expression provided by the user, and if it
 4401    works OK count the number of rows on which the filter returns
 4402    non-zero.  Flag an error if the filter gives NA on any row, since
 4403    it is then indeterminate.
 4404 */
 4405 
 4406 static int evaluate_filter (jr_filter *filter, DATASET *r_dset,
 4407                 int *nrows)
 4408 {
 4409     char *line;
 4410     int i, err = 0;
 4411 
 4412     line = gretl_strdup_printf("filtered__=%s", filter->expr);
 4413     if (line == NULL) {
 4414     err = E_ALLOC;
 4415     } else {
 4416     err = generate(line, r_dset, GRETL_TYPE_SERIES,
 4417                OPT_P | OPT_Q, NULL);
 4418     }
 4419 
 4420     if (!err) {
 4421     int v = r_dset->v - 1;
 4422 
 4423     filter->val = r_dset->Z[v];
 4424     *nrows = 0;
 4425 
 4426 #if CDEBUG > 1
 4427     fprintf(stderr, "filter genr: '%s':\n", line);
 4428     for (i=0; i<r_dset->n; i++) {
 4429         fprintf(stderr, " %d: %g\n", i+1, filter->val[i]);
 4430     }
 4431 #endif
 4432     for (i=0; i<r_dset->n; i++) {
 4433         if (na(filter->val[i])) {
 4434         gretl_errmsg_sprintf("join filter: indeterminate "
 4435                      "value on row %d", i+1);
 4436         err = E_MISSDATA;
 4437         break;
 4438         } else if (filter->val[i] != 0.0) {
 4439         *nrows += 1;
 4440         }
 4441     }
 4442     }
 4443 
 4444     free(line);
 4445 
 4446     return err;
 4447 }
 4448 
 4449 static keynum dtoll (double x, int *err)
 4450 {
 4451     if (na(x)) {
 4452     *err = E_DATA;
 4453     return -1;
 4454     } else {
 4455     return x;
 4456     }
 4457 }
 4458 
 4459 static keynum dtoll_full (double x, int key, int row, int *err)
 4460 {
 4461     if (na(x)) {
 4462     if (key == 2) {
 4463         gretl_errmsg_sprintf("%s: invalid secondary outer key value on row %d",
 4464                  "join", row);
 4465     } else {
 4466         gretl_errmsg_sprintf("%s: invalid (primary) outer key value on row %d",
 4467                  "join", row);
 4468     }
 4469     *err = E_DATA;
 4470     return -1;
 4471     } else {
 4472     return x;
 4473     }
 4474 }
 4475 
 4476 /* Determine whether or not row @i of the outer data satisfies the
 4477    filter criterion; return 1 if the condition is met, 0 otherwise.
 4478 */
 4479 
 4480 static int join_row_wanted (jr_filter *filter, int i)
 4481 {
 4482     int ret = filter->val[i] != 0;
 4483 
 4484 #if CDEBUG > 2
 4485     fprintf(stderr, "join filter: %s row %d\n",
 4486         ret ? "keeping" : "discarding", i);
 4487 #endif
 4488 
 4489     return ret;
 4490 }
 4491 
 4492 static DATASET *outer_dataset (joinspec *jspec)
 4493 {
 4494     if (jspec->c != NULL) {
 4495     return jspec->c->dset;
 4496     } else {
 4497     return jspec->dset;
 4498     }
 4499 }
 4500 
 4501 #define using_auto_keys(j) (j->auto_keys->timefmt != NULL)
 4502 
 4503 static joiner *build_joiner (joinspec *jspec,
 4504                  DATASET *l_dset,
 4505                  jr_filter *filter,
 4506                  AggrType aggr,
 4507                  int seqval,
 4508                  obskey *auto_keys,
 4509                  int *err)
 4510 {
 4511     joiner *jr = NULL;
 4512     DATASET *r_dset = outer_dataset(jspec);
 4513     int keycol  = jspec->colnums[JOIN_KEY];
 4514     int valcol  = jspec->colnums[JOIN_TARG];
 4515     int key2col = jspec->colnums[JOIN_KEY2];
 4516     int auxcol  = jspec->colnums[JOIN_AUX];
 4517     int i, nrows = r_dset->n;
 4518 
 4519 #if CDEBUG
 4520     fprintf(stderr, "joiner columns:\n"
 4521         "KEY=%d, VAL=%d, F1=%d, F2=%d, F3=%d, KEY2=%d, AUX=%d\n",
 4522         keycol, valcol, jspec->colnums[JOIN_F1],
 4523         jspec->colnums[JOIN_F2], jspec->colnums[JOIN_F3],
 4524         key2col, auxcol);
 4525 #endif
 4526 
 4527     if (filter != NULL) {
 4528     *err = evaluate_filter(filter, r_dset, &nrows);
 4529     if (*err) {
 4530         return NULL;
 4531     } else if (nrows == 0) {
 4532         gretl_warnmsg_set(_("No matching data after filtering"));
 4533         return NULL;
 4534     }
 4535     }
 4536 
 4537 #if CDEBUG
 4538     fprintf(stderr, "after filtering: dset->n = %d, nrows = %d\n",
 4539         r_dset->n, nrows);
 4540 #endif
 4541 
 4542     jr = joiner_new(nrows);
 4543 
 4544     if (jr == NULL) {
 4545     *err = E_ALLOC;
 4546     } else {
 4547     double **Z = r_dset->Z;
 4548     int use_iso_basic = 0;
 4549     int j = 0;
 4550 
 4551     jr->aggr = aggr;
 4552     jr->seqval = seqval;
 4553     jr->auxcol = auxcol;
 4554     jr->l_dset = l_dset;
 4555     jr->r_dset = r_dset;
 4556     jr->auto_keys = auto_keys;
 4557     jr->midas_m = 0;
 4558 
 4559     if (using_auto_keys(jr)) {
 4560         /* check for the case where the outer time-key
 4561            column is in the "tconvert" set: if so we
 4562            know it will be in YYYYMMDD format and we'll
 4563            give it special treatment
 4564         */
 4565         int tcol = jr->auto_keys->keycol;
 4566 
 4567         if (tcol > 0 && jr->auto_keys->numdates) {
 4568         if (column_is_timecol(jr->r_dset->varname[tcol])) {
 4569             use_iso_basic = 1;
 4570         }
 4571         }
 4572     }
 4573 
 4574     /* Now transcribe the data we want: we're pulling from the
 4575        full outer dataset and writing into the array of joiner row
 4576        structs. At this point we're applying the join filter (if
 4577        any) but are not doing any matching by key to the inner
 4578        dataset.
 4579     */
 4580 
 4581     for (i=0; i<r_dset->n && !*err; i++) {
 4582         if (filter != NULL && !join_row_wanted(filter, i)) {
 4583         continue;
 4584         }
 4585         /* the keys */
 4586         if (use_iso_basic) {
 4587         *err = read_iso_basic(jr, j, i);
 4588         } else if (using_auto_keys(jr)) {
 4589         *err = read_outer_auto_keys(jr, j, i);
 4590         } else if (keycol > 0) {
 4591         jr->rows[j].keyval = dtoll_full(Z[keycol][i], 1, i+1, err);
 4592         if (!*err && key2col > 0) {
 4593             /* double key */
 4594             jr->rows[j].n_keys = 2;
 4595             jr->rows[j].keyval2 = dtoll_full(Z[key2col][i], 2, i+1, err);
 4596         } else {
 4597             /* single key */
 4598             jr->rows[j].n_keys = 1;
 4599             jr->rows[j].keyval2 = 0;
 4600         }
 4601         } else {
 4602         /* no keys have been specified */
 4603         jr->rows[j].n_keys = 0;
 4604         jr->rows[j].keyval = 0;
 4605         jr->rows[j].keyval2 = 0;
 4606         }
 4607         /* "payload" data: record the dataset row */
 4608         jr->rows[j].dset_row = valcol > 0 ? i : -1;
 4609         /* the auxiliary data */
 4610         jr->rows[j].aux = auxcol > 0 ? Z[auxcol][i] : 0;
 4611         j++;
 4612     }
 4613     }
 4614 
 4615     return jr;
 4616 }
 4617 
 4618 /* qsort callback for sorting rows of the joiner struct */
 4619 
 4620 static int compare_jr_rows (const void *a, const void *b)
 4621 {
 4622     const jr_row *ra = a;
 4623     const jr_row *rb = b;
 4624     int ret;
 4625 
 4626     ret = (ra->keyval > rb->keyval) - (ra->keyval < rb->keyval);
 4627 
 4628     if (ret == 0 && ra->n_keys > 1) {
 4629     ret = (ra->keyval2 > rb->keyval2) - (ra->keyval2 < rb->keyval2);
 4630     }
 4631 
 4632     if (ret == 0) {
 4633     /* ensure stable sort */
 4634     ret = a - b > 0 ? 1 : -1;
 4635     }
 4636 
 4637     return ret;
 4638 }
 4639 
 4640 /* Sort the rows of the joiner struct, by either one or two keys, then
 4641    figure out how many unique (primary) key values we have and
 4642    construct (a) an array of frequency of occurrence of these values
 4643    and (b) an array which records the first row of the joiner on
 4644    which each of these values is found.
 4645 */
 4646 
 4647 static int joiner_sort (joiner *jr)
 4648 {
 4649     int matches = jr->n_rows;
 4650     int i, err = 0;
 4651 
 4652     /* If there are string keys, we begin by mapping from the string
 4653        indices on the right -- held in the keyval and/or keyval2
 4654        members of the each joiner row -- to the indices for the same
 4655        strings on the left. This enables us to avoid doing string
 4656        comparisons when running aggr_value() later; we can just
 4657        compare the indices of the strings. In addition, if on any
 4658        given row we get no match for the right-hand key string on the
 4659        left (signalled by a strmap value of -1) we can exploit this
 4660        information by shuffling such rows to the end of the joiner
 4661        rectangle and ignoring them when aggregating.
 4662     */
 4663 
 4664     if (jr->str_keys[0] || jr->str_keys[1]) {
 4665     series_table *stl, *str;
 4666     int *strmap;
 4667     int k, kmin, kmax, lkeyval, rkeyval;
 4668 
 4669     kmin = jr->str_keys[0] ? 1 : 2;
 4670     kmax = jr->str_keys[1] ? 2 : 1;
 4671 
 4672     for (k=kmin; k<=kmax; k++) {
 4673         stl = series_get_string_table(jr->l_dset, jr->l_keyno[k]);
 4674         str = series_get_string_table(jr->r_dset, jr->r_keyno[k]);
 4675         strmap = series_table_map(str, stl);
 4676 
 4677         if (strmap == NULL) {
 4678         err = E_ALLOC;
 4679         break;
 4680         }
 4681 
 4682         for (i=0; i<jr->n_rows; i++) {
 4683         if (k == 1) {
 4684             rkeyval = jr->rows[i].keyval;
 4685         } else if (jr->rows[i].keyval == INT_MAX) {
 4686             continue;
 4687         } else {
 4688             rkeyval = jr->rows[i].keyval2;
 4689         }
 4690         lkeyval = strmap[rkeyval];
 4691 #if CDEBUG > 1
 4692         fprintf(stderr, "k = %d, row %d, keyval: %d -> %d\n", k, i, rkeyval, lkeyval);
 4693 #endif
 4694         if (lkeyval > 0) {
 4695             if (k == 1) {
 4696             jr->rows[i].keyval = lkeyval;
 4697             } else {
 4698             jr->rows[i].keyval2 = lkeyval;
 4699             }
 4700         } else {
 4701             /* arrange for qsort to move row to end */
 4702             jr->rows[i].keyval = G_MAXDOUBLE;
 4703             matches--;
 4704         }
 4705         }
 4706 
 4707         free(strmap);
 4708     }
 4709     }
 4710 
 4711     if (err) {
 4712     return err;
 4713     }
 4714 
 4715     qsort(jr->rows, jr->n_rows, sizeof *jr->rows, compare_jr_rows);
 4716 
 4717     if (matches < jr->n_rows) {
 4718     jr->n_rows = matches;
 4719     }
 4720 
 4721     jr->n_unique = 1;
 4722     for (i=1; i<jr->n_rows; i++) {
 4723     if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
 4724         jr->n_unique += 1;
 4725     }
 4726     }
 4727 
 4728     jr->keys = malloc(jr->n_unique * sizeof *jr->keys);
 4729     jr->key_freq = malloc(jr->n_unique * sizeof *jr->key_freq);
 4730     jr->key_row = malloc(jr->n_unique * sizeof *jr->key_row);
 4731 
 4732     if (jr->keys == NULL || jr->key_freq == NULL || jr->key_row == NULL) {
 4733     err = E_ALLOC;
 4734     } else {
 4735     int j = 0, nj = 1;
 4736 
 4737     for (i=0; i<jr->n_unique; i++) {
 4738         jr->key_freq[i] = 0;
 4739     }
 4740 
 4741     jr->keys[0] = jr->rows[0].keyval;
 4742     jr->key_row[0] = 0;
 4743 
 4744     for (i=1; i<jr->n_rows; i++) {
 4745         if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
 4746         /* finalize info for key j */
 4747         jr->keys[j] = jr->rows[i-1].keyval;
 4748         jr->key_freq[j] = nj;
 4749         /* and initialize for next key */
 4750         nj = 1;
 4751         if (j < jr->n_unique - 1) {
 4752             jr->key_row[j+1] = i;
 4753         }
 4754         j++;
 4755         } else {
 4756         nj++;
 4757         }
 4758     }
 4759 
 4760     /* make sure the last row is right */
 4761     jr->keys[j] = jr->rows[i-1].keyval;
 4762     jr->key_freq[j] = nj;
 4763     }
 4764 
 4765     return err;
 4766 }
 4767 
 4768 #if CDEBUG > 1
 4769 
 4770 static void joiner_print (joiner *jr)
 4771 {
 4772     char **labels = NULL;
 4773     jr_row *row;
 4774     int i;
 4775 
 4776     if (jr->str_keys[0]) {
 4777     labels = series_get_string_vals(jr->l_dset, jr->l_keyno[1], NULL, 0);
 4778     }
 4779 
 4780     fprintf(stderr, "\njoiner: n_rows = %d\n", jr->n_rows);
 4781     for (i=0; i<jr->n_rows; i++) {
 4782     row = &jr->rows[i];
 4783     if (row->n_keys > 1) {
 4784         fprintf(stderr, " row %d: keyvals=(%" KEYNUM_FMT ",%" KEYNUM_FMT ")\n",
 4785             i, row->keyval, row->keyval2);
 4786     } else {
 4787         if (jr->str_keys[0] && row->keyval >= 0) {
 4788         fprintf(stderr, " row %d: keyval=%" KEYNUM_FMT "(%s)\n",
 4789             i, row->keyval, labels[row->keyval - 1]);
 4790         } else {
 4791         fprintf(stderr, " row %d: keyval=%" KEYNUM_FMT "\n",
 4792             i, row->keyval);
 4793         }
 4794     }
 4795     }
 4796 
 4797     if (jr->keys != NULL) {
 4798     fprintf(stderr, " for primary key: n_unique = %d\n", jr->n_unique);
 4799     for (i=0; i<jr->n_unique; i++) {
 4800         fprintf(stderr,"  key value %" KEYNUM_FMT ": count = %d\n",
 4801             jr->keys[i], jr->key_freq[i]);
 4802     }
 4803     }
 4804 }
 4805 
 4806 static void print_outer_dataset (const DATASET *dset, const char *fname)
 4807 {
 4808     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
 4809 
 4810     pprintf(prn, "Data extracted from %s:\n", fname);
 4811     printdata(NULL, NULL, dset, OPT_O, prn);
 4812     gretl_print_destroy(prn);
 4813 }
 4814 
 4815 #endif
 4816 
 4817 static int seqval_out_of_bounds (joiner *jr, int seqmax)
 4818 {
 4819     if (jr->seqval < 0) {
 4820     /* counting down from last match */
 4821     return -jr->seqval > seqmax;
 4822     } else {
 4823     /* counting up from first match */
 4824     return jr->seqval > seqmax;
 4825     }
 4826 }
 4827 
 4828 /* Do a binary search for the left-hand key value @targ in the sorted
 4829    array of unique right-hand key values, @vals; return the position
 4830    among @vals at which @targ matches, or -1 for no match.
 4831 */
 4832 
 4833 static int binsearch (keynum targ, const keynum *vals, int n, int offset)
 4834 {
 4835     int m = n/2;
 4836 
 4837     if (fabs((targ) - (vals[m])) < 1.0e-7) {
 4838     return m + offset;
 4839     } else if (targ < vals[0] || targ > vals[n-1]) {
 4840     return -1;
 4841     } else if (targ < vals[m]) {
 4842     return binsearch(targ, vals, m, offset);
 4843     } else {
 4844     return binsearch(targ, vals + m, n - m, offset + m);
 4845     }
 4846 }
 4847 
 4848 /* In some cases we can figure out what aggr_value() should return
 4849    just based on the number of matches, @n, and the characteristics
 4850    of the joiner. If so, write the value into @x and return 1; if
 4851    not, return 0.
 4852 */
 4853 
 4854 static int aggr_val_determined (joiner *jr, int n, double *x, int *err)
 4855 {
 4856     if (jr->aggr == AGGR_COUNT) {
 4857     /* just return the number of matches */
 4858     *x = n;
 4859     return 1;
 4860     } else if (jr->aggr == AGGR_SEQ && seqval_out_of_bounds(jr, n)) {
 4861     /* out of bounds sequence index: return NA */
 4862     *x = NADBL;
 4863     return 1;
 4864     } else if (n > 1 && jr->aggr == AGGR_NONE) {
 4865     /* fail */
 4866 #if AGGDEBUG
 4867     fprintf(stderr, "aggr_val_determined(): got n=%d\n", n);
 4868 #endif
 4869     *err = E_DATA;
 4870     gretl_errmsg_set(_("You need to specify an aggregation "
 4871                "method for a 1:n join"));
 4872     *x = NADBL;
 4873     return 1;
 4874     } else {
 4875     /* not enough information so far */
 4876     return 0;
 4877     }
 4878 }
 4879 
 4880 /* get month-day index from @dset time-series info */
 4881 
 4882 static int midas_day_index (int t, DATASET *dset)
 4883 {
 4884     char obs[OBSLEN];
 4885     int y, m, d, idx = -1;
 4886 
 4887     ntodate(obs, t, dset);
 4888     if (sscanf(obs, YMD_READ_FMT, &y, &m, &d) == 3) {
 4889     idx = month_day_index(y, m, d, dset->pd);
 4890     }
 4891 
 4892     return idx;
 4893 }
 4894 
 4895 #define midas_daily(j) (j->midas_m > 20)
 4896 
 4897 #define min_max_cond(x,y,a) ((a==AGGR_MAX && x>y) || (a==AGGR_MIN && x<y))
 4898 
 4899 /* aggr_value: here we're working on a given row of the left-hand
 4900    dataset. The values @key and (if applicable) @key2 are the
 4901    left-hand keys for this row. We count the key-matches on the
 4902    right and apply an aggregation procedure if the user specified
 4903    one. We return the value that should be entered for the imported
 4904    series on this row.
 4905 
 4906    Note: @xmatch and @auxmatch are workspace arrays allocated by
 4907    the caller.
 4908 */
 4909 
 4910 static double aggr_value (joiner *jr,
 4911               keynum key1,
 4912               keynum key2,
 4913               int v, int revseq,
 4914               double *xmatch,
 4915               double *auxmatch,
 4916               int *nomatch,
 4917               int *err)
 4918 {
 4919     double x, xa;
 4920     int imin, imax, pos;
 4921     int i, n, ntotal;
 4922 
 4923     /* find the position of the inner (primary) key in the
 4924        array of unique outer key values */
 4925     pos = binsearch(key1, jr->keys, jr->n_unique, 0);
 4926 
 4927 #if AGGDEBUG
 4928     if (pos < 0) {
 4929     fprintf(stderr, " key1 = " KEYNUM_FMT ": no match\n", key1);
 4930     } else {
 4931     fprintf(stderr, " key1 = " KEYNUM_FMT ": matched at position %d\n", key1, pos);
 4932     }
 4933 #endif
 4934 
 4935     if (pos < 0) {
 4936     /* (primary) inner key value not found */
 4937     *nomatch = 1;
 4938     return jr->aggr == AGGR_COUNT ? 0 : NADBL;
 4939     }
 4940 
 4941     /* how many matches at @pos? (must be at least 1 unless
 4942        something very bad has happened)
 4943     */
 4944     n = jr->key_freq[pos];
 4945 
 4946 #if AGGDEBUG
 4947     fprintf(stderr, "  number of primary matches = %d\n", n);
 4948 #endif
 4949 
 4950     if (jr->n_keys == 1) {
 4951     /* if there's just a single key, we can figure some
 4952        cases out already */
 4953     if (aggr_val_determined(jr, n, &x, err)) {
 4954         return x;
 4955     }
 4956     }
 4957 
 4958     if (jr->key_row[pos] < 0) {
 4959     /* "can't happen" */
 4960     return NADBL;
 4961     }
 4962 
 4963     /* set the range of rows for reading from the joiner rectangle */
 4964     imin = jr->key_row[pos];
 4965     imax = imin + n;
 4966 
 4967 #if AGGDEBUG
 4968     fprintf(stderr, "  aggregation row range: %d to %d\n", imin+1, imax);
 4969 #endif
 4970 
 4971     if (jr->aggr == AGGR_MIDAS) {
 4972     /* special case: MIDAS "spreading" */
 4973     int daily = dated_daily_data(jr->r_dset);
 4974     int gotit = 0;
 4975 
 4976     x = NADBL;
 4977 
 4978     for (i=imin; i<imax && !gotit; i++) {
 4979         /* loop across primary key matches */
 4980         jr_row *r = &jr->rows[i];
 4981 
 4982         if (jr->n_keys == 1 || key2 == r->keyval2) {
 4983         /* got secondary key match */
 4984         int sub, t = r->dset_row;
 4985 #if AGGDEBUG
 4986         fprintf(stderr, "  i=%d: 2-key match: %d,%d (revseq=%d)\n",
 4987             i, (int) key1, (int) key2, revseq);
 4988 #endif
 4989         if (daily) {
 4990             /* outer dataset has known daily structure */
 4991             sub = midas_day_index(t, jr->r_dset);
 4992             gotit = sub == revseq;
 4993         } else if (midas_daily(jr) && r->micro > 0) {
 4994             /* "other" daily data: r->micro holds day */
 4995             sub = month_day_index((int) key1, (int) key2,
 4996                       r->micro, jr->midas_pd);
 4997             gotit = sub == revseq;
 4998         } else {
 4999             if (r->micro > 0) {
 5000             /* if present, this is derived from the outer
 5001                time-key specification
 5002             */
 5003             sub = r->micro;
 5004             } else {
 5005             date_maj_min(t, jr->r_dset, NULL, &sub);
 5006             }
 5007             gotit = (sub - 1) % jr->midas_m + 1 == revseq;
 5008         }
 5009         if (gotit) {
 5010             x = jr->r_dset->Z[v][t];
 5011         }
 5012         }
 5013     }
 5014 
 5015     /* and we're done */
 5016     return x;
 5017     }
 5018 
 5019     /* We now fill out the array @xmatch with non-missing values
 5020        from the matching outer rows. If we have a secondary key
 5021        we screen for matches on that as we go.
 5022     */
 5023 
 5024     n = 0;      /* will now hold count of non-NA matches */
 5025     ntotal = 0; /* will ignore the OK/NA distinction */
 5026 
 5027     for (i=imin; i<imax; i++) {
 5028     jr_row *r = &jr->rows[i];
 5029 
 5030     if (jr->n_keys == 1 || key2 == r->keyval2) {
 5031         ntotal++;
 5032         x = jr->r_dset->Z[v][r->dset_row];
 5033         if (jr->auxcol) {
 5034         xa = r->aux;
 5035         if (!na(x) && na(xa)) {
 5036             /* we can't know the min/max of the aux var */
 5037             *err = E_MISSDATA;
 5038             return NADBL;
 5039         }
 5040         if (!na(xa)) {
 5041             auxmatch[n] = xa;
 5042             xmatch[n++] = x;
 5043         }
 5044         } else if (!na(x)) {
 5045         xmatch[n++] = x;
 5046         }
 5047     }
 5048     }
 5049 
 5050     if (jr->n_keys > 1) {
 5051     /* we've already checked this for the 1-key case */
 5052     if (aggr_val_determined(jr, n, &x, err)) {
 5053         return x;
 5054     }
 5055     }
 5056 
 5057     x = NADBL;
 5058 
 5059     if (n == 0) {
 5060     ; /* all matched observations are NA */
 5061     } else if (jr->aggr == AGGR_NONE) {
 5062     x = xmatch[0];
 5063     } else if (jr->aggr == AGGR_SEQ) {
 5064     int sval = jr->seqval;
 5065 
 5066     i = sval < 0 ? n + sval : sval - 1;
 5067     if (i >= 0 && i < n) {
 5068         x = xmatch[i];
 5069     }
 5070     } else if (jr->aggr == AGGR_MAX || jr->aggr == AGGR_MIN) {
 5071     if (jr->auxcol) {
 5072         /* using the max/min of an auxiliary var */
 5073         int idx = 0;
 5074 
 5075         x = auxmatch[0];
 5076         for (i=1; i<n; i++) {
 5077         if (min_max_cond(auxmatch[i], x, jr->aggr)) {
 5078             x = auxmatch[i];
 5079             idx = i;
 5080         }
 5081         }
 5082         x = xmatch[idx];
 5083     } else {
 5084         /* max/min of the actual data */
 5085         x = xmatch[0];
 5086         for (i=1; i<n; i++) {
 5087         if (min_max_cond(xmatch[i], x, jr->aggr)) {
 5088             x = xmatch[i];
 5089         }
 5090         }
 5091     }
 5092     } else if (jr->aggr == AGGR_SUM || jr->aggr == AGGR_AVG) {
 5093     x = 0.0;
 5094     for (i=0; i<n; i++) {
 5095         x += xmatch[i];
 5096     }
 5097     if (jr->aggr == AGGR_AVG) {
 5098         x /= n;
 5099     }
 5100     }
 5101 
 5102     return x;
 5103 }
 5104 
 5105 /* Handle the case where (a) the value from the right, @rz, is
 5106    actually the coding of a string value, and (b) the LHS series is
 5107    pre-existing and already has a string table attached. The RHS
 5108    coding must be made consistent with that on the left. We reach this
 5109    function only if we've verified that there are string tables on
 5110    both sides, and that @rz is not NA.
 5111 */
 5112 
 5113 static double maybe_adjust_string_code (series_table *rst,
 5114                     series_table *lst,
 5115                     double rz, int *err)
 5116 {
 5117     const char *rstr = series_table_get_string(rst, rz);
 5118