"Fossies" - the Fresh Open Source Software Archive

Member "dateutils-0.4.6/src/prchunk.c" (19 Mar 2019, 11542 Bytes) of package /linux/privat/dateutils-0.4.6.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "prchunk.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 0.4.5_vs_0.4.6.

    1 /*** prchunk.c -- guessing line oriented data formats
    2  *
    3  * Copyright (C) 2010-2019 Sebastian Freundt
    4  *
    5  * Author:  Sebastian Freundt <freundt@ga-group.nl>
    6  *
    7  * This file is part of uterus.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  *
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  *
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * 3. Neither the name of the author nor the names of any contributors
   21  *    may be used to endorse or promote products derived from this
   22  *    software without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
   25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   26  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   27  * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   30  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
   31  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   32  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
   33  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
   34  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   35  *
   36  ***/
   37 #define PRCHUNK_C
   38 #if defined HAVE_CONFIG_H
   39 # include "config.h"
   40 #endif  /* HAVE_CONFIG_H */
   41 #if defined MAP_ANON_NEEDS_DARWIN_SOURCE
   42 # define _DARWIN_C_SOURCE
   43 #endif  /* MAP_ANON_NEEDS_DARWIN_SOURCE */
   44 #if defined MAP_ANON_NEEDS_ALL_SOURCE
   45 # define _ALL_SOURCE
   46 #endif  /* MAP_ANON_NEEDS_ALL_SOURCE */
   47 #include <stddef.h>
   48 #include <stdlib.h>
   49 #include <stdio.h>
   50 #include <unistd.h>
   51 #include <fcntl.h>
   52 #include <string.h>
   53 #include <stdint.h>
   54 #include <sys/mman.h>
   55 #include <stdarg.h>
   56 #include <errno.h>
   57 
   58 #include "nifty.h"
   59 #include "prchunk.h"
   60 
   61 #define MAX_NLINES  (16384)
   62 #define MAX_LLEN    (1024)
   63 
   64 #if !defined MAP_ANONYMOUS && defined MAP_ANON
   65 # define MAP_ANONYMOUS  (MAP_ANON)
   66 #elif !defined MAP_ANON
   67 # define MAP_ANON   (0x1000U)
   68 #endif  /* MAP_ANON->MAP_ANONYMOUS */
   69 
   70 #if defined __INTEL_COMPILER
   71 # pragma warning(disable: 981)
   72 #endif  /* __INTEL_COMPILER */
   73 
   74 typedef uint32_t off32_t;
   75 typedef uint16_t off16_t;
   76 
   77 struct prch_ctx_s {
   78     /* file descriptor */
   79     int fd;
   80     /* buffer */
   81     char *buf;
   82     /* number of lines in the buffer */
   83     uint32_t tot_lno;
   84     /* number of columns per line */
   85     uint32_t tot_cno;
   86     /* number of bytes in the buffer */
   87     size_t bno;
   88     /* last known offset */
   89     size_t off;
   90     /* offsets */
   91     off32_t loff[MAX_NLINES];
   92     off32_t cur_lno;
   93     /* delimiter offsets */
   94     off16_t *soff;
   95 };
   96 
   97 
   98 /* error() impl */
   99 static void
  100 __attribute__((format(printf, 2, 3)))
  101 error(int eno, const char *fmt, ...)
  102 {
  103     va_list vap;
  104     va_start(vap, fmt);
  105     fputs("prchunk: ", stderr);
  106     vfprintf(stderr, fmt, vap);
  107     va_end(vap);
  108     if (eno) {
  109         fputc(':', stderr);
  110         fputc(' ', stderr);
  111         fputs(strerror(eno), stderr);
  112     }
  113     fputc('\n', stderr);
  114     return;
  115 }
  116 
  117 static inline void
  118 set_loff(prch_ctx_t ctx, uint32_t lno, off32_t off)
  119 {
  120     ctx->loff[lno] = off;
  121     ctx->loff[lno] <<= 1;
  122     return;
  123 }
  124 
  125 static inline off32_t
  126 get_loff(prch_ctx_t ctx, uint32_t lno)
  127 {
  128     off32_t res = ctx->loff[lno];
  129     return res >> 1;
  130 }
  131 
  132 /* return 0 if not \r terminated, 1 otherwise */
  133 static inline int
  134 lftermdp(prch_ctx_t ctx, uint32_t lno)
  135 {
  136     return ctx->loff[lno] & 1;
  137 }
  138 
  139 static inline void
  140 set_lftermd(prch_ctx_t ctx, uint32_t lno)
  141 {
  142     ctx->loff[lno] |= 1;
  143     return;
  144 }
  145 
  146 static inline size_t
  147 get_llen(prch_ctx_t ctx, uint32_t lno)
  148 {
  149     if (UNLIKELY(lno == 0)) {
  150         return get_loff(ctx, 0) - lftermdp(ctx, 0);
  151     }
  152     return get_loff(ctx, lno) -
  153         lftermdp(ctx, lno) -
  154         get_loff(ctx, lno - 1) - 1;
  155 }
  156 
  157 
  158 /* internal operations */
  159 FDEFU int
  160 prchunk_fill(prch_ctx_t ctx)
  161 {
  162 /* this is a coroutine consisting of a line counter yielding the number of
  163  * lines read so far and a reader yielding a buffer fill and the number of
  164  * bytes read */
  165 #define CHUNK_SIZE  (4096)
  166 #define YIELD(x)    goto yield##x
  167     char *off = ctx->buf + 0;
  168     char *bno = ctx->buf + ctx->bno;
  169     ssize_t nrd;
  170 
  171     /* initial work, reset the line counters et al */
  172     ctx->tot_lno = 0;
  173     /* we just memcpy() the left over stuff to the front and restart
  174      * from there, someone left us a note in __ctx with the left
  175      * over offset */
  176     /* normally we'd use memmove() but we know there's little chance
  177      * for overlapping regions */
  178     if (UNLIKELY(ctx->bno == 0)) {
  179         /* do nothing */
  180         ;
  181     } else if (LIKELY(ctx->bno > ctx->off)) {
  182         size_t rsz = ctx->bno - ctx->off;
  183         /* move the top RSZ bytes to the beginning */
  184         memcpy(ctx->buf, ctx->buf + ctx->off, rsz);
  185         ctx->bno = rsz;
  186         bno = ctx->buf + rsz;
  187     } else if (UNLIKELY(ctx->bno == ctx->off)) {
  188         /* what are the odds? just reset the counters */
  189         ctx->bno = 0;
  190         bno = ctx->buf;
  191     } else {
  192         /* the user didn't see the end of the file */
  193         return -1;
  194     }
  195 
  196 yield1:
  197     /* read CHUNK_SIZE bytes */
  198     bno += (nrd = read(ctx->fd, bno, CHUNK_SIZE));
  199     /* if we came from yield2 then off == __ctx->bno, and if we
  200      * read 0 or less bytes then off >= __ctx->bno + nrd, so we
  201      * can simply use that compact expression if the buffer has no
  202      * more input.
  203      * On the contrary if we came from the outside, i.e. fill_buffer()
  204      * has been called, then off would be 0 and __ctx->bno would be
  205      * the buffer filled so far, if no more bytes could be read then
  206      * we'd proceed processing them (off < __ctx->bno + nrd */
  207     if (UNLIKELY(!nrd && off < bno && ctx->cur_lno <= ctx->tot_lno)) {
  208         /* last line then, unyielded :| */
  209         set_loff(ctx, 0, bno - ctx->buf);
  210         YIELD(4);
  211     } else if (UNLIKELY(nrd <= 0 && off == ctx->buf)) {
  212         /* special case, we worked our arses off and nothing's
  213          * in the pipe line so just fuck off here */
  214         return -1;
  215     } else if (LIKELY(off < bno || off == ctx->buf)) {
  216         YIELD(2);
  217     }
  218     /* proceed to exit */
  219     YIELD(3);
  220 yield2:
  221     while (off < bno) {
  222         size_t rsz = bno - off;
  223         char *p = memchr(off, '\n', rsz);
  224         if (UNLIKELY(p == NULL)) {
  225             if (LIKELY(nrd > 0)) {
  226                 break;
  227             }
  228             /* fucking idiots didnt conclude with a \n */
  229             error(0, "ID:10T error");
  230             p = bno;
  231         }
  232         /* massage our status structures */
  233         set_loff(ctx, ctx->tot_lno, p - ctx->buf);
  234         if (UNLIKELY(p[-1] == '\r')) {
  235             /* oh god, when is this nightmare gonna end */
  236             p[-1] = '\0';
  237             set_lftermd(ctx, ctx->tot_lno);
  238         }
  239         *p = '\0';
  240         off = ++p;
  241         /* count it as line and check if we need more */
  242         if (++ctx->tot_lno >= MAX_NLINES) {
  243             YIELD(3);
  244         }
  245     }
  246     YIELD(1);
  247 yield3:
  248     /* need clean up, something like unread(),
  249      * in particular leave a note in __ctx with the left over offset */
  250     ctx->cur_lno = 0;
  251 yield4:
  252     ctx->off = off - ctx->buf;
  253     ctx->bno = bno - ctx->buf;
  254 #undef YIELD
  255 #undef CHUNK_SIZE
  256     return 0;
  257 }
  258 
  259 
  260 /* public operations */
  261 FDEFU prch_ctx_t
  262 init_prchunk(int fd)
  263 {
  264 #define MAP_MEM     (MAP_ANON | MAP_PRIVATE)
  265 #define PROT_MEM    (PROT_READ | PROT_WRITE)
  266 #define MAP_LEN     (MAX_NLINES * MAX_LLEN)
  267     static struct prch_ctx_s __ctx;
  268 
  269     __ctx.buf = mmap(NULL, MAP_LEN, PROT_MEM, MAP_MEM, -1, 0);
  270     if (__ctx.buf == MAP_FAILED) {
  271         return NULL;
  272     }
  273 
  274     /* bit of space for the rechunker */
  275     __ctx.soff = mmap(NULL, MAP_LEN, PROT_MEM, MAP_MEM, -1, 0);
  276     if (__ctx.soff == MAP_FAILED) {
  277         return NULL;
  278     }
  279 
  280     if ((__ctx.fd = fd) > STDIN_FILENO) {
  281 #if defined POSIX_FADV_SEQUENTIAL
  282         /* give advice about our read pattern */
  283         int rc = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
  284 
  285         if (UNLIKELY(rc < 0)) {
  286             munmap(__ctx.soff, MAP_LEN);
  287             return NULL;
  288         }
  289 #endif  /* POSIX_FADV_SEQUENTIAL */
  290     }
  291     return &__ctx;
  292 }
  293 
  294 FDEFU void
  295 free_prchunk(prch_ctx_t ctx)
  296 {
  297     if (LIKELY(ctx->buf != NULL)) {
  298         munmap(ctx->buf, MAP_LEN);
  299         ctx->buf = NULL;
  300     }
  301     return;
  302 }
  303 
  304 
  305 /* accessors/iterators/et al. */
  306 FDEFU size_t
  307 prchunk_get_nlines(prch_ctx_t ctx)
  308 {
  309     return ctx->tot_lno;
  310 }
  311 
  312 FDEFU size_t
  313 prchunk_getlineno(prch_ctx_t ctx, char **p, int lno)
  314 {
  315     if (UNLIKELY(lno <= 0)) {
  316         *p = ctx->buf;
  317         return get_llen(ctx, 0);
  318     } else if (UNLIKELY((size_t)lno >= prchunk_get_nlines(ctx))) {
  319         *p = NULL;
  320         return 0;
  321     }
  322     /* likely case last, what bollocks */
  323     *p = ctx->buf + get_loff(ctx, lno - 1) + 1;
  324     return get_llen(ctx, lno);
  325 }
  326 
  327 FDEFU size_t
  328 prchunk_getline(prch_ctx_t ctx, char **p)
  329 {
  330     return prchunk_getlineno(ctx, p, ctx->cur_lno++);
  331 }
  332 
  333 FDEFU void
  334 prchunk_reset(prch_ctx_t ctx)
  335 {
  336     ctx->cur_lno = 0;
  337     return;
  338 }
  339 
  340 FDEFU int
  341 prchunk_haslinep(prch_ctx_t ctx)
  342 {
  343 /* the second condition is to allow unterminated last lines */
  344     return ctx->cur_lno < ctx->tot_lno || ctx->cur_lno == 0U;
  345 }
  346 
  347 
  348 static inline void
  349 set_ncols(prch_ctx_t ctx, size_t ncols)
  350 {
  351     ctx->tot_cno = ncols;
  352     return;
  353 }
  354 
  355 FDEFU size_t
  356 prchunk_get_ncols(prch_ctx_t ctx)
  357 {
  358     return ctx->tot_cno;
  359 }
  360 
  361 static inline void
  362 set_col_off(prch_ctx_t ctx, size_t lno, size_t cno, size_t off)
  363 {
  364     ctx->soff[lno * prchunk_get_ncols(ctx) + cno] = (off16_t)off;
  365     return;
  366 }
  367 
  368 static inline off16_t
  369 get_col_off(prch_ctx_t ctx, size_t lno, size_t cno)
  370 {
  371     return ctx->soff[lno * prchunk_get_ncols(ctx) + cno];
  372 }
  373 
  374 /* rechunker, chop the lines into smaller bits
  375  * Strategy is to go over all lines in the current chunk and
  376  * memchr() for the delimiter DELIM.
  377  * Store the offsets into __ctx->soff and bugger off leaving a \0
  378  * where the delimiter was. */
  379 FDEFU void
  380 prchunk_rechunk(prch_ctx_t ctx, char dlm, int ncols)
  381 {
  382 /* very naive implementation, we prefer prchunk_rechunk_by_dstfld()
  383  * where a distance histogram demarks possible places */
  384     size_t lno = 0;
  385     size_t cno = 0;
  386     char *line;
  387     char *off;
  388     char *p;
  389     char *bno = ctx->buf + ctx->off;
  390     size_t rsz;
  391 
  392     set_ncols(ctx, ncols);
  393     off = line = ctx->buf;
  394     rsz = bno - off;
  395     while ((p = memchr(off, dlm, rsz)) != NULL) {
  396         size_t co;
  397         size_t llen = get_llen(ctx, lno);
  398         while ((co = p - line) > llen) {
  399             /* last column offset equals the length of the line */
  400             set_col_off(ctx, lno, cno, llen);
  401             /* get the new line */
  402             line = ctx->buf + get_loff(ctx, lno++) + 1;
  403             cno = 0;
  404         }
  405         /* store the offset of the column within the line */
  406         set_col_off(ctx, lno, cno++, co);
  407         /* prepare the counters for the next round */
  408         *p = '\0';
  409         off = ++p;
  410         rsz = bno - off;
  411     }
  412     /* last column offset equals the length of the line */
  413     rsz = get_llen(ctx, lno);
  414     set_col_off(ctx, lno, cno, rsz);
  415     return;
  416 }
  417 
  418 FDEFU size_t
  419 prchunk_getcolno(prch_ctx_t ctx, char **p, int lno, int cno)
  420 {
  421     size_t co1, co2;
  422 
  423     if (UNLIKELY(cno < 0 || (size_t)cno >= prchunk_get_ncols(ctx))) {
  424         *p = NULL;
  425         return 0;
  426     }
  427     (void)prchunk_getlineno(ctx, p, lno);
  428     if (UNLIKELY(cno == 0)) {
  429         return get_col_off(ctx, lno, 0);
  430     }
  431     /* likely case last */
  432     co1 = get_col_off(ctx, lno, cno);
  433     co2 = get_col_off(ctx, lno, cno - 1);
  434     *p += co2 + 1;
  435     return co1 - co2 - 1;
  436 }
  437 
  438 
  439 #if defined STANDALONE
  440 int
  441 main(int argc, char *argv[])
  442 {
  443     int fd;
  444     prch_ctx_t ctx;
  445 
  446     if (argc <= 1) {
  447         fd = STDIN_FILENO;
  448     } else if ((fd = open(argv[1], O_RDONLY)) < 0) {
  449         return 1;
  450     }
  451     /* get our prchunk up n running */
  452     if ((ctx = init_prchunk(fd)) == NULL) {
  453         error(errno, "Error: ctx NULL");
  454         return 1;
  455     }
  456     /* fill the buffer */
  457     while (!(prchunk_fill(ctx) < 0)) {
  458         char *l[1];
  459         size_t llen;
  460         int i = 0;
  461 
  462         while ((llen = prchunk_getline(ctx, l))) {
  463             fprintf(stderr, "%d (%zu) %s\n", i++, llen, l[0]);
  464         }
  465     }
  466     /* and out */
  467     free_prchunk(ctx);
  468     close(fd);
  469     return 0;
  470 }
  471 #endif  /* STANDALONE */
  472 
  473 /* prchunk.c ends here */