"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/main/extractor_datasource.c" (30 Jan 2021, 34206 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "extractor_datasource.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.10_vs_1.11.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19  */
   20 /**
   21  * @file main/extractor_datasource.c
   22  * @brief random access and possibly decompression of data from buffer in memory or file on disk
   23  * @author Christian Grothoff
   24  */
   25 #include "platform.h"
   26 #include "extractor_logging.h"
   27 #include "extractor_datasource.h"
   28 
   29 #if HAVE_LIBBZ2
   30 #include <bzlib.h>
   31 #define MIN_BZ2_HEADER 4
   32 #ifndef MIN_COMPRESSED_HEADER
   33 #define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
   34 #endif
   35 #endif
   36 
   37 #if HAVE_ZLIB
   38 #include <zlib.h>
   39 #define MIN_ZLIB_HEADER 12
   40 #ifndef MIN_COMPRESSED_HEADER
   41 #define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
   42 #endif
   43 #endif
   44 
   45 #ifndef MIN_COMPRESSED_HEADER
   46 #define MIN_COMPRESSED_HEADER -1
   47 #endif
   48 
   49 #ifndef O_LARGEFILE
   50 #define O_LARGEFILE 0
   51 #endif
   52 
   53 /**
   54  * Maximum size of an IO buffer.
   55  */
   56 #define MAX_READ (4 * 1024 * 1024)
   57 
   58 /**
   59  * Data is read from the source and shoved into decompressor
   60  * in chunks this big.
   61  */
   62 #define COM_CHUNK_SIZE (16 * 1024)
   63 
   64 
/**
 * Enum with the various possible types of compression supported.
 * The decompressor dispatch functions in this file switch on these
 * values; anything other than ZLIB or BZ2 is rejected there.
 */
enum ExtractorCompressionType
{
  /**
   * We cannot tell from the data (header incomplete).
   */
  COMP_TYPE_UNDEFINED = -1,

  /**
   * Invalid header (likely uncompressed)
   */
  COMP_TYPE_INVALID = 0,

  /**
   * libz / gzip compression.
   */
  COMP_TYPE_ZLIB = 1,

  /**
   * bz2 compression
   */
  COMP_TYPE_BZ2 = 2
};
   90 
   91 
/**
 * Abstraction of the data source (file or a memory buffer)
 * for the decompressor.
 */
struct BufferedFileDataSource
{
  /**
   * Pointer to the bytes that reads are served from.  For a
   * memory-backed source this is the caller's buffer; for a
   * file-backed source it points at the internal IO buffer.
   */
  const void *data;

  /**
   * Internal IO buffer that file contents are read into.
   * NULL for a memory-backed source (then 'data' is used directly).
   */
  void *buffer;

  /**
   * Size of the file (or the data buffer)
   */
  uint64_t fsize;

  /**
   * Position of the buffer in the file.
   */
  uint64_t fpos;

  /**
   * Position within the buffer.  Our absolute offset in the file
   * is thus 'fpos + buffer_pos'.
   */
  size_t buffer_pos;

  /**
   * Number of valid bytes in the buffer (<= buffer_size)
   */
  size_t buffer_bytes;

  /**
   * Allocated size of the buffer
   */
  size_t buffer_size;

  /**
   * Descriptor of the file to read data from (may be -1)
   */
  int fd;

};
  141 
  142 
/**
 * An object from which uncompressed data can be read
 */
struct CompressedFileSource
{
  /**
   * The source of (compressed) data
   */
  struct BufferedFileDataSource *bfds;

  /**
   * Decompression target buffer.
   */
  char result[COM_CHUNK_SIZE];

  /**
   * Offset in 'result' of the next decompressed byte to hand out,
   * i.e. the byte that corresponds to 'fpos'.
   */
  size_t result_pos;

  /**
   * Size of the source, as given when the object was created
   */
  int64_t fsize;

  /**
   * Position within the (decompressed) source
   */
  int64_t fpos;

  /**
   * Total size of the uncompressed data. Remains -1 until
   * decompression is finished.
   */
  int64_t uncompressed_size;

#if HAVE_LIBBZ2
  /**
   * BZ2 stream object
   */
  bz_stream bstrm;
#endif

#if HAVE_ZLIB
  /**
   * ZLIB stream object
   */
  z_stream strm;

  /**
   * Length of gzip header (may be 0, in that case ZLIB parses the header)
   */
  int gzip_header_length;
#endif

  /**
   * The type of compression used in the source
   */
  enum ExtractorCompressionType compression_type;

};
  204 
  205 
  206 /**
  207  * Makes bfds seek to 'pos' and read a chunk of bytes there.
  208  * Changes bfds->fpos, bfds->buffer_bytes and bfds->buffer_pos.
  209  * Does almost nothing for memory-backed bfds.
  210  *
  211  * @param bfds bfds
  212  * @param pos position
  213  * @return 0 on success, -1 on error
  214  */
  215 static int
  216 bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds,
  217                           uint64_t pos)
  218 {
  219   int64_t position;
  220   ssize_t rd;
  221 
  222   if (pos > bfds->fsize)
  223   {
  224     LOG ("Invalid seek operation\n");
  225     return -1;   /* invalid */
  226   }
  227   if (NULL == bfds->buffer)
  228   {
  229     bfds->buffer_pos = pos;
  230     return 0;
  231   }
  232   position = (int64_t) lseek (bfds->fd, pos, SEEK_SET);
  233   if (position < 0)
  234   {
  235     LOG_STRERROR ("lseek");
  236     return -1;
  237   }
  238   bfds->fpos = position;
  239   bfds->buffer_pos = 0;
  240   rd = read (bfds->fd, bfds->buffer, bfds->buffer_size);
  241   if (rd < 0)
  242   {
  243     LOG_STRERROR ("read");
  244     return -1;
  245   }
  246   bfds->buffer_bytes = rd;
  247   return 0;
  248 }
  249 
  250 
  251 /**
  252  * Creates a bfds
  253  *
  254  * @param data data buffer to use as a source (NULL if fd != -1)
  255  * @param fd file descriptor to use as a source (-1 if data != NULL)
  256  * @param fsize size of the file (or the buffer)
  257  * @return newly allocated bfds
  258  */
  259 static struct BufferedFileDataSource *
  260 bfds_new (const void *data,
  261           int fd,
  262           int64_t fsize)
  263 {
  264   struct BufferedFileDataSource *result;
  265   size_t xtra;
  266 
  267   if (fsize > MAX_READ)
  268     xtra = MAX_READ;
  269   else
  270     xtra = (size_t) fsize;
  271   if ( (-1 == fd) && (NULL == data) )
  272   {
  273     LOG ("Invalid arguments\n");
  274     return NULL;
  275   }
  276   if ( (-1 != fd) && (NULL != data) )
  277     fd = -1; /* don't need fd */
  278   if (NULL != data)
  279     xtra = 0;
  280   if (NULL == (result = malloc (sizeof (struct BufferedFileDataSource) + xtra)))
  281   {
  282     LOG_STRERROR ("malloc");
  283     return NULL;
  284   }
  285   memset (result, 0, sizeof (struct BufferedFileDataSource));
  286   result->data = (NULL != data) ? data : &result[1];
  287   result->buffer = (NULL != data) ? NULL : &result[1];
  288   result->buffer_size = (NULL != data) ? fsize : xtra;
  289   result->buffer_bytes = (NULL != data) ? fsize : 0;
  290   result->fsize = fsize;
  291   result->fd = fd;
  292   bfds_pick_next_buffer_at (result, 0);
  293   return result;
  294 }
  295 
  296 
  297 /**
  298  * Unallocates bfds
  299  *
  300  * @param bfds bfds to deallocate
  301  */
  302 static void
  303 bfds_delete (struct BufferedFileDataSource *bfds)
  304 {
  305   free (bfds);
  306 }
  307 
  308 
  309 /**
  310  * Makes bfds seek to 'pos' in 'whence' mode.
  311  * Will try to seek within the buffer, will move the buffer location if
  312  * the seek request falls outside of the buffer range.
  313  *
  314  * @param bfds bfds
  315  * @param pos position to seek to
  316  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
  317  * @return new absolute position, -1 on error
  318  */
  319 static int64_t
  320 bfds_seek (struct BufferedFileDataSource *bfds,
  321            int64_t pos, int whence)
  322 {
  323   uint64_t npos;
  324   size_t nbpos;
  325 
  326   switch (whence)
  327   {
  328   case SEEK_CUR:
  329     npos = bfds->fpos + bfds->buffer_pos + pos;
  330     if (npos > bfds->fsize)
  331     {
  332       LOG ("Invalid seek operation to %lld from %llu (max is %llu)\n",
  333            (long long) pos,
  334            bfds->fpos + bfds->buffer_pos,
  335            (unsigned long long) bfds->fsize);
  336       return -1;
  337     }
  338     nbpos = bfds->buffer_pos + pos;
  339     if ( (NULL == bfds->buffer) ||
  340          (nbpos < bfds->buffer_bytes) )
  341     {
  342       bfds->buffer_pos = nbpos;
  343       return npos;
  344     }
  345     if (0 != bfds_pick_next_buffer_at (bfds,
  346                                        npos))
  347     {
  348       LOG ("seek operation failed\n");
  349       return -1;
  350     }
  351     return npos;
  352   case SEEK_END:
  353     if (pos > 0)
  354     {
  355       LOG ("Invalid seek operation\n");
  356       return -1;
  357     }
  358     if (bfds->fsize < -pos)
  359     {
  360       LOG ("Invalid seek operation\n");
  361       return -1;
  362     }
  363     pos = bfds->fsize + pos;
  364   /* fall-through! */
  365   case SEEK_SET:
  366     if (pos < 0)
  367     {
  368       LOG ("Invalid seek operation\n");
  369       return -1;
  370     }
  371     if (pos > bfds->fsize)
  372     {
  373       LOG ("Invalid seek operation (%lld > %llu) %d\n",
  374            (long long) pos,
  375            (unsigned long long) bfds->fsize,
  376            SEEK_SET == whence);
  377       return -1;
  378     }
  379     if ( (NULL == bfds->buffer) ||
  380          ( (bfds->fpos <= pos) &&
  381            (bfds->fpos + bfds->buffer_bytes > pos) ) )
  382     {
  383       bfds->buffer_pos = pos - bfds->fpos;
  384       return pos;
  385     }
  386     if (0 != bfds_pick_next_buffer_at (bfds, pos))
  387     {
  388       LOG ("seek operation failed\n");
  389       return -1;
  390     }
  391     ASSERT (pos == bfds->fpos + bfds->buffer_pos);
  392     return pos;
  393   }
  394   return -1;
  395 }
  396 
  397 
  398 /**
  399  * Fills 'buf_ptr' with a chunk of data. Will
  400  * fail if 'count' exceeds buffer size.
  401  *
  402  * @param bfds bfds
  403  * @param buf_ptr location to store data
  404  * @param count number of bytes to read
  405  * @return number of bytes (<= count) available at location pointed by buf_ptr,
  406  *         0 for end of stream, -1 on error
  407  */
  408 static ssize_t
  409 bfds_read (struct BufferedFileDataSource *bfds,
  410            void *buf_ptr,
  411            size_t count)
  412 {
  413   char *cbuf = buf_ptr;
  414   uint64_t old_off;
  415   size_t avail;
  416   size_t ret;
  417 
  418   old_off = bfds->fpos + bfds->buffer_pos;
  419   if (old_off == bfds->fsize)
  420     return 0; /* end of stream */
  421   ret = 0;
  422   while (count > 0)
  423   {
  424     if ( (bfds->buffer_bytes == bfds->buffer_pos) &&
  425          (0 != bfds_pick_next_buffer_at (bfds,
  426                                          bfds->fpos + bfds->buffer_bytes)) )
  427     {
  428       /* revert to original position, invalidate buffer */
  429       bfds->fpos = old_off;
  430       bfds->buffer_bytes = 0;
  431       bfds->buffer_pos = 0;
  432       LOG ("read operation failed\n");
  433       return -1; /* getting more failed */
  434     }
  435     avail = bfds->buffer_bytes - bfds->buffer_pos;
  436     if (avail > count)
  437       avail = count;
  438     if (0 == avail)
  439       break;
  440     memcpy (&cbuf[ret], bfds->data + bfds->buffer_pos, avail);
  441     bfds->buffer_pos += avail;
  442     count -= avail;
  443     ret += avail;
  444   }
  445   return ret;
  446 }
  447 
  448 
  449 #if HAVE_ZLIB
  450 /**
  451  * Initializes gz-decompression object. Might report metadata about
  452  * compresse stream, if available. Resets the stream to the beginning.
  453  *
  454  * @param cfs cfs to initialize
  455  * @param proc callback for metadata
  456  * @param proc_cls callback cls
  457  * @return 1 on success, 0 to terminate extraction, -1 on error
  458  */
  459 static int
  460 cfs_init_decompressor_zlib (struct CompressedFileSource *cfs,
  461                             EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
  462 {
  463   unsigned int gzip_header_length = 10;
  464   unsigned char hdata[12];
  465   ssize_t rsize;
  466 
  467   if (0 != bfds_seek (cfs->bfds, 0, SEEK_SET))
  468   {
  469     LOG ("Failed to seek to offset 0!\n");
  470     return -1;
  471   }
  472   /* Process gzip header */
  473   rsize = bfds_read (cfs->bfds, hdata, sizeof (hdata));
  474   if ( (-1 == rsize) ||
  475        (sizeof (hdata) > (size_t) rsize) )
  476     return -1;
  477   if (0 != (hdata[3] & 0x4)) /* FEXTRA  set */
  478     gzip_header_length += 2 + (hdata[10] & 0xff) + ((hdata[11] & 0xff) * 256);
  479 
  480   if (0 != (hdata[3] & 0x8))
  481   {
  482     /* FNAME set */
  483     char fname[1024];
  484     char *cptr;
  485     size_t len;
  486     ssize_t buf_bytes;
  487 
  488     if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
  489                                         SEEK_SET))
  490     {
  491       LOG ("Corrupt gzip, failed to seek to end of header\n");
  492       return -1;
  493     }
  494     buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname));
  495     if (buf_bytes <= 0)
  496     {
  497       LOG ("Corrupt gzip, failed to read filename\n");
  498       return -1;
  499     }
  500     if (NULL == (cptr = memchr (fname, 0, buf_bytes)))
  501     {
  502       LOG ("Corrupt gzip, failed to read filename terminator\n");
  503       return -1;
  504     }
  505     len = cptr - fname;
  506     if ( (NULL != proc) &&
  507          (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME,
  508                      EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
  509                      fname,
  510                      len)) )
  511       return 0; /* done */
  512     gzip_header_length += len + 1;
  513   }
  514 
  515   if (0 != (hdata[3] & 0x16))
  516   {
  517     /* FCOMMENT set */
  518     char fcomment[1024];
  519     char *cptr;
  520     ssize_t buf_bytes;
  521     size_t len;
  522 
  523     if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
  524                                         SEEK_SET))
  525     {
  526       LOG ("Corrupt gzip, failed to seek to end of header\n");
  527       return -1;
  528     }
  529     buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment));
  530     if (buf_bytes <= 0)
  531     {
  532       LOG ("Corrupt gzip, failed to read comment\n");
  533       return -1;
  534     }
  535     if (NULL == (cptr = memchr (fcomment, 0, buf_bytes)))
  536     {
  537       LOG ("Corrupt gzip, failed to read comment terminator\n");
  538       return -1;
  539     }
  540     len = cptr - fcomment;
  541     if ( (NULL != proc) &&
  542          (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
  543                      EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
  544                      (const char *) fcomment,
  545                      len)) )
  546       return 0; /* done */
  547     gzip_header_length += len + 1;
  548   }
  549   if (0 != (hdata[3] & 0x2)) /* FCHRC set */
  550     gzip_header_length += 2;
  551   memset (&cfs->strm, 0, sizeof (z_stream));
  552 
  553 #ifdef ZLIB_VERNUM
  554   /* zlib will take care of its header */
  555   gzip_header_length = 0;
  556 #endif
  557   cfs->gzip_header_length = gzip_header_length;
  558 
  559   if (cfs->gzip_header_length !=
  560       bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET))
  561   {
  562     LOG ("Failed to seek to start to initialize gzip decompressor\n");
  563     return -1;
  564   }
  565   cfs->strm.avail_out = COM_CHUNK_SIZE;
  566   /*
  567    * note: maybe plain inflateInit(&strm) is adequate,
  568    * it looks more backward-compatible also ;
  569    *
  570    * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
  571    * there might be a better check.
  572    */if (Z_OK != inflateInit2 (&cfs->strm,
  573 #ifdef ZLIB_VERNUM
  574                             15 + 32
  575 #else
  576                             -MAX_WBITS
  577 #endif
  578                             ))
  579   {
  580     LOG ("Failed to initialize zlib decompression\n");
  581     return -1;
  582   }
  583   return 1;
  584 }
  585 
  586 
  587 #endif
  588 
  589 
  590 #if HAVE_LIBBZ2
  591 /**
  592  * Initializes bz2-decompression object. Might report metadata about
  593  * compresse stream, if available. Resets the stream to the beginning.
  594  *
  595  * @param cfs cfs to initialize
  596  * @param proc callback for metadata
  597  * @param proc_cls callback cls
  598  * @return 1 on success, -1 on error
  599  */
  600 static int
  601 cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs,
  602                            EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
  603 {
  604   if (0 !=
  605       bfds_seek (cfs->bfds, 0, SEEK_SET))
  606   {
  607     LOG ("Failed to seek to start to initialize BZ2 decompressor\n");
  608     return -1;
  609   }
  610   memset (&cfs->bstrm, 0, sizeof (bz_stream));
  611   if (BZ_OK !=
  612       BZ2_bzDecompressInit (&cfs->bstrm, 0, 0))
  613   {
  614     LOG ("Failed to initialize BZ2 decompressor\n");
  615     return -1;
  616   }
  617   cfs->bstrm.avail_out = COM_CHUNK_SIZE;
  618   return 1;
  619 }
  620 
  621 
  622 #endif
  623 
  624 
  625 /**
  626  * Initializes decompression object. Might report metadata about
  627  * compresse stream, if available. Resets the stream to the beginning.
  628  *
  629  * @param cfs cfs to initialize
  630  * @param proc callback for metadata
  631  * @param proc_cls callback cls
  632  * @return 1 on success, 0 to terminate extraction, -1 on error
  633  */
  634 static int
  635 cfs_init_decompressor (struct CompressedFileSource *cfs,
  636                        EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
  637 {
  638   cfs->result_pos = 0;
  639   cfs->fpos = 0;
  640   switch (cfs->compression_type)
  641   {
  642 #if HAVE_ZLIB
  643   case COMP_TYPE_ZLIB:
  644     return cfs_init_decompressor_zlib (cfs, proc, proc_cls);
  645 #endif
  646 #if HAVE_LIBBZ2
  647   case COMP_TYPE_BZ2:
  648     return cfs_init_decompressor_bz2 (cfs, proc, proc_cls);
  649 #endif
  650   default:
  651     LOG ("invalid compression type selected\n");
  652     return -1;
  653   }
  654 }
  655 
  656 
  657 #if HAVE_ZLIB
  658 /**
  659  * Deinitializes gz-decompression object.
  660  *
  661  * @param cfs cfs to deinitialize
  662  * @return 1 on success, -1 on error
  663  */
  664 static int
  665 cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs)
  666 {
  667   inflateEnd (&cfs->strm);
  668   return 1;
  669 }
  670 
  671 
  672 #endif
  673 
  674 
  675 #if HAVE_LIBBZ2
  676 /**
  677  * Deinitializes bz2-decompression object.
  678  *
  679  * @param cfs cfs to deinitialize
  680  * @return 1 on success, -1 on error
  681  */
  682 static int
  683 cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs)
  684 {
  685   BZ2_bzDecompressEnd (&cfs->bstrm);
  686   return 1;
  687 }
  688 
  689 
  690 #endif
  691 
  692 
  693 /**
  694  * Deinitializes decompression object.
  695  *
  696  * @param cfs cfs to deinitialize
  697  * @return 1 on success, -1 on error
  698  */
  699 static int
  700 cfs_deinit_decompressor (struct CompressedFileSource *cfs)
  701 {
  702   switch (cfs->compression_type)
  703   {
  704 #if HAVE_ZLIB
  705   case COMP_TYPE_ZLIB:
  706     return cfs_deinit_decompressor_zlib (cfs);
  707 #endif
  708 #if HAVE_LIBBZ2
  709   case COMP_TYPE_BZ2:
  710     return cfs_deinit_decompressor_bz2 (cfs);
  711 #endif
  712   default:
  713     LOG ("invalid compression type selected\n");
  714     return -1;
  715   }
  716 }
  717 
  718 
  719 /**
  720  * Resets the compression stream to begin uncompressing
  721  * from the beginning. Used at initialization time, and when
  722  * seeking backward.
  723  *
  724  * @param cfs cfs to reset
  725  * @return 1 on success, 0 to terminate extraction,
  726  *        -1 on error
  727  */
  728 static int
  729 cfs_reset_stream (struct CompressedFileSource *cfs)
  730 {
  731   if (-1 == cfs_deinit_decompressor (cfs))
  732     return -1;
  733   return cfs_init_decompressor (cfs, NULL, NULL);
  734 }
  735 
  736 
  737 /**
  738  * Destroy compressed file source.
  739  *
  740  * @param cfs source to destroy
  741  */
  742 static void
  743 cfs_destroy (struct CompressedFileSource *cfs)
  744 {
  745   cfs_deinit_decompressor (cfs);
  746   free (cfs);
  747 }
  748 
  749 
  750 /**
  751  * Allocates and initializes new cfs object.
  752  *
  753  * @param bfds data source to use
  754  * @param fsize size of the source
  755  * @param compression_type type of compression used
  756  * @param proc metadata callback to call with meta data found upon opening
  757  * @param proc_cls callback cls
  758  * @return newly allocated cfs on success, NULL on error
  759  */
  760 struct CompressedFileSource *
  761 cfs_new (struct BufferedFileDataSource *bfds,
  762          int64_t fsize,
  763          enum ExtractorCompressionType compression_type,
  764          EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
  765 {
  766   struct CompressedFileSource *cfs;
  767 
  768   if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource))))
  769   {
  770     LOG_STRERROR ("malloc");
  771     return NULL;
  772   }
  773   memset (cfs, 0, sizeof (struct CompressedFileSource));
  774   cfs->compression_type = compression_type;
  775   cfs->bfds = bfds;
  776   cfs->fsize = fsize;
  777   cfs->uncompressed_size = -1;
  778   if (1 != cfs_init_decompressor (cfs,
  779                                   proc, proc_cls))
  780   {
  781     free (cfs);
  782     return NULL;
  783   }
  784   return cfs;
  785 }
  786 
  787 
  788 #if HAVE_ZLIB
  789 /**
  790  * Fills 'data' with new uncompressed data.  Does the actual
  791  * decompression. Will set uncompressed_size on the end of compressed
  792  * stream.
  793  *
  794  * @param cfds cfs to read from
  795  * @param data where to copy the data
  796  * @param size number of bytes available in data
  797  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
  798  */
  799 static ssize_t
  800 cfs_read_zlib (struct CompressedFileSource *cfs,
  801                void *data,
  802                size_t size)
  803 {
  804   char *dst = data;
  805   int ret;
  806   size_t rc;
  807   ssize_t in;
  808   unsigned char buf[COM_CHUNK_SIZE];
  809 
  810   if (cfs->fpos == cfs->uncompressed_size)
  811   {
  812     /* end of file */
  813     return 0;
  814   }
  815   rc = 0;
  816   if (COM_CHUNK_SIZE > cfs->strm.avail_out + cfs->result_pos)
  817   {
  818     /* got left-over decompressed data from previous round! */
  819     in = COM_CHUNK_SIZE - (cfs->strm.avail_out + cfs->result_pos);
  820     if (in > size)
  821       in = size;
  822     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
  823     cfs->fpos += in;
  824     cfs->result_pos += in;
  825     rc += in;
  826   }
  827   ret = Z_OK;
  828   while ( (rc < size) && (Z_STREAM_END != ret) )
  829   {
  830     /* read block from original data source */
  831     in = bfds_read (cfs->bfds,
  832                     buf, sizeof (buf));
  833     if (in < 0)
  834     {
  835       LOG ("unexpected EOF\n");
  836       return -1; /* unexpected EOF */
  837     }
  838     if (0 == in)
  839     {
  840       cfs->uncompressed_size = cfs->fpos;
  841       return rc;
  842     }
  843     cfs->strm.next_in = buf;
  844     cfs->strm.avail_in = (uInt) in;
  845     cfs->strm.next_out = (unsigned char *) cfs->result;
  846     cfs->strm.avail_out = COM_CHUNK_SIZE;
  847     cfs->result_pos = 0;
  848     ret = inflate (&cfs->strm, Z_SYNC_FLUSH);
  849     if ( (Z_OK != ret) && (Z_STREAM_END != ret) )
  850     {
  851       LOG ("unexpected gzip inflate error: %d\n", ret);
  852       return -1; /* unexpected error */
  853     }
  854     /* go backwards by the number of bytes left in the buffer */
  855     if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->strm.avail_in, SEEK_CUR))
  856     {
  857       LOG ("seek failed\n");
  858       return -1;
  859     }
  860     /* copy decompressed bytes to target buffer */
  861     in = COM_CHUNK_SIZE - cfs->strm.avail_out;
  862     if (in > size - rc)
  863     {
  864       if (Z_STREAM_END == ret)
  865       {
  866         cfs->uncompressed_size = cfs->fpos + in;
  867         ret = Z_OK;
  868       }
  869       in = size - rc;
  870     }
  871     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
  872     cfs->fpos += in;
  873     cfs->result_pos += in;
  874     rc += in;
  875   }
  876   if (Z_STREAM_END == ret)
  877   {
  878     cfs->uncompressed_size = cfs->fpos;
  879   }
  880   return rc;
  881 }
  882 
  883 
  884 #endif
  885 
  886 
  887 #if HAVE_LIBBZ2
  888 /**
  889  * Fills 'data' with new uncompressed data.  Does the actual
  890  * decompression. Will set uncompressed_size on the end of compressed
  891  * stream.
  892  *
  893  * @param cfds cfs to read from
  894  * @param data where to copy the data
  895  * @param size number of bytes available in data
  896  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
  897  */
  898 static ssize_t
  899 cfs_read_bz2 (struct CompressedFileSource *cfs,
  900               void *data,
  901               size_t size)
  902 {
  903   char *dst = data;
  904   int ret;
  905   size_t rc;
  906   ssize_t in;
  907   char buf[COM_CHUNK_SIZE];
  908 
  909   if (cfs->fpos == cfs->uncompressed_size)
  910   {
  911     /* end of file */
  912     return 0;
  913   }
  914   rc = 0;
  915   if (COM_CHUNK_SIZE > cfs->bstrm.avail_out + cfs->result_pos)
  916   {
  917     /* got left-over decompressed data from previous round! */
  918     in = COM_CHUNK_SIZE - (cfs->bstrm.avail_out + cfs->result_pos);
  919     if (in > size)
  920       in = size;
  921     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
  922     cfs->fpos += in;
  923     cfs->result_pos += in;
  924     rc += in;
  925   }
  926   ret = BZ_OK;
  927   while ( (rc < size) && (BZ_STREAM_END != ret) )
  928   {
  929     /* read block from original data source */
  930     in = bfds_read (cfs->bfds,
  931                     buf, sizeof (buf));
  932     if (in < 0)
  933     {
  934       LOG ("unexpected EOF\n");
  935       return -1; /* unexpected EOF */
  936     }
  937     if (0 == in)
  938     {
  939       cfs->uncompressed_size = cfs->fpos;
  940       return rc;
  941     }
  942     cfs->bstrm.next_in = buf;
  943     cfs->bstrm.avail_in = (unsigned int) in;
  944     cfs->bstrm.next_out = cfs->result;
  945     cfs->bstrm.avail_out = COM_CHUNK_SIZE;
  946     cfs->result_pos = 0;
  947     ret = BZ2_bzDecompress (&cfs->bstrm);
  948     if ( (BZ_OK != ret) && (BZ_STREAM_END != ret) )
  949     {
  950       LOG ("unexpected bzip2 decompress error: %d\n", ret);
  951       return -1; /* unexpected error */
  952     }
  953     /* go backwards by the number of bytes left in the buffer */
  954     if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->bstrm.avail_in, SEEK_CUR))
  955     {
  956       LOG ("seek failed\n");
  957       return -1;
  958     }
  959     /* copy decompressed bytes to target buffer */
  960     in = COM_CHUNK_SIZE - cfs->bstrm.avail_out;
  961     if (in > size - rc)
  962     {
  963       if (BZ_STREAM_END == ret)
  964       {
  965         cfs->uncompressed_size = cfs->fpos + in;
  966         ret = BZ_OK;
  967       }
  968       in = size - rc;
  969     }
  970     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
  971     cfs->fpos += in;
  972     cfs->result_pos += in;
  973     rc += in;
  974   }
  975   if (BZ_STREAM_END == ret)
  976   {
  977     cfs->uncompressed_size = cfs->fpos;
  978   }
  979   return rc;
  980 }
  981 
  982 
  983 #endif
  984 
  985 
  986 /**
  987  * Fills 'data' with new uncompressed data.  Does the actual
  988  * decompression. Will set uncompressed_size on the end of compressed
  989  * stream.
  990  *
  991  * @param cfds cfs to read from
  992  * @param data where to copy the data
  993  * @param size number of bytes available in data
  994  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
  995  */
  996 static ssize_t
  997 cfs_read (struct CompressedFileSource *cfs,
  998           void *data,
  999           size_t size)
 1000 {
 1001   switch (cfs->compression_type)
 1002   {
 1003 #if HAVE_ZLIB
 1004   case COMP_TYPE_ZLIB:
 1005     return cfs_read_zlib (cfs, data, size);
 1006 #endif
 1007 #if HAVE_LIBBZ2
 1008   case COMP_TYPE_BZ2:
 1009     return cfs_read_bz2 (cfs, data, size);
 1010 #endif
 1011   default:
 1012     LOG ("invalid compression type selected\n");
 1013     return -1;
 1014   }
 1015 }
 1016 
 1017 
 1018 /**
  1019  * Moves the buffer to 'position' in uncompressed stream. If position
 1020  * requires seeking backwards beyond the boundaries of the buffer, resets the
 1021  * stream and repeats decompression from the beginning to 'position'.
 1022  *
 1023  * @param cfs cfs to seek on
 1024  * @param position new starting point for the buffer
 1025  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
 1026  * @return new absolute buffer position, -1 on error or EOS
 1027  */
 1028 static int64_t
 1029 cfs_seek (struct CompressedFileSource *cfs,
 1030           int64_t position,
 1031           int whence)
 1032 {
 1033   uint64_t nposition;
 1034   int64_t delta;
 1035 
 1036   switch (whence)
 1037   {
 1038   case SEEK_CUR:
 1039     if (cfs->fpos + position < 0)
 1040     {
 1041       /* underflow */
 1042       LOG ("Invalid seek operation\n");
 1043       return -1;
 1044     }
 1045     if ( (-1 != cfs->uncompressed_size) &&
 1046          (cfs->fpos + position > cfs->uncompressed_size) )
 1047     {
 1048       LOG ("Invalid seek operation\n");
 1049       return -1;
 1050     }
 1051     nposition = cfs->fpos + position;
 1052     break;
 1053   case SEEK_END:
 1054     ASSERT (-1 != cfs->uncompressed_size);
 1055     if (position > 0)
 1056     {
 1057       LOG ("Invalid seek operation\n");
 1058       return -1;
 1059     }
 1060     if (cfs->uncompressed_size < -position)
 1061     {
 1062       LOG ("Invalid seek operation\n");
 1063       return -1;
 1064     }
 1065     nposition = cfs->uncompressed_size + position;
 1066     break;
 1067   case SEEK_SET:
 1068     if (position < 0)
 1069     {
 1070       LOG ("Invalid seek operation\n");
 1071       return -1;
 1072     }
 1073     if ( (-1 != cfs->uncompressed_size) &&
 1074          (cfs->uncompressed_size < position) )
 1075     {
 1076       LOG ("Invalid seek operation\n");
 1077       return -1;
 1078     }
 1079     nposition = (uint64_t) position;
 1080     break;
 1081   default:
 1082     LOG ("Invalid seek operation\n");
 1083     return -1;
 1084   }
 1085   delta = nposition - cfs->fpos;
 1086   if (delta < 0)
 1087   {
 1088     if (cfs->result_pos >= -delta)
 1089     {
 1090       cfs->result_pos += delta;
 1091       cfs->fpos += delta;
 1092       delta = 0;
 1093     }
 1094     else
 1095     {
 1096       if (-1 == cfs_reset_stream (cfs))
 1097       {
 1098         LOG ("Failed to restart compressed stream for seek operation\n");
 1099         return -1;
 1100       }
 1101       delta = nposition;
 1102     }
 1103   }
 1104   while (delta > 0)
 1105   {
 1106     char buf[COM_CHUNK_SIZE];
 1107     size_t max;
 1108     int64_t ret;
 1109 
 1110     max = (sizeof (buf) > delta) ? delta : sizeof (buf);
 1111     ret = cfs_read (cfs, buf, max);
 1112     if (-1 == ret)
 1113     {
 1114       LOG ("Failed to read decompressed stream for seek operation\n");
 1115       return -1;
 1116     }
 1117     if (0 == ret)
 1118     {
 1119       LOG (
 1120         "Reached unexpected end of stream at %llu during seek operation to %llu (%d left)\n",
 1121         (unsigned long long) cfs->fpos,
 1122         (unsigned long long) nposition,
 1123         delta);
 1124       return -1;
 1125     }
 1126     ASSERT (ret <= delta);
 1127     delta -= ret;
 1128   }
 1129   return cfs->fpos;
 1130 }
 1131 
 1132 
 1133 /**
 1134  * Detect if we have compressed data on our hands.
 1135  *
 1136  * @param data pointer to a data buffer or NULL (in case fd is not -1)
 1137  * @param fd a file to read data from, or -1 (if data is not NULL)
 1138  * @param fsize size of data (if data is not NULL) or of file (if fd is not -1)
 1139  * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression
 1140  */
 1141 static enum ExtractorCompressionType
 1142 get_compression_type (struct BufferedFileDataSource *bfds)
 1143 {
 1144   unsigned char read_data[3];
 1145 
 1146   if (0 != bfds_seek (bfds, 0, SEEK_SET))
 1147     return COMP_TYPE_INVALID;
 1148   if (sizeof (read_data) !=
 1149       bfds_read (bfds, read_data, sizeof (read_data)))
 1150     return COMP_TYPE_UNDEFINED;
 1151 
 1152 #if HAVE_ZLIB
 1153   if ( (bfds->fsize >= MIN_ZLIB_HEADER) &&
 1154        (read_data[0] == 0x1f) &&
 1155        (read_data[1] == 0x8b) &&
 1156        (read_data[2] == 0x08) )
 1157     return COMP_TYPE_ZLIB;
 1158 #endif
 1159 #if HAVE_LIBBZ2
 1160   if ( (bfds->fsize >= MIN_BZ2_HEADER) &&
 1161        (read_data[0] == 'B') &&
 1162        (read_data[1] == 'Z') &&
 1163        (read_data[2] == 'h'))
 1164     return COMP_TYPE_BZ2;
 1165 #endif
 1166   return COMP_TYPE_INVALID;
 1167 }
 1168 
 1169 
 1170 /**
 1171  * Handle to a datasource we can use for the plugins.
 1172  */
 1173 struct EXTRACTOR_Datasource
 1174 {
 1175 
 1176   /**
 1177    * Underlying buffered data source.
 1178    */
 1179   struct BufferedFileDataSource *bfds;
 1180 
 1181   /**
 1182    * Compressed file source (NULL if not applicable).
 1183    */
 1184   struct CompressedFileSource *cfs;
 1185 
 1186   /**
 1187    * Underlying file descriptor, -1 for none.
 1188    */
 1189   int fd;
 1190 };
 1191 
 1192 
 1193 /**
 1194  * Create a datasource from a file on disk.
 1195  *
 1196  * @param filename name of the file on disk
 1197  * @param proc metadata callback to call with meta data found upon opening
 1198  * @param proc_cls callback cls
 1199  * @return handle to the datasource, NULL on error
 1200  */
 1201 struct EXTRACTOR_Datasource *
 1202 EXTRACTOR_datasource_create_from_file_ (const char *filename,
 1203                                         EXTRACTOR_MetaDataProcessor proc,
 1204                                         void *proc_cls)
 1205 {
 1206   struct BufferedFileDataSource *bfds;
 1207   struct EXTRACTOR_Datasource *ds;
 1208   enum ExtractorCompressionType ct;
 1209   int fd;
 1210   struct stat sb;
 1211   int64_t fsize;
 1212   int winmode = 0;
 1213 #if WINDOWS
 1214   winmode = O_BINARY;
 1215 #endif
 1216 
 1217   if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE | winmode)))
 1218   {
 1219     LOG_STRERROR_FILE ("open", filename);
 1220     return NULL;
 1221   }
 1222   if ( (0 != fstat (fd, &sb)) ||
 1223        (S_ISDIR (sb.st_mode)) )
 1224   {
 1225     if (! S_ISDIR (sb.st_mode))
 1226       LOG_STRERROR_FILE ("fstat", filename);
 1227     else
 1228       LOG ("Skipping directory `%s'\n", filename);
 1229     (void) close (fd);
 1230     return NULL;
 1231   }
 1232   fsize = (int64_t) sb.st_size;
 1233   if (0 == fsize)
 1234   {
 1235     (void) close (fd);
 1236     return NULL;
 1237   }
 1238   bfds = bfds_new (NULL, fd, fsize);
 1239   if (NULL == bfds)
 1240   {
 1241     (void) close (fd);
 1242     return NULL;
 1243   }
 1244   if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
 1245   {
 1246     LOG_STRERROR ("malloc");
 1247     bfds_delete (bfds);
 1248     (void) close (fd);
 1249     return NULL;
 1250   }
 1251   ds->bfds = bfds;
 1252   ds->fd = fd;
 1253   ds->cfs = NULL;
 1254   ct = get_compression_type (bfds);
 1255   if ( (COMP_TYPE_ZLIB == ct) ||
 1256        (COMP_TYPE_BZ2 == ct) )
 1257   {
 1258     ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
 1259     if (NULL == ds->cfs)
 1260     {
 1261       LOG ("Failed to initialize decompressor\n");
 1262       bfds_delete (bfds);
 1263       free (ds);
 1264       (void) close (fd);
 1265       return NULL;
 1266     }
 1267   }
 1268   return ds;
 1269 }
 1270 
 1271 
 1272 /**
 1273  * Create a datasource from a buffer in memory.
 1274  *
 1275  * @param buf data in memory
 1276  * @param size number of bytes in 'buf'
 1277  * @param proc metadata callback to call with meta data found upon opening
 1278  * @param proc_cls callback cls
 1279  * @return handle to the datasource
 1280  */
 1281 struct EXTRACTOR_Datasource *
 1282 EXTRACTOR_datasource_create_from_buffer_ (const char *buf,
 1283                                           size_t size,
 1284                                           EXTRACTOR_MetaDataProcessor proc,
 1285                                           void *proc_cls)
 1286 {
 1287   struct BufferedFileDataSource *bfds;
 1288   struct EXTRACTOR_Datasource *ds;
 1289   enum ExtractorCompressionType ct;
 1290 
 1291   if (0 == size)
 1292     return NULL;
 1293   if (NULL == (bfds = bfds_new (buf, -1, size)))
 1294   {
 1295     LOG ("Failed to initialize buffer data source\n");
 1296     return NULL;
 1297   }
 1298   if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
 1299   {
 1300     LOG_STRERROR ("malloc");
 1301     bfds_delete (bfds);
 1302     return NULL;
 1303   }
 1304   ds->bfds = bfds;
 1305   ds->fd = -1;
 1306   ds->cfs = NULL;
 1307   ct = get_compression_type (bfds);
 1308   if ( (COMP_TYPE_ZLIB == ct) ||
 1309        (COMP_TYPE_BZ2 == ct) )
 1310   {
 1311     ds->cfs = cfs_new (bfds, size, ct, proc, proc_cls);
 1312     if (NULL == ds->cfs)
 1313     {
 1314       LOG ("Failed to initialize decompressor\n");
 1315       bfds_delete (bfds);
 1316       free (ds);
 1317       return NULL;
 1318     }
 1319   }
 1320   return ds;
 1321 }
 1322 
 1323 
 1324 /**
 1325  * Destroy a data source.
 1326  *
 1327  * @param ds source to destroy
 1328  */
 1329 void
 1330 EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds)
 1331 {
 1332   if (NULL != ds->cfs)
 1333     cfs_destroy (ds->cfs);
 1334   bfds_delete (ds->bfds);
 1335   if (-1 != ds->fd)
 1336     (void) close (ds->fd);
 1337   free (ds);
 1338 }
 1339 
 1340 
 1341 /**
 1342  * Make 'size' bytes of data from the data source available at 'data'.
 1343  *
 1344  * @param cls must be a 'struct EXTRACTOR_Datasource'
 1345  * @param data where the data should be copied to
 1346  * @param size maximum number of bytes requested
 1347  * @return number of bytes now available in data (can be smaller than 'size'),
 1348  *         -1 on error
 1349  */
 1350 ssize_t
 1351 EXTRACTOR_datasource_read_ (void *cls,
 1352                             void *data,
 1353                             size_t size)
 1354 {
 1355   struct EXTRACTOR_Datasource *ds = cls;
 1356 
 1357   if (NULL != ds->cfs)
 1358     return cfs_read (ds->cfs, data, size);
 1359   return bfds_read (ds->bfds, data, size);
 1360 }
 1361 
 1362 
 1363 /**
 1364  * Seek in the datasource.  Use 'SEEK_CUR' for whence and 'pos' of 0 to
 1365  * obtain the current position in the file.
 1366  *
 1367  * @param cls must be a 'struct EXTRACTOR_Datasource'
 1368  * @param pos position to seek (see 'man lseek')
 1369  * @param whence how to see (absolute to start, relative, absolute to end)
 1370  * @return new absolute position, UINT64_MAX on error (i.e. desired position
 1371  *         does not exist)
 1372  */
 1373 int64_t
 1374 EXTRACTOR_datasource_seek_ (void *cls,
 1375                             int64_t pos,
 1376                             int whence)
 1377 {
 1378   struct EXTRACTOR_Datasource *ds = cls;
 1379 
 1380   if (NULL != ds->cfs)
 1381   {
 1382     if ( (SEEK_END == whence) &&
 1383          (-1 == ds->cfs->uncompressed_size) )
 1384     {
 1385       /* need to obtain uncompressed size */
 1386       (void) EXTRACTOR_datasource_get_size_ (ds, 1);
 1387       if (-1 == ds->cfs->uncompressed_size)
 1388         return -1;
 1389     }
 1390     return cfs_seek (ds->cfs, pos, whence);
 1391   }
 1392   return bfds_seek (ds->bfds, pos, whence);
 1393 }
 1394 
 1395 
 1396 /**
 1397  * Determine the overall size of the data source (after compression).
 1398  *
 1399  * @param cls must be a 'struct EXTRACTOR_Datasource'
 1400  * @param force force computing the size if it is unavailable
 1401  * @return overall file size, UINT64_MAX on error or unknown
 1402  */
 1403 int64_t
 1404 EXTRACTOR_datasource_get_size_ (void *cls,
 1405                                 int force)
 1406 {
 1407   struct EXTRACTOR_Datasource *ds = cls;
 1408   char buf[32 * 1024];
 1409   uint64_t pos;
 1410 
 1411   if (NULL != ds->cfs)
 1412   {
 1413     if ( (force) &&
 1414          (-1 == ds->cfs->uncompressed_size) )
 1415     {
 1416       pos = ds->cfs->fpos;
 1417       while ( (-1 == ds->cfs->uncompressed_size) &&
 1418               (-1 != cfs_read (ds->cfs, buf, sizeof (buf))) )
 1419         ;
 1420       if (-1 == cfs_seek (ds->cfs, pos, SEEK_SET))
 1421       {
 1422         LOG (
 1423           "Serious problem, I moved the buffer to determine the file size but could not restore it...\n");
 1424         return -1;
 1425       }
 1426       if (-1 == ds->cfs->uncompressed_size)
 1427         return -1;
 1428     }
 1429     return ds->cfs->uncompressed_size;
 1430   }
 1431   return ds->bfds->fsize;
 1432 }
 1433 
 1434 
 1435 /* end of extractor_datasource.c */