libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

extractor_datasource.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19  */
20 /**
21  * @file main/extractor_datasource.c
22  * @brief random access and possibly decompression of data from buffer in memory or file on disk
23  * @author Christian Grothoff
24  */
25 #include "platform.h"
26 #include "extractor_logging.h"
27 #include "extractor_datasource.h"
28 
#if HAVE_LIBBZ2
#include <bzlib.h>
/* smallest source size for which get_compression_type() accepts bz2 */
#define MIN_BZ2_HEADER 4
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
#endif
#endif

#if HAVE_ZLIB
#include <zlib.h>
/* smallest source size for which get_compression_type() accepts gzip */
#define MIN_ZLIB_HEADER 12
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
#endif
#endif

/* no decompression library available */
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER (-1)
#endif
48 
49 #ifndef O_LARGEFILE
50 #define O_LARGEFILE 0
51 #endif
52 
53 /**
54  * Maximum size of an IO buffer.
55  */
56 #define MAX_READ (4 * 1024 * 1024)
57 
58 /**
59  * Data is read from the source and shoved into decompressor
60  * in chunks this big.
61  */
62 #define COM_CHUNK_SIZE (16 * 1024)
63 
64 
65 /**
66  * Enum with the various possible types of compression supported.
67  */
69 {
70  /**
71  * We cannot tell from the data (header incomplete).
72  */
74 
75  /**
76  * Invalid header (likely uncompressed)
77  */
79 
80  /**
81  * libz / gzip compression.
82  */
84 
85  /**
86  * bz2 compression
87  */
88  COMP_TYPE_BZ2 = 2
89 };
90 
91 
92 /**
93  * Abstraction of the data source (file or a memory buffer)
94  * for the decompressor.
95  */
97 {
98  /**
99  * Pointer to the buffer to read from (may be NULL)
100  */
101  const void *data;
102 
103  /**
104  * A buffer to read into. For fd != -1: when data != NULL,
105  * data is used directly.
106  */
107  void *buffer;
108 
109  /**
110  * Size of the file (or the data buffer)
111  */
112  uint64_t fsize;
113 
114  /**
115  * Position of the buffer in the file.
116  */
117  uint64_t fpos;
118 
119  /**
120  * Position within the buffer. Our absolute offset in the file
121  * is thus 'fpos + buffer_pos'.
122  */
123  size_t buffer_pos;
124 
125  /**
126  * Number of valid bytes in the buffer (<= buffer_size)
127  */
128  size_t buffer_bytes;
129 
130  /**
131  * Allocated size of the buffer
132  */
133  size_t buffer_size;
134 
135  /**
136  * Descriptor of the file to read data from (may be -1)
137  */
138  int fd;
139 
140 };
141 
142 
143 /**
144  * An object from which uncompressed data can be read
145  */
147 {
148  /**
149  * The source of data
150  */
152 
153  /**
154  * Decompression target buffer.
155  */
157 
158  /**
159  * At which offset in 'result' is 'fpos'?
160  */
161  size_t result_pos;
162 
163  /**
164  * Size of the source (same as bfds->fsize)
165  */
166  int64_t fsize;
167 
168  /**
169  * Position within the (decompressed) source
170  */
171  int64_t fpos;
172 
173  /**
174  * Total size of the uncompressed data. Remains -1 until
175  * decompression is finished.
176  */
178 
179 #if HAVE_LIBBZ2
180  /**
181  * BZ2 stream object
182  */
183  bz_stream bstrm;
184 #endif
185 
186 #if HAVE_ZLIB
187  /**
188  * ZLIB stream object
189  */
190  z_stream strm;
191 
192  /**
193  * Length of gzip header (may be 0, in that case ZLIB parses the header)
194  */
195  int gzip_header_length;
196 #endif
197 
198  /**
199  * The type of compression used in the source
200  */
202 
203 };
204 
205 
206 /**
207  * Makes bfds seek to 'pos' and read a chunk of bytes there.
208  * Changes bfds->fpos, bfds->buffer_bytes and bfds->buffer_pos.
209  * Does almost nothing for memory-backed bfds.
210  *
211  * @param bfds bfds
212  * @param pos position
213  * @return 0 on success, -1 on error
214  */
215 static int
217  uint64_t pos)
218 {
219  int64_t position;
220  ssize_t rd;
221 
222  if (pos > bfds->fsize)
223  {
224  LOG ("Invalid seek operation\n");
225  return -1; /* invalid */
226  }
227  if (NULL == bfds->buffer)
228  {
229  bfds->buffer_pos = pos;
230  return 0;
231  }
232  position = (int64_t) lseek (bfds->fd, pos, SEEK_SET);
233  if (position < 0)
234  {
235  LOG_STRERROR ("lseek");
236  return -1;
237  }
238  bfds->fpos = position;
239  bfds->buffer_pos = 0;
240  rd = read (bfds->fd, bfds->buffer, bfds->buffer_size);
241  if (rd < 0)
242  {
243  LOG_STRERROR ("read");
244  return -1;
245  }
246  bfds->buffer_bytes = rd;
247  return 0;
248 }
249 
250 
251 /**
252  * Creates a bfds
253  *
254  * @param data data buffer to use as a source (NULL if fd != -1)
255  * @param fd file descriptor to use as a source (-1 if data != NULL)
256  * @param fsize size of the file (or the buffer)
257  * @return newly allocated bfds
258  */
259 static struct BufferedFileDataSource *
260 bfds_new (const void *data,
261  int fd,
262  int64_t fsize)
263 {
264  struct BufferedFileDataSource *result;
265  size_t xtra;
266 
267  if (fsize > MAX_READ)
268  xtra = MAX_READ;
269  else
270  xtra = (size_t) fsize;
271  if ( (-1 == fd) && (NULL == data) )
272  {
273  LOG ("Invalid arguments\n");
274  return NULL;
275  }
276  if ( (-1 != fd) && (NULL != data) )
277  fd = -1; /* don't need fd */
278  if (NULL != data)
279  xtra = 0;
280  if (NULL == (result = malloc (sizeof (struct BufferedFileDataSource) + xtra)))
281  {
282  LOG_STRERROR ("malloc");
283  return NULL;
284  }
285  memset (result, 0, sizeof (struct BufferedFileDataSource));
286  result->data = (NULL != data) ? data : &result[1];
287  result->buffer = (NULL != data) ? NULL : &result[1];
288  result->buffer_size = (NULL != data) ? fsize : xtra;
289  result->buffer_bytes = (NULL != data) ? fsize : 0;
290  result->fsize = fsize;
291  result->fd = fd;
292  bfds_pick_next_buffer_at (result, 0);
293  return result;
294 }
295 
296 
/**
 * Releases a bfds.  The file descriptor (if any) is not closed here.
 *
 * @param bfds bfds to deallocate
 */
static void
bfds_delete (struct BufferedFileDataSource *bfds)
{
  /* bfds_new() allocates the struct and its I/O buffer in one piece */
  free (bfds);
}
307 
308 
309 /**
310  * Makes bfds seek to 'pos' in 'whence' mode.
311  * Will try to seek within the buffer, will move the buffer location if
312  * the seek request falls outside of the buffer range.
313  *
314  * @param bfds bfds
315  * @param pos position to seek to
316  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
317  * @return new absolute position, -1 on error
318  */
319 static int64_t
321  int64_t pos, int whence)
322 {
323  uint64_t npos;
324  size_t nbpos;
325 
326  switch (whence)
327  {
328  case SEEK_CUR:
329  npos = bfds->fpos + bfds->buffer_pos + pos;
330  if (npos > bfds->fsize)
331  {
332  LOG ("Invalid seek operation to %lld from %llu (max is %llu)\n",
333  (long long) pos,
334  bfds->fpos + bfds->buffer_pos,
335  (unsigned long long) bfds->fsize);
336  return -1;
337  }
338  nbpos = bfds->buffer_pos + pos;
339  if ( (NULL == bfds->buffer) ||
340  (nbpos < bfds->buffer_bytes) )
341  {
342  bfds->buffer_pos = nbpos;
343  return npos;
344  }
345  if (0 != bfds_pick_next_buffer_at (bfds,
346  npos))
347  {
348  LOG ("seek operation failed\n");
349  return -1;
350  }
351  return npos;
352  case SEEK_END:
353  if (pos > 0)
354  {
355  LOG ("Invalid seek operation\n");
356  return -1;
357  }
358  if (bfds->fsize < -pos)
359  {
360  LOG ("Invalid seek operation\n");
361  return -1;
362  }
363  pos = bfds->fsize + pos;
364  /* fall-through! */
365  case SEEK_SET:
366  if (pos < 0)
367  {
368  LOG ("Invalid seek operation\n");
369  return -1;
370  }
371  if (pos > bfds->fsize)
372  {
373  LOG ("Invalid seek operation (%lld > %llu) %d\n",
374  (long long) pos,
375  (unsigned long long) bfds->fsize,
376  SEEK_SET == whence);
377  return -1;
378  }
379  if ( (NULL == bfds->buffer) ||
380  ( (bfds->fpos <= pos) &&
381  (bfds->fpos + bfds->buffer_bytes > pos) ) )
382  {
383  bfds->buffer_pos = pos - bfds->fpos;
384  return pos;
385  }
386  if (0 != bfds_pick_next_buffer_at (bfds, pos))
387  {
388  LOG ("seek operation failed\n");
389  return -1;
390  }
391  ASSERT (pos == bfds->fpos + bfds->buffer_pos);
392  return pos;
393  }
394  return -1;
395 }
396 
397 
398 /**
399  * Fills 'buf_ptr' with a chunk of data. Will
400  * fail if 'count' exceeds buffer size.
401  *
402  * @param bfds bfds
403  * @param buf_ptr location to store data
404  * @param count number of bytes to read
405  * @return number of bytes (<= count) available at location pointed by buf_ptr,
406  * 0 for end of stream, -1 on error
407  */
408 static ssize_t
410  void *buf_ptr,
411  size_t count)
412 {
413  char *cbuf = buf_ptr;
414  uint64_t old_off;
415  size_t avail;
416  size_t ret;
417 
418  old_off = bfds->fpos + bfds->buffer_pos;
419  if (old_off == bfds->fsize)
420  return 0; /* end of stream */
421  ret = 0;
422  while (count > 0)
423  {
424  if ( (bfds->buffer_bytes == bfds->buffer_pos) &&
425  (0 != bfds_pick_next_buffer_at (bfds,
426  bfds->fpos + bfds->buffer_bytes)) )
427  {
428  /* revert to original position, invalidate buffer */
429  bfds->fpos = old_off;
430  bfds->buffer_bytes = 0;
431  bfds->buffer_pos = 0;
432  LOG ("read operation failed\n");
433  return -1; /* getting more failed */
434  }
435  avail = bfds->buffer_bytes - bfds->buffer_pos;
436  if (avail > count)
437  avail = count;
438  if (0 == avail)
439  break;
440  memcpy (&cbuf[ret], bfds->data + bfds->buffer_pos, avail);
441  bfds->buffer_pos += avail;
442  count -= avail;
443  ret += avail;
444  }
445  return ret;
446 }
447 
448 
449 #if HAVE_ZLIB
450 /**
451  * Initializes gz-decompression object. Might report metadata about
452  * compresse stream, if available. Resets the stream to the beginning.
453  *
454  * @param cfs cfs to initialize
455  * @param proc callback for metadata
456  * @param proc_cls callback cls
457  * @return 1 on success, 0 to terminate extraction, -1 on error
458  */
459 static int
460 cfs_init_decompressor_zlib (struct CompressedFileSource *cfs,
461  EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
462 {
463  unsigned int gzip_header_length = 10;
464  unsigned char hdata[12];
465  ssize_t rsize;
466 
467  if (0 != bfds_seek (cfs->bfds, 0, SEEK_SET))
468  {
469  LOG ("Failed to seek to offset 0!\n");
470  return -1;
471  }
472  /* Process gzip header */
473  rsize = bfds_read (cfs->bfds, hdata, sizeof (hdata));
474  if ( (-1 == rsize) ||
475  (sizeof (hdata) > (size_t) rsize) )
476  return -1;
477  if (0 != (hdata[3] & 0x4)) /* FEXTRA set */
478  gzip_header_length += 2 + (hdata[10] & 0xff) + ((hdata[11] & 0xff) * 256);
479 
480  if (0 != (hdata[3] & 0x8))
481  {
482  /* FNAME set */
483  char fname[1024];
484  char *cptr;
485  size_t len;
486  ssize_t buf_bytes;
487 
488  if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
489  SEEK_SET))
490  {
491  LOG ("Corrupt gzip, failed to seek to end of header\n");
492  return -1;
493  }
494  buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname));
495  if (buf_bytes <= 0)
496  {
497  LOG ("Corrupt gzip, failed to read filename\n");
498  return -1;
499  }
500  if (NULL == (cptr = memchr (fname, 0, buf_bytes)))
501  {
502  LOG ("Corrupt gzip, failed to read filename terminator\n");
503  return -1;
504  }
505  len = cptr - fname;
506  if ( (NULL != proc) &&
507  (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME,
508  EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
509  fname,
510  len)) )
511  return 0; /* done */
512  gzip_header_length += len + 1;
513  }
514 
515  if (0 != (hdata[3] & 0x16))
516  {
517  /* FCOMMENT set */
518  char fcomment[1024];
519  char *cptr;
520  ssize_t buf_bytes;
521  size_t len;
522 
523  if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
524  SEEK_SET))
525  {
526  LOG ("Corrupt gzip, failed to seek to end of header\n");
527  return -1;
528  }
529  buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment));
530  if (buf_bytes <= 0)
531  {
532  LOG ("Corrupt gzip, failed to read comment\n");
533  return -1;
534  }
535  if (NULL == (cptr = memchr (fcomment, 0, buf_bytes)))
536  {
537  LOG ("Corrupt gzip, failed to read comment terminator\n");
538  return -1;
539  }
540  len = cptr - fcomment;
541  if ( (NULL != proc) &&
542  (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
543  EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
544  (const char *) fcomment,
545  len)) )
546  return 0; /* done */
547  gzip_header_length += len + 1;
548  }
549  if (0 != (hdata[3] & 0x2)) /* FCHRC set */
550  gzip_header_length += 2;
551  memset (&cfs->strm, 0, sizeof (z_stream));
552 
553 #ifdef ZLIB_VERNUM
554  /* zlib will take care of its header */
555  gzip_header_length = 0;
556 #endif
557  cfs->gzip_header_length = gzip_header_length;
558 
559  if (cfs->gzip_header_length !=
560  bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET))
561  {
562  LOG ("Failed to seek to start to initialize gzip decompressor\n");
563  return -1;
564  }
565  cfs->strm.avail_out = COM_CHUNK_SIZE;
566  /*
567  * note: maybe plain inflateInit(&strm) is adequate,
568  * it looks more backward-compatible also ;
569  *
570  * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
571  * there might be a better check.
572  */if (Z_OK != inflateInit2 (&cfs->strm,
573 #ifdef ZLIB_VERNUM
574  15 + 32
575 #else
576  -MAX_WBITS
577 #endif
578  ))
579  {
580  LOG ("Failed to initialize zlib decompression\n");
581  return -1;
582  }
583  return 1;
584 }
585 
586 
587 #endif
588 
589 
590 #if HAVE_LIBBZ2
591 /**
592  * Initializes bz2-decompression object. Might report metadata about
593  * compresse stream, if available. Resets the stream to the beginning.
594  *
595  * @param cfs cfs to initialize
596  * @param proc callback for metadata
597  * @param proc_cls callback cls
598  * @return 1 on success, -1 on error
599  */
600 static int
601 cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs,
602  EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
603 {
604  if (0 !=
605  bfds_seek (cfs->bfds, 0, SEEK_SET))
606  {
607  LOG ("Failed to seek to start to initialize BZ2 decompressor\n");
608  return -1;
609  }
610  memset (&cfs->bstrm, 0, sizeof (bz_stream));
611  if (BZ_OK !=
612  BZ2_bzDecompressInit (&cfs->bstrm, 0, 0))
613  {
614  LOG ("Failed to initialize BZ2 decompressor\n");
615  return -1;
616  }
617  cfs->bstrm.avail_out = COM_CHUNK_SIZE;
618  return 1;
619 }
620 
621 
622 #endif
623 
624 
625 /**
626  * Initializes decompression object. Might report metadata about
627  * compresse stream, if available. Resets the stream to the beginning.
628  *
629  * @param cfs cfs to initialize
630  * @param proc callback for metadata
631  * @param proc_cls callback cls
632  * @return 1 on success, 0 to terminate extraction, -1 on error
633  */
634 static int
636  EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
637 {
638  cfs->result_pos = 0;
639  cfs->fpos = 0;
640  switch (cfs->compression_type)
641  {
642 #if HAVE_ZLIB
643  case COMP_TYPE_ZLIB:
644  return cfs_init_decompressor_zlib (cfs, proc, proc_cls);
645 #endif
646 #if HAVE_LIBBZ2
647  case COMP_TYPE_BZ2:
648  return cfs_init_decompressor_bz2 (cfs, proc, proc_cls);
649 #endif
650  default:
651  LOG ("invalid compression type selected\n");
652  return -1;
653  }
654 }
655 
656 
657 #if HAVE_ZLIB
658 /**
659  * Deinitializes gz-decompression object.
660  *
661  * @param cfs cfs to deinitialize
662  * @return 1 on success, -1 on error
663  */
664 static int
665 cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs)
666 {
667  inflateEnd (&cfs->strm);
668  return 1;
669 }
670 
671 
672 #endif
673 
674 
675 #if HAVE_LIBBZ2
676 /**
677  * Deinitializes bz2-decompression object.
678  *
679  * @param cfs cfs to deinitialize
680  * @return 1 on success, -1 on error
681  */
682 static int
683 cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs)
684 {
685  BZ2_bzDecompressEnd (&cfs->bstrm);
686  return 1;
687 }
688 
689 
690 #endif
691 
692 
/**
 * Releases the decompressor state that matches cfs->compression_type.
 *
 * @param cfs cfs to deinitialize
 * @return 1 on success, -1 on error (unsupported compression type)
 */
static int
cfs_deinit_decompressor (struct CompressedFileSource *cfs)
{
#if HAVE_ZLIB
  if (COMP_TYPE_ZLIB == cfs->compression_type)
    return cfs_deinit_decompressor_zlib (cfs);
#endif
#if HAVE_LIBBZ2
  if (COMP_TYPE_BZ2 == cfs->compression_type)
    return cfs_deinit_decompressor_bz2 (cfs);
#endif
  LOG ("invalid compression type selected\n");
  return -1;
}
717 
718 
/**
 * Restarts decompression from the beginning of the stream by tearing
 * the decompressor down and initializing it again.  Metadata is not
 * reported again (no callback is passed to the initialization).
 *
 * @param cfs cfs to reset
 * @return 1 on success, 0 to terminate extraction,
 *         -1 on error
 */
static int
cfs_reset_stream (struct CompressedFileSource *cfs)
{
  int rc;

  rc = cfs_deinit_decompressor (cfs);
  if (-1 == rc)
    return -1;
  rc = cfs_init_decompressor (cfs, NULL, NULL);
  return rc;
}
735 
736 
737 /**
738  * Destroy compressed file source.
739  *
740  * @param cfs source to destroy
741  */
742 static void
744 {
746  free (cfs);
747 }
748 
749 
750 /**
751  * Allocates and initializes new cfs object.
752  *
753  * @param bfds data source to use
754  * @param fsize size of the source
755  * @param compression_type type of compression used
756  * @param proc metadata callback to call with meta data found upon opening
757  * @param proc_cls callback cls
758  * @return newly allocated cfs on success, NULL on error
759  */
760 struct CompressedFileSource *
762  int64_t fsize,
764  EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
765 {
766  struct CompressedFileSource *cfs;
767 
768  if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource))))
769  {
770  LOG_STRERROR ("malloc");
771  return NULL;
772  }
773  memset (cfs, 0, sizeof (struct CompressedFileSource));
775  cfs->bfds = bfds;
776  cfs->fsize = fsize;
777  cfs->uncompressed_size = -1;
778  if (1 != cfs_init_decompressor (cfs,
779  proc, proc_cls))
780  {
781  free (cfs);
782  return NULL;
783  }
784  return cfs;
785 }
786 
787 
788 #if HAVE_ZLIB
789 /**
790  * Fills 'data' with new uncompressed data. Does the actual
791  * decompression. Will set uncompressed_size on the end of compressed
792  * stream.
793  *
794  * @param cfds cfs to read from
795  * @param data where to copy the data
796  * @param size number of bytes available in data
797  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
798  */
799 static ssize_t
800 cfs_read_zlib (struct CompressedFileSource *cfs,
801  void *data,
802  size_t size)
803 {
804  char *dst = data;
805  int ret;
806  size_t rc;
807  ssize_t in;
808  unsigned char buf[COM_CHUNK_SIZE];
809 
810  if (cfs->fpos == cfs->uncompressed_size)
811  {
812  /* end of file */
813  return 0;
814  }
815  rc = 0;
816  if (COM_CHUNK_SIZE > cfs->strm.avail_out + cfs->result_pos)
817  {
818  /* got left-over decompressed data from previous round! */
819  in = COM_CHUNK_SIZE - (cfs->strm.avail_out + cfs->result_pos);
820  if (in > size)
821  in = size;
822  memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
823  cfs->fpos += in;
824  cfs->result_pos += in;
825  rc += in;
826  }
827  ret = Z_OK;
828  while ( (rc < size) && (Z_STREAM_END != ret) )
829  {
830  /* read block from original data source */
831  in = bfds_read (cfs->bfds,
832  buf, sizeof (buf));
833  if (in < 0)
834  {
835  LOG ("unexpected EOF\n");
836  return -1; /* unexpected EOF */
837  }
838  if (0 == in)
839  {
840  cfs->uncompressed_size = cfs->fpos;
841  return rc;
842  }
843  cfs->strm.next_in = buf;
844  cfs->strm.avail_in = (uInt) in;
845  cfs->strm.next_out = (unsigned char *) cfs->result;
846  cfs->strm.avail_out = COM_CHUNK_SIZE;
847  cfs->result_pos = 0;
848  ret = inflate (&cfs->strm, Z_SYNC_FLUSH);
849  if ( (Z_OK != ret) && (Z_STREAM_END != ret) )
850  {
851  LOG ("unexpected gzip inflate error: %d\n", ret);
852  return -1; /* unexpected error */
853  }
854  /* go backwards by the number of bytes left in the buffer */
855  if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->strm.avail_in, SEEK_CUR))
856  {
857  LOG ("seek failed\n");
858  return -1;
859  }
860  /* copy decompressed bytes to target buffer */
861  in = COM_CHUNK_SIZE - cfs->strm.avail_out;
862  if (in > size - rc)
863  {
864  if (Z_STREAM_END == ret)
865  {
866  cfs->uncompressed_size = cfs->fpos + in;
867  ret = Z_OK;
868  }
869  in = size - rc;
870  }
871  memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
872  cfs->fpos += in;
873  cfs->result_pos += in;
874  rc += in;
875  }
876  if (Z_STREAM_END == ret)
877  {
878  cfs->uncompressed_size = cfs->fpos;
879  }
880  return rc;
881 }
882 
883 
884 #endif
885 
886 
887 #if HAVE_LIBBZ2
888 /**
889  * Fills 'data' with new uncompressed data. Does the actual
890  * decompression. Will set uncompressed_size on the end of compressed
891  * stream.
892  *
893  * @param cfds cfs to read from
894  * @param data where to copy the data
895  * @param size number of bytes available in data
896  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
897  */
898 static ssize_t
899 cfs_read_bz2 (struct CompressedFileSource *cfs,
900  void *data,
901  size_t size)
902 {
903  char *dst = data;
904  int ret;
905  size_t rc;
906  ssize_t in;
907  char buf[COM_CHUNK_SIZE];
908 
909  if (cfs->fpos == cfs->uncompressed_size)
910  {
911  /* end of file */
912  return 0;
913  }
914  rc = 0;
915  if (COM_CHUNK_SIZE > cfs->bstrm.avail_out + cfs->result_pos)
916  {
917  /* got left-over decompressed data from previous round! */
918  in = COM_CHUNK_SIZE - (cfs->bstrm.avail_out + cfs->result_pos);
919  if (in > size)
920  in = size;
921  memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
922  cfs->fpos += in;
923  cfs->result_pos += in;
924  rc += in;
925  }
926  ret = BZ_OK;
927  while ( (rc < size) && (BZ_STREAM_END != ret) )
928  {
929  /* read block from original data source */
930  in = bfds_read (cfs->bfds,
931  buf, sizeof (buf));
932  if (in < 0)
933  {
934  LOG ("unexpected EOF\n");
935  return -1; /* unexpected EOF */
936  }
937  if (0 == in)
938  {
939  cfs->uncompressed_size = cfs->fpos;
940  return rc;
941  }
942  cfs->bstrm.next_in = buf;
943  cfs->bstrm.avail_in = (unsigned int) in;
944  cfs->bstrm.next_out = cfs->result;
945  cfs->bstrm.avail_out = COM_CHUNK_SIZE;
946  cfs->result_pos = 0;
947  ret = BZ2_bzDecompress (&cfs->bstrm);
948  if ( (BZ_OK != ret) && (BZ_STREAM_END != ret) )
949  {
950  LOG ("unexpected bzip2 decompress error: %d\n", ret);
951  return -1; /* unexpected error */
952  }
953  /* go backwards by the number of bytes left in the buffer */
954  if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->bstrm.avail_in, SEEK_CUR))
955  {
956  LOG ("seek failed\n");
957  return -1;
958  }
959  /* copy decompressed bytes to target buffer */
960  in = COM_CHUNK_SIZE - cfs->bstrm.avail_out;
961  if (in > size - rc)
962  {
963  if (BZ_STREAM_END == ret)
964  {
965  cfs->uncompressed_size = cfs->fpos + in;
966  ret = BZ_OK;
967  }
968  in = size - rc;
969  }
970  memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
971  cfs->fpos += in;
972  cfs->result_pos += in;
973  rc += in;
974  }
975  if (BZ_STREAM_END == ret)
976  {
977  cfs->uncompressed_size = cfs->fpos;
978  }
979  return rc;
980 }
981 
982 
983 #endif
984 
985 
/**
 * Reads decompressed data from 'cfs' using the decompressor that
 * matches cfs->compression_type.
 *
 * @param cfs cfs to read from
 * @param data where to copy the data
 * @param size number of bytes available in data
 * @return number of bytes in data. 0 if no more data can be uncompressed,
 *         -1 on error (including an unsupported compression type)
 */
static ssize_t
cfs_read (struct CompressedFileSource *cfs,
          void *data,
          size_t size)
{
#if HAVE_ZLIB
  if (COMP_TYPE_ZLIB == cfs->compression_type)
    return cfs_read_zlib (cfs, data, size);
#endif
#if HAVE_LIBBZ2
  if (COMP_TYPE_BZ2 == cfs->compression_type)
    return cfs_read_bz2 (cfs, data, size);
#endif
  LOG ("invalid compression type selected\n");
  return -1;
}
1016 
1017 
1018 /**
1019  * Moves the buffer to 'position' in uncompressed steam. If position
1020  * requires seeking backwards beyond the boundaries of the buffer, resets the
1021  * stream and repeats decompression from the beginning to 'position'.
1022  *
1023  * @param cfs cfs to seek on
1024  * @param position new starting point for the buffer
1025  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
1026  * @return new absolute buffer position, -1 on error or EOS
1027  */
1028 static int64_t
1030  int64_t position,
1031  int whence)
1032 {
1033  uint64_t nposition;
1034  int64_t delta;
1035 
1036  switch (whence)
1037  {
1038  case SEEK_CUR:
1039  if (cfs->fpos + position < 0)
1040  {
1041  /* underflow */
1042  LOG ("Invalid seek operation\n");
1043  return -1;
1044  }
1045  if ( (-1 != cfs->uncompressed_size) &&
1046  (cfs->fpos + position > cfs->uncompressed_size) )
1047  {
1048  LOG ("Invalid seek operation\n");
1049  return -1;
1050  }
1051  nposition = cfs->fpos + position;
1052  break;
1053  case SEEK_END:
1054  ASSERT (-1 != cfs->uncompressed_size);
1055  if (position > 0)
1056  {
1057  LOG ("Invalid seek operation\n");
1058  return -1;
1059  }
1060  if (cfs->uncompressed_size < -position)
1061  {
1062  LOG ("Invalid seek operation\n");
1063  return -1;
1064  }
1065  nposition = cfs->uncompressed_size + position;
1066  break;
1067  case SEEK_SET:
1068  if (position < 0)
1069  {
1070  LOG ("Invalid seek operation\n");
1071  return -1;
1072  }
1073  if ( (-1 != cfs->uncompressed_size) &&
1074  (cfs->uncompressed_size < position) )
1075  {
1076  LOG ("Invalid seek operation\n");
1077  return -1;
1078  }
1079  nposition = (uint64_t) position;
1080  break;
1081  default:
1082  LOG ("Invalid seek operation\n");
1083  return -1;
1084  }
1085  delta = nposition - cfs->fpos;
1086  if (delta < 0)
1087  {
1088  if (cfs->result_pos >= -delta)
1089  {
1090  cfs->result_pos += delta;
1091  cfs->fpos += delta;
1092  delta = 0;
1093  }
1094  else
1095  {
1096  if (-1 == cfs_reset_stream (cfs))
1097  {
1098  LOG ("Failed to restart compressed stream for seek operation\n");
1099  return -1;
1100  }
1101  delta = nposition;
1102  }
1103  }
1104  while (delta > 0)
1105  {
1106  char buf[COM_CHUNK_SIZE];
1107  size_t max;
1108  int64_t ret;
1109 
1110  max = (sizeof (buf) > delta) ? delta : sizeof (buf);
1111  ret = cfs_read (cfs, buf, max);
1112  if (-1 == ret)
1113  {
1114  LOG ("Failed to read decompressed stream for seek operation\n");
1115  return -1;
1116  }
1117  if (0 == ret)
1118  {
1119  LOG (
1120  "Reached unexpected end of stream at %llu during seek operation to %llu (%d left)\n",
1121  (unsigned long long) cfs->fpos,
1122  (unsigned long long) nposition,
1123  delta);
1124  return -1;
1125  }
1126  ASSERT (ret <= delta);
1127  delta -= ret;
1128  }
1129  return cfs->fpos;
1130 }
1131 
1132 
1133 /**
1134  * Detect if we have compressed data on our hands.
1135  *
1136  * @param data pointer to a data buffer or NULL (in case fd is not -1)
1137  * @param fd a file to read data from, or -1 (if data is not NULL)
1138  * @param fsize size of data (if data is not NULL) or of file (if fd is not -1)
1139  * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression
1140  */
1141 static enum ExtractorCompressionType
1143 {
1144  unsigned char read_data[3];
1145 
1146  if (0 != bfds_seek (bfds, 0, SEEK_SET))
1147  return COMP_TYPE_INVALID;
1148  if (sizeof (read_data) !=
1149  bfds_read (bfds, read_data, sizeof (read_data)))
1150  return COMP_TYPE_UNDEFINED;
1151 
1152 #if HAVE_ZLIB
1153  if ( (bfds->fsize >= MIN_ZLIB_HEADER) &&
1154  (read_data[0] == 0x1f) &&
1155  (read_data[1] == 0x8b) &&
1156  (read_data[2] == 0x08) )
1157  return COMP_TYPE_ZLIB;
1158 #endif
1159 #if HAVE_LIBBZ2
1160  if ( (bfds->fsize >= MIN_BZ2_HEADER) &&
1161  (read_data[0] == 'B') &&
1162  (read_data[1] == 'Z') &&
1163  (read_data[2] == 'h'))
1164  return COMP_TYPE_BZ2;
1165 #endif
1166  return COMP_TYPE_INVALID;
1167 }
1168 
1169 
1170 /**
1171  * Handle to a datasource we can use for the plugins.
1172  */
1174 {
1175 
1176  /**
1177  * Underlying buffered data source.
1178  */
1180 
1181  /**
1182  * Compressed file source (NULL if not applicable).
1183  */
1185 
1186  /**
1187  * Underlying file descriptor, -1 for none.
1188  */
1189  int fd;
1190 };
1191 
1192 
1193 /**
1194  * Create a datasource from a file on disk.
1195  *
1196  * @param filename name of the file on disk
1197  * @param proc metadata callback to call with meta data found upon opening
1198  * @param proc_cls callback cls
1199  * @return handle to the datasource, NULL on error
1200  */
1201 struct EXTRACTOR_Datasource *
1204  void *proc_cls)
1205 {
1206  struct BufferedFileDataSource *bfds;
1207  struct EXTRACTOR_Datasource *ds;
1208  enum ExtractorCompressionType ct;
1209  int fd;
1210  struct stat sb;
1211  int64_t fsize;
1212  int winmode = 0;
1213 #if WINDOWS
1214  winmode = O_BINARY;
1215 #endif
1216 
1217  if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE | winmode)))
1218  {
1219  LOG_STRERROR_FILE ("open", filename);
1220  return NULL;
1221  }
1222  if ( (0 != fstat (fd, &sb)) ||
1223  (S_ISDIR (sb.st_mode)) )
1224  {
1225  if (! S_ISDIR (sb.st_mode))
1226  LOG_STRERROR_FILE ("fstat", filename);
1227  else
1228  LOG ("Skipping directory `%s'\n", filename);
1229  (void) close (fd);
1230  return NULL;
1231  }
1232  fsize = (int64_t) sb.st_size;
1233  if (0 == fsize)
1234  {
1235  (void) close (fd);
1236  return NULL;
1237  }
1238  bfds = bfds_new (NULL, fd, fsize);
1239  if (NULL == bfds)
1240  {
1241  (void) close (fd);
1242  return NULL;
1243  }
1244  if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1245  {
1246  LOG_STRERROR ("malloc");
1247  bfds_delete (bfds);
1248  (void) close (fd);
1249  return NULL;
1250  }
1251  ds->bfds = bfds;
1252  ds->fd = fd;
1253  ds->cfs = NULL;
1254  ct = get_compression_type (bfds);
1255  if ( (COMP_TYPE_ZLIB == ct) ||
1256  (COMP_TYPE_BZ2 == ct) )
1257  {
1258  ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
1259  if (NULL == ds->cfs)
1260  {
1261  LOG ("Failed to initialize decompressor\n");
1262  bfds_delete (bfds);
1263  free (ds);
1264  (void) close (fd);
1265  return NULL;
1266  }
1267  }
1268  return ds;
1269 }
1270 
1271 
1272 /**
1273  * Create a datasource from a buffer in memory.
1274  *
1275  * @param buf data in memory
1276  * @param size number of bytes in 'buf'
1277  * @param proc metadata callback to call with meta data found upon opening
1278  * @param proc_cls callback cls
1279  * @return handle to the datasource
1280  */
1281 struct EXTRACTOR_Datasource *
1283  size_t size,
1285  void *proc_cls)
1286 {
1287  struct BufferedFileDataSource *bfds;
1288  struct EXTRACTOR_Datasource *ds;
1289  enum ExtractorCompressionType ct;
1290 
1291  if (0 == size)
1292  return NULL;
1293  if (NULL == (bfds = bfds_new (buf, -1, size)))
1294  {
1295  LOG ("Failed to initialize buffer data source\n");
1296  return NULL;
1297  }
1298  if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1299  {
1300  LOG_STRERROR ("malloc");
1301  bfds_delete (bfds);
1302  return NULL;
1303  }
1304  ds->bfds = bfds;
1305  ds->fd = -1;
1306  ds->cfs = NULL;
1307  ct = get_compression_type (bfds);
1308  if ( (COMP_TYPE_ZLIB == ct) ||
1309  (COMP_TYPE_BZ2 == ct) )
1310  {
1311  ds->cfs = cfs_new (bfds, size, ct, proc, proc_cls);
1312  if (NULL == ds->cfs)
1313  {
1314  LOG ("Failed to initialize decompressor\n");
1315  bfds_delete (bfds);
1316  free (ds);
1317  return NULL;
1318  }
1319  }
1320  return ds;
1321 }
1322 
1323 
1324 /**
1325  * Destroy a data source.
1326  *
1327  * @param ds source to destroy
1328  */
1329 void
1331 {
1332  if (NULL != ds->cfs)
1333  cfs_destroy (ds->cfs);
1334  bfds_delete (ds->bfds);
1335  if (-1 != ds->fd)
1336  (void) close (ds->fd);
1337  free (ds);
1338 }
1339 
1340 
1341 /**
1342  * Make 'size' bytes of data from the data source available at 'data'.
1343  *
1344  * @param cls must be a 'struct EXTRACTOR_Datasource'
1345  * @param data where the data should be copied to
1346  * @param size maximum number of bytes requested
1347  * @return number of bytes now available in data (can be smaller than 'size'),
1348  * -1 on error
1349  */
1350 ssize_t
1352  void *data,
1353  size_t size)
1354 {
1355  struct EXTRACTOR_Datasource *ds = cls;
1356 
1357  if (NULL != ds->cfs)
1358  return cfs_read (ds->cfs, data, size);
1359  return bfds_read (ds->bfds, data, size);
1360 }
1361 
1362 
1363 /**
1364  * Seek in the datasource. Use 'SEEK_CUR' for whence and 'pos' of 0 to
1365  * obtain the current position in the file.
1366  *
1367  * @param cls must be a 'struct EXTRACTOR_Datasource'
1368  * @param pos position to seek (see 'man lseek')
1369  * @param whence how to see (absolute to start, relative, absolute to end)
1370  * @return new absolute position, UINT64_MAX on error (i.e. desired position
1371  * does not exist)
1372  */
1373 int64_t
1375  int64_t pos,
1376  int whence)
1377 {
1378  struct EXTRACTOR_Datasource *ds = cls;
1379 
1380  if (NULL != ds->cfs)
1381  {
1382  if ( (SEEK_END == whence) &&
1383  (-1 == ds->cfs->uncompressed_size) )
1384  {
1385  /* need to obtain uncompressed size */
1386  (void) EXTRACTOR_datasource_get_size_ (ds, 1);
1387  if (-1 == ds->cfs->uncompressed_size)
1388  return -1;
1389  }
1390  return cfs_seek (ds->cfs, pos, whence);
1391  }
1392  return bfds_seek (ds->bfds, pos, whence);
1393 }
1394 
1395 
1396 /**
1397  * Determine the overall size of the data source (after compression).
1398  *
1399  * @param cls must be a 'struct EXTRACTOR_Datasource'
1400  * @param force force computing the size if it is unavailable
1401  * @return overall file size, UINT64_MAX on error or unknown
1402  */
1403 int64_t
1405  int force)
1406 {
1407  struct EXTRACTOR_Datasource *ds = cls;
1408  char buf[32 * 1024];
1409  uint64_t pos;
1410 
1411  if (NULL != ds->cfs)
1412  {
1413  if ( (force) &&
1414  (-1 == ds->cfs->uncompressed_size) )
1415  {
1416  pos = ds->cfs->fpos;
1417  while ( (-1 == ds->cfs->uncompressed_size) &&
1418  (-1 != cfs_read (ds->cfs, buf, sizeof (buf))) )
1419  ;
1420  if (-1 == cfs_seek (ds->cfs, pos, SEEK_SET))
1421  {
1422  LOG (
1423  "Serious problem, I moved the buffer to determine the file size but could not restore it...\n");
1424  return -1;
1425  }
1426  if (-1 == ds->cfs->uncompressed_size)
1427  return -1;
1428  }
1429  return ds->cfs->uncompressed_size;
1430  }
1431  return ds->bfds->fsize;
1432 }
1433 
1434 
1435 /* end of extractor_datasource.c */
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_C_STRING
Definition: extractor.h:113
#define MAX_READ
struct EXTRACTOR_Datasource * EXTRACTOR_datasource_create_from_buffer_(const char *buf, size_t size, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
#define O_LARGEFILE
static int cfs_deinit_decompressor(struct CompressedFileSource *cfs)
static void cfs_destroy(struct CompressedFileSource *cfs)
static int64_t cfs_seek(struct CompressedFileSource *cfs, int64_t position, int whence)
void EXTRACTOR_datasource_destroy_(struct EXTRACTOR_Datasource *ds)
ssize_t EXTRACTOR_datasource_read_(void *cls, void *data, size_t size)
int64_t EXTRACTOR_datasource_get_size_(void *cls, int force)
int64_t EXTRACTOR_datasource_seek_(void *cls, int64_t pos, int whence)
static ssize_t bfds_read(struct BufferedFileDataSource *bfds, void *buf_ptr, size_t count)
#define COM_CHUNK_SIZE
struct CompressedFileSource * cfs_new(struct BufferedFileDataSource *bfds, int64_t fsize, enum ExtractorCompressionType compression_type, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
static struct BufferedFileDataSource * bfds_new(const void *data, int fd, int64_t fsize)
static int cfs_init_decompressor(struct CompressedFileSource *cfs, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
static int cfs_reset_stream(struct CompressedFileSource *cfs)
static enum ExtractorCompressionType get_compression_type(struct BufferedFileDataSource *bfds)
static void bfds_delete(struct BufferedFileDataSource *bfds)
static ssize_t cfs_read(struct CompressedFileSource *cfs, void *data, size_t size)
ExtractorCompressionType
@ COMP_TYPE_INVALID
@ COMP_TYPE_ZLIB
@ COMP_TYPE_UNDEFINED
@ COMP_TYPE_BZ2
static int bfds_pick_next_buffer_at(struct BufferedFileDataSource *bfds, uint64_t pos)
struct EXTRACTOR_Datasource * EXTRACTOR_datasource_create_from_file_(const char *filename, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
static int64_t bfds_seek(struct BufferedFileDataSource *bfds, int64_t pos, int whence)
random access and possibly decompression of data from buffer in memory or file on disk
logging API for GNU libextractor
#define ASSERT(cond)
#define LOG(...)
#define LOG_STRERROR(syscall)
#define LOG_STRERROR_FILE(syscall, filename)
#define NULL
Definition: getopt1.c:60
@ EXTRACTOR_METATYPE_COMMENT
Definition: extractor.h:131
@ EXTRACTOR_METATYPE_FILENAME
Definition: extractor.h:130
platform specifics
enum ExtractorCompressionType compression_type
struct BufferedFileDataSource * bfds
struct BufferedFileDataSource * bfds
struct CompressedFileSource * cfs