"Fossies" - the Fresh Open Source Software Archive 
Member "libextractor-1.11/src/main/extractor_datasource.c" (30 Jan 2021, 34206 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can
view or
download the uninterpreted source code file here.
For more information about "extractor_datasource.c" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
1.10_vs_1.11.
1 /*
2 This file is part of libextractor.
3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20 /**
21 * @file main/extractor_datasource.c
22 * @brief random access and possibly decompression of data from buffer in memory or file on disk
23 * @author Christian Grothoff
24 */
25 #include "platform.h"
26 #include "extractor_logging.h"
27 #include "extractor_datasource.h"
28
#if HAVE_LIBBZ2
#include <bzlib.h>
/**
 * Smallest source size for which bz2 detection is attempted.
 */
#define MIN_BZ2_HEADER 4
/* Each branch must fall back to its OWN header size; naming the other
   library's macro breaks builds where only one library is enabled. */
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
#endif
#endif

#if HAVE_ZLIB
#include <zlib.h>
/**
 * Smallest source size for which gzip detection is attempted.
 */
#define MIN_ZLIB_HEADER 12
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
#endif
#endif

/* No decompression library available at all. */
#ifndef MIN_COMPRESSED_HEADER
#define MIN_COMPRESSED_HEADER -1
#endif

/* Platforms without O_LARGEFILE get a harmless no-op flag. */
#ifndef O_LARGEFILE
#define O_LARGEFILE 0
#endif

/**
 * Maximum size of an IO buffer.
 */
#define MAX_READ (4 * 1024 * 1024)

/**
 * Data is read from the source and shoved into decompressor
 * in chunks this big.
 */
#define COM_CHUNK_SIZE (16 * 1024)
63
64
/**
 * Compression schemes the data source layer can tell apart.
 */
enum ExtractorCompressionType
{
  /**
   * Not enough header bytes were available to decide.
   */
  COMP_TYPE_UNDEFINED = -1,

  /**
   * Header matches no known scheme (most likely plain data).
   */
  COMP_TYPE_INVALID = 0,

  /**
   * gzip stream, handled via libz.
   */
  COMP_TYPE_ZLIB = 1,

  /**
   * bzip2 stream.
   */
  COMP_TYPE_BZ2 = 2
};
90
91
/**
 * Raw (still possibly compressed) input as seen by the decompressor:
 * either a file accessed through a sliding buffer or a caller-owned
 * memory region.
 */
struct BufferedFileDataSource
{
  /**
   * Bytes are always copied out of this pointer.  It is either the
   * caller-supplied memory region or the I/O buffer below.
   */
  const void *data;

  /**
   * I/O buffer that file contents are read into; NULL when the source
   * is a memory region.
   */
  void *buffer;

  /**
   * Total number of bytes in the file (or memory region).
   */
  uint64_t fsize;

  /**
   * File offset that corresponds to the first byte of the buffer.
   */
  uint64_t fpos;

  /**
   * Read cursor relative to the buffer start; the absolute offset is
   * 'fpos + buffer_pos'.
   */
  size_t buffer_pos;

  /**
   * How many bytes of the buffer currently hold valid data
   * (never more than 'buffer_size').
   */
  size_t buffer_bytes;

  /**
   * Capacity of the buffer in bytes.
   */
  size_t buffer_size;

  /**
   * File descriptor backing the source, or -1 if there is none.
   */
  int fd;

};
141
142
143 /**
144 * An object from which uncompressed data can be read
145 */
146 struct CompressedFileSource
147 {
148 /**
149 * The source of data
150 */
151 struct BufferedFileDataSource *bfds;
152
153 /**
154 * Decompression target buffer.
155 */
156 char result[COM_CHUNK_SIZE];
157
158 /**
159 * At which offset in 'result' is 'fpos'?
160 */
161 size_t result_pos;
162
163 /**
164 * Size of the source (same as bfds->fsize)
165 */
166 int64_t fsize;
167
168 /**
169 * Position within the (decompressed) source
170 */
171 int64_t fpos;
172
173 /**
174 * Total size of the uncompressed data. Remains -1 until
175 * decompression is finished.
176 */
177 int64_t uncompressed_size;
178
179 #if HAVE_LIBBZ2
180 /**
181 * BZ2 stream object
182 */
183 bz_stream bstrm;
184 #endif
185
186 #if HAVE_ZLIB
187 /**
188 * ZLIB stream object
189 */
190 z_stream strm;
191
192 /**
193 * Length of gzip header (may be 0, in that case ZLIB parses the header)
194 */
195 int gzip_header_length;
196 #endif
197
198 /**
199 * The type of compression used in the source
200 */
201 enum ExtractorCompressionType compression_type;
202
203 };
204
205
206 /**
207 * Makes bfds seek to 'pos' and read a chunk of bytes there.
208 * Changes bfds->fpos, bfds->buffer_bytes and bfds->buffer_pos.
209 * Does almost nothing for memory-backed bfds.
210 *
211 * @param bfds bfds
212 * @param pos position
213 * @return 0 on success, -1 on error
214 */
215 static int
216 bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds,
217 uint64_t pos)
218 {
219 int64_t position;
220 ssize_t rd;
221
222 if (pos > bfds->fsize)
223 {
224 LOG ("Invalid seek operation\n");
225 return -1; /* invalid */
226 }
227 if (NULL == bfds->buffer)
228 {
229 bfds->buffer_pos = pos;
230 return 0;
231 }
232 position = (int64_t) lseek (bfds->fd, pos, SEEK_SET);
233 if (position < 0)
234 {
235 LOG_STRERROR ("lseek");
236 return -1;
237 }
238 bfds->fpos = position;
239 bfds->buffer_pos = 0;
240 rd = read (bfds->fd, bfds->buffer, bfds->buffer_size);
241 if (rd < 0)
242 {
243 LOG_STRERROR ("read");
244 return -1;
245 }
246 bfds->buffer_bytes = rd;
247 return 0;
248 }
249
250
251 /**
252 * Creates a bfds
253 *
254 * @param data data buffer to use as a source (NULL if fd != -1)
255 * @param fd file descriptor to use as a source (-1 if data != NULL)
256 * @param fsize size of the file (or the buffer)
257 * @return newly allocated bfds
258 */
259 static struct BufferedFileDataSource *
260 bfds_new (const void *data,
261 int fd,
262 int64_t fsize)
263 {
264 struct BufferedFileDataSource *result;
265 size_t xtra;
266
267 if (fsize > MAX_READ)
268 xtra = MAX_READ;
269 else
270 xtra = (size_t) fsize;
271 if ( (-1 == fd) && (NULL == data) )
272 {
273 LOG ("Invalid arguments\n");
274 return NULL;
275 }
276 if ( (-1 != fd) && (NULL != data) )
277 fd = -1; /* don't need fd */
278 if (NULL != data)
279 xtra = 0;
280 if (NULL == (result = malloc (sizeof (struct BufferedFileDataSource) + xtra)))
281 {
282 LOG_STRERROR ("malloc");
283 return NULL;
284 }
285 memset (result, 0, sizeof (struct BufferedFileDataSource));
286 result->data = (NULL != data) ? data : &result[1];
287 result->buffer = (NULL != data) ? NULL : &result[1];
288 result->buffer_size = (NULL != data) ? fsize : xtra;
289 result->buffer_bytes = (NULL != data) ? fsize : 0;
290 result->fsize = fsize;
291 result->fd = fd;
292 bfds_pick_next_buffer_at (result, 0);
293 return result;
294 }
295
296
/**
 * Releases a bfds.  The I/O buffer (if any) lives in the same
 * allocation as the struct, so a single free suffices.  The file
 * descriptor is not closed here.
 *
 * @param bfds data source to release
 */
static void
bfds_delete (struct BufferedFileDataSource *bfds)
{
  free (bfds);
}
307
308
309 /**
310 * Makes bfds seek to 'pos' in 'whence' mode.
311 * Will try to seek within the buffer, will move the buffer location if
312 * the seek request falls outside of the buffer range.
313 *
314 * @param bfds bfds
315 * @param pos position to seek to
316 * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
317 * @return new absolute position, -1 on error
318 */
319 static int64_t
320 bfds_seek (struct BufferedFileDataSource *bfds,
321 int64_t pos, int whence)
322 {
323 uint64_t npos;
324 size_t nbpos;
325
326 switch (whence)
327 {
328 case SEEK_CUR:
329 npos = bfds->fpos + bfds->buffer_pos + pos;
330 if (npos > bfds->fsize)
331 {
332 LOG ("Invalid seek operation to %lld from %llu (max is %llu)\n",
333 (long long) pos,
334 bfds->fpos + bfds->buffer_pos,
335 (unsigned long long) bfds->fsize);
336 return -1;
337 }
338 nbpos = bfds->buffer_pos + pos;
339 if ( (NULL == bfds->buffer) ||
340 (nbpos < bfds->buffer_bytes) )
341 {
342 bfds->buffer_pos = nbpos;
343 return npos;
344 }
345 if (0 != bfds_pick_next_buffer_at (bfds,
346 npos))
347 {
348 LOG ("seek operation failed\n");
349 return -1;
350 }
351 return npos;
352 case SEEK_END:
353 if (pos > 0)
354 {
355 LOG ("Invalid seek operation\n");
356 return -1;
357 }
358 if (bfds->fsize < -pos)
359 {
360 LOG ("Invalid seek operation\n");
361 return -1;
362 }
363 pos = bfds->fsize + pos;
364 /* fall-through! */
365 case SEEK_SET:
366 if (pos < 0)
367 {
368 LOG ("Invalid seek operation\n");
369 return -1;
370 }
371 if (pos > bfds->fsize)
372 {
373 LOG ("Invalid seek operation (%lld > %llu) %d\n",
374 (long long) pos,
375 (unsigned long long) bfds->fsize,
376 SEEK_SET == whence);
377 return -1;
378 }
379 if ( (NULL == bfds->buffer) ||
380 ( (bfds->fpos <= pos) &&
381 (bfds->fpos + bfds->buffer_bytes > pos) ) )
382 {
383 bfds->buffer_pos = pos - bfds->fpos;
384 return pos;
385 }
386 if (0 != bfds_pick_next_buffer_at (bfds, pos))
387 {
388 LOG ("seek operation failed\n");
389 return -1;
390 }
391 ASSERT (pos == bfds->fpos + bfds->buffer_pos);
392 return pos;
393 }
394 return -1;
395 }
396
397
398 /**
399 * Fills 'buf_ptr' with a chunk of data. Will
400 * fail if 'count' exceeds buffer size.
401 *
402 * @param bfds bfds
403 * @param buf_ptr location to store data
404 * @param count number of bytes to read
405 * @return number of bytes (<= count) available at location pointed by buf_ptr,
406 * 0 for end of stream, -1 on error
407 */
408 static ssize_t
409 bfds_read (struct BufferedFileDataSource *bfds,
410 void *buf_ptr,
411 size_t count)
412 {
413 char *cbuf = buf_ptr;
414 uint64_t old_off;
415 size_t avail;
416 size_t ret;
417
418 old_off = bfds->fpos + bfds->buffer_pos;
419 if (old_off == bfds->fsize)
420 return 0; /* end of stream */
421 ret = 0;
422 while (count > 0)
423 {
424 if ( (bfds->buffer_bytes == bfds->buffer_pos) &&
425 (0 != bfds_pick_next_buffer_at (bfds,
426 bfds->fpos + bfds->buffer_bytes)) )
427 {
428 /* revert to original position, invalidate buffer */
429 bfds->fpos = old_off;
430 bfds->buffer_bytes = 0;
431 bfds->buffer_pos = 0;
432 LOG ("read operation failed\n");
433 return -1; /* getting more failed */
434 }
435 avail = bfds->buffer_bytes - bfds->buffer_pos;
436 if (avail > count)
437 avail = count;
438 if (0 == avail)
439 break;
440 memcpy (&cbuf[ret], bfds->data + bfds->buffer_pos, avail);
441 bfds->buffer_pos += avail;
442 count -= avail;
443 ret += avail;
444 }
445 return ret;
446 }
447
448
449 #if HAVE_ZLIB
450 /**
451 * Initializes gz-decompression object. Might report metadata about
452 * compresse stream, if available. Resets the stream to the beginning.
453 *
454 * @param cfs cfs to initialize
455 * @param proc callback for metadata
456 * @param proc_cls callback cls
457 * @return 1 on success, 0 to terminate extraction, -1 on error
458 */
459 static int
460 cfs_init_decompressor_zlib (struct CompressedFileSource *cfs,
461 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
462 {
463 unsigned int gzip_header_length = 10;
464 unsigned char hdata[12];
465 ssize_t rsize;
466
467 if (0 != bfds_seek (cfs->bfds, 0, SEEK_SET))
468 {
469 LOG ("Failed to seek to offset 0!\n");
470 return -1;
471 }
472 /* Process gzip header */
473 rsize = bfds_read (cfs->bfds, hdata, sizeof (hdata));
474 if ( (-1 == rsize) ||
475 (sizeof (hdata) > (size_t) rsize) )
476 return -1;
477 if (0 != (hdata[3] & 0x4)) /* FEXTRA set */
478 gzip_header_length += 2 + (hdata[10] & 0xff) + ((hdata[11] & 0xff) * 256);
479
480 if (0 != (hdata[3] & 0x8))
481 {
482 /* FNAME set */
483 char fname[1024];
484 char *cptr;
485 size_t len;
486 ssize_t buf_bytes;
487
488 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
489 SEEK_SET))
490 {
491 LOG ("Corrupt gzip, failed to seek to end of header\n");
492 return -1;
493 }
494 buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname));
495 if (buf_bytes <= 0)
496 {
497 LOG ("Corrupt gzip, failed to read filename\n");
498 return -1;
499 }
500 if (NULL == (cptr = memchr (fname, 0, buf_bytes)))
501 {
502 LOG ("Corrupt gzip, failed to read filename terminator\n");
503 return -1;
504 }
505 len = cptr - fname;
506 if ( (NULL != proc) &&
507 (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME,
508 EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
509 fname,
510 len)) )
511 return 0; /* done */
512 gzip_header_length += len + 1;
513 }
514
515 if (0 != (hdata[3] & 0x16))
516 {
517 /* FCOMMENT set */
518 char fcomment[1024];
519 char *cptr;
520 ssize_t buf_bytes;
521 size_t len;
522
523 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
524 SEEK_SET))
525 {
526 LOG ("Corrupt gzip, failed to seek to end of header\n");
527 return -1;
528 }
529 buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment));
530 if (buf_bytes <= 0)
531 {
532 LOG ("Corrupt gzip, failed to read comment\n");
533 return -1;
534 }
535 if (NULL == (cptr = memchr (fcomment, 0, buf_bytes)))
536 {
537 LOG ("Corrupt gzip, failed to read comment terminator\n");
538 return -1;
539 }
540 len = cptr - fcomment;
541 if ( (NULL != proc) &&
542 (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
543 EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
544 (const char *) fcomment,
545 len)) )
546 return 0; /* done */
547 gzip_header_length += len + 1;
548 }
549 if (0 != (hdata[3] & 0x2)) /* FCHRC set */
550 gzip_header_length += 2;
551 memset (&cfs->strm, 0, sizeof (z_stream));
552
553 #ifdef ZLIB_VERNUM
554 /* zlib will take care of its header */
555 gzip_header_length = 0;
556 #endif
557 cfs->gzip_header_length = gzip_header_length;
558
559 if (cfs->gzip_header_length !=
560 bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET))
561 {
562 LOG ("Failed to seek to start to initialize gzip decompressor\n");
563 return -1;
564 }
565 cfs->strm.avail_out = COM_CHUNK_SIZE;
566 /*
567 * note: maybe plain inflateInit(&strm) is adequate,
568 * it looks more backward-compatible also ;
569 *
570 * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
571 * there might be a better check.
572 */if (Z_OK != inflateInit2 (&cfs->strm,
573 #ifdef ZLIB_VERNUM
574 15 + 32
575 #else
576 -MAX_WBITS
577 #endif
578 ))
579 {
580 LOG ("Failed to initialize zlib decompression\n");
581 return -1;
582 }
583 return 1;
584 }
585
586
587 #endif
588
589
590 #if HAVE_LIBBZ2
591 /**
592 * Initializes bz2-decompression object. Might report metadata about
593 * compresse stream, if available. Resets the stream to the beginning.
594 *
595 * @param cfs cfs to initialize
596 * @param proc callback for metadata
597 * @param proc_cls callback cls
598 * @return 1 on success, -1 on error
599 */
600 static int
601 cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs,
602 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
603 {
604 if (0 !=
605 bfds_seek (cfs->bfds, 0, SEEK_SET))
606 {
607 LOG ("Failed to seek to start to initialize BZ2 decompressor\n");
608 return -1;
609 }
610 memset (&cfs->bstrm, 0, sizeof (bz_stream));
611 if (BZ_OK !=
612 BZ2_bzDecompressInit (&cfs->bstrm, 0, 0))
613 {
614 LOG ("Failed to initialize BZ2 decompressor\n");
615 return -1;
616 }
617 cfs->bstrm.avail_out = COM_CHUNK_SIZE;
618 return 1;
619 }
620
621
622 #endif
623
624
625 /**
626 * Initializes decompression object. Might report metadata about
627 * compresse stream, if available. Resets the stream to the beginning.
628 *
629 * @param cfs cfs to initialize
630 * @param proc callback for metadata
631 * @param proc_cls callback cls
632 * @return 1 on success, 0 to terminate extraction, -1 on error
633 */
634 static int
635 cfs_init_decompressor (struct CompressedFileSource *cfs,
636 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
637 {
638 cfs->result_pos = 0;
639 cfs->fpos = 0;
640 switch (cfs->compression_type)
641 {
642 #if HAVE_ZLIB
643 case COMP_TYPE_ZLIB:
644 return cfs_init_decompressor_zlib (cfs, proc, proc_cls);
645 #endif
646 #if HAVE_LIBBZ2
647 case COMP_TYPE_BZ2:
648 return cfs_init_decompressor_bz2 (cfs, proc, proc_cls);
649 #endif
650 default:
651 LOG ("invalid compression type selected\n");
652 return -1;
653 }
654 }
655
656
657 #if HAVE_ZLIB
658 /**
659 * Deinitializes gz-decompression object.
660 *
661 * @param cfs cfs to deinitialize
662 * @return 1 on success, -1 on error
663 */
664 static int
665 cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs)
666 {
667 inflateEnd (&cfs->strm);
668 return 1;
669 }
670
671
672 #endif
673
674
675 #if HAVE_LIBBZ2
676 /**
677 * Deinitializes bz2-decompression object.
678 *
679 * @param cfs cfs to deinitialize
680 * @return 1 on success, -1 on error
681 */
682 static int
683 cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs)
684 {
685 BZ2_bzDecompressEnd (&cfs->bstrm);
686 return 1;
687 }
688
689
690 #endif
691
692
/**
 * Shuts down whichever decompressor cfs->compression_type selects.
 *
 * @param cfs cfs to deinitialize
 * @return 1 on success, -1 if the compression type is not supported
 */
static int
cfs_deinit_decompressor (struct CompressedFileSource *cfs)
{
#if HAVE_ZLIB
  if (COMP_TYPE_ZLIB == cfs->compression_type)
    return cfs_deinit_decompressor_zlib (cfs);
#endif
#if HAVE_LIBBZ2
  if (COMP_TYPE_BZ2 == cfs->compression_type)
    return cfs_deinit_decompressor_bz2 (cfs);
#endif
  /* neither of the compiled-in decompressors applies */
  LOG ("invalid compression type selected\n");
  return -1;
}
717
718
/**
 * Restarts decompression from the very beginning by tearing the
 * decompressor down and setting it up again (without a metadata
 * callback).  Needed when seeking backwards.
 *
 * @param cfs cfs to reset
 * @return 1 on success, 0 to terminate extraction,
 *         -1 on error
 */
static int
cfs_reset_stream (struct CompressedFileSource *cfs)
{
  int rc;

  rc = cfs_deinit_decompressor (cfs);
  if (-1 != rc)
    return cfs_init_decompressor (cfs, NULL, NULL);
  return -1;
}
735
736
/**
 * Shuts down the decompressor and frees the compressed file source.
 * The underlying bfds is left untouched.
 *
 * @param cfs source to destroy
 */
static void
cfs_destroy (struct CompressedFileSource *cfs)
{
  /* a failing deinit does not stop us from releasing the memory */
  (void) cfs_deinit_decompressor (cfs);
  free (cfs);
}
748
749
750 /**
751 * Allocates and initializes new cfs object.
752 *
753 * @param bfds data source to use
754 * @param fsize size of the source
755 * @param compression_type type of compression used
756 * @param proc metadata callback to call with meta data found upon opening
757 * @param proc_cls callback cls
758 * @return newly allocated cfs on success, NULL on error
759 */
760 struct CompressedFileSource *
761 cfs_new (struct BufferedFileDataSource *bfds,
762 int64_t fsize,
763 enum ExtractorCompressionType compression_type,
764 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
765 {
766 struct CompressedFileSource *cfs;
767
768 if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource))))
769 {
770 LOG_STRERROR ("malloc");
771 return NULL;
772 }
773 memset (cfs, 0, sizeof (struct CompressedFileSource));
774 cfs->compression_type = compression_type;
775 cfs->bfds = bfds;
776 cfs->fsize = fsize;
777 cfs->uncompressed_size = -1;
778 if (1 != cfs_init_decompressor (cfs,
779 proc, proc_cls))
780 {
781 free (cfs);
782 return NULL;
783 }
784 return cfs;
785 }
786
787
788 #if HAVE_ZLIB
789 /**
790 * Fills 'data' with new uncompressed data. Does the actual
791 * decompression. Will set uncompressed_size on the end of compressed
792 * stream.
793 *
794 * @param cfds cfs to read from
795 * @param data where to copy the data
796 * @param size number of bytes available in data
797 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
798 */
799 static ssize_t
800 cfs_read_zlib (struct CompressedFileSource *cfs,
801 void *data,
802 size_t size)
803 {
804 char *dst = data;
805 int ret;
806 size_t rc;
807 ssize_t in;
808 unsigned char buf[COM_CHUNK_SIZE];
809
810 if (cfs->fpos == cfs->uncompressed_size)
811 {
812 /* end of file */
813 return 0;
814 }
815 rc = 0;
816 if (COM_CHUNK_SIZE > cfs->strm.avail_out + cfs->result_pos)
817 {
818 /* got left-over decompressed data from previous round! */
819 in = COM_CHUNK_SIZE - (cfs->strm.avail_out + cfs->result_pos);
820 if (in > size)
821 in = size;
822 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
823 cfs->fpos += in;
824 cfs->result_pos += in;
825 rc += in;
826 }
827 ret = Z_OK;
828 while ( (rc < size) && (Z_STREAM_END != ret) )
829 {
830 /* read block from original data source */
831 in = bfds_read (cfs->bfds,
832 buf, sizeof (buf));
833 if (in < 0)
834 {
835 LOG ("unexpected EOF\n");
836 return -1; /* unexpected EOF */
837 }
838 if (0 == in)
839 {
840 cfs->uncompressed_size = cfs->fpos;
841 return rc;
842 }
843 cfs->strm.next_in = buf;
844 cfs->strm.avail_in = (uInt) in;
845 cfs->strm.next_out = (unsigned char *) cfs->result;
846 cfs->strm.avail_out = COM_CHUNK_SIZE;
847 cfs->result_pos = 0;
848 ret = inflate (&cfs->strm, Z_SYNC_FLUSH);
849 if ( (Z_OK != ret) && (Z_STREAM_END != ret) )
850 {
851 LOG ("unexpected gzip inflate error: %d\n", ret);
852 return -1; /* unexpected error */
853 }
854 /* go backwards by the number of bytes left in the buffer */
855 if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->strm.avail_in, SEEK_CUR))
856 {
857 LOG ("seek failed\n");
858 return -1;
859 }
860 /* copy decompressed bytes to target buffer */
861 in = COM_CHUNK_SIZE - cfs->strm.avail_out;
862 if (in > size - rc)
863 {
864 if (Z_STREAM_END == ret)
865 {
866 cfs->uncompressed_size = cfs->fpos + in;
867 ret = Z_OK;
868 }
869 in = size - rc;
870 }
871 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
872 cfs->fpos += in;
873 cfs->result_pos += in;
874 rc += in;
875 }
876 if (Z_STREAM_END == ret)
877 {
878 cfs->uncompressed_size = cfs->fpos;
879 }
880 return rc;
881 }
882
883
884 #endif
885
886
887 #if HAVE_LIBBZ2
888 /**
889 * Fills 'data' with new uncompressed data. Does the actual
890 * decompression. Will set uncompressed_size on the end of compressed
891 * stream.
892 *
893 * @param cfds cfs to read from
894 * @param data where to copy the data
895 * @param size number of bytes available in data
896 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
897 */
898 static ssize_t
899 cfs_read_bz2 (struct CompressedFileSource *cfs,
900 void *data,
901 size_t size)
902 {
903 char *dst = data;
904 int ret;
905 size_t rc;
906 ssize_t in;
907 char buf[COM_CHUNK_SIZE];
908
909 if (cfs->fpos == cfs->uncompressed_size)
910 {
911 /* end of file */
912 return 0;
913 }
914 rc = 0;
915 if (COM_CHUNK_SIZE > cfs->bstrm.avail_out + cfs->result_pos)
916 {
917 /* got left-over decompressed data from previous round! */
918 in = COM_CHUNK_SIZE - (cfs->bstrm.avail_out + cfs->result_pos);
919 if (in > size)
920 in = size;
921 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
922 cfs->fpos += in;
923 cfs->result_pos += in;
924 rc += in;
925 }
926 ret = BZ_OK;
927 while ( (rc < size) && (BZ_STREAM_END != ret) )
928 {
929 /* read block from original data source */
930 in = bfds_read (cfs->bfds,
931 buf, sizeof (buf));
932 if (in < 0)
933 {
934 LOG ("unexpected EOF\n");
935 return -1; /* unexpected EOF */
936 }
937 if (0 == in)
938 {
939 cfs->uncompressed_size = cfs->fpos;
940 return rc;
941 }
942 cfs->bstrm.next_in = buf;
943 cfs->bstrm.avail_in = (unsigned int) in;
944 cfs->bstrm.next_out = cfs->result;
945 cfs->bstrm.avail_out = COM_CHUNK_SIZE;
946 cfs->result_pos = 0;
947 ret = BZ2_bzDecompress (&cfs->bstrm);
948 if ( (BZ_OK != ret) && (BZ_STREAM_END != ret) )
949 {
950 LOG ("unexpected bzip2 decompress error: %d\n", ret);
951 return -1; /* unexpected error */
952 }
953 /* go backwards by the number of bytes left in the buffer */
954 if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->bstrm.avail_in, SEEK_CUR))
955 {
956 LOG ("seek failed\n");
957 return -1;
958 }
959 /* copy decompressed bytes to target buffer */
960 in = COM_CHUNK_SIZE - cfs->bstrm.avail_out;
961 if (in > size - rc)
962 {
963 if (BZ_STREAM_END == ret)
964 {
965 cfs->uncompressed_size = cfs->fpos + in;
966 ret = BZ_OK;
967 }
968 in = size - rc;
969 }
970 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
971 cfs->fpos += in;
972 cfs->result_pos += in;
973 rc += in;
974 }
975 if (BZ_STREAM_END == ret)
976 {
977 cfs->uncompressed_size = cfs->fpos;
978 }
979 return rc;
980 }
981
982
983 #endif
984
985
/**
 * Reads decompressed data by forwarding to the reader that matches
 * cfs->compression_type.
 *
 * @param cfs cfs to read from
 * @param data where to copy the data
 * @param size number of bytes available in data
 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
 */
static ssize_t
cfs_read (struct CompressedFileSource *cfs,
          void *data,
          size_t size)
{
#if HAVE_ZLIB
  if (COMP_TYPE_ZLIB == cfs->compression_type)
    return cfs_read_zlib (cfs, data, size);
#endif
#if HAVE_LIBBZ2
  if (COMP_TYPE_BZ2 == cfs->compression_type)
    return cfs_read_bz2 (cfs, data, size);
#endif
  /* neither of the compiled-in decompressors applies */
  LOG ("invalid compression type selected\n");
  return -1;
}
1016
1017
1018 /**
1019 * Moves the buffer to 'position' in uncompressed steam. If position
1020 * requires seeking backwards beyond the boundaries of the buffer, resets the
1021 * stream and repeats decompression from the beginning to 'position'.
1022 *
1023 * @param cfs cfs to seek on
1024 * @param position new starting point for the buffer
1025 * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
1026 * @return new absolute buffer position, -1 on error or EOS
1027 */
1028 static int64_t
1029 cfs_seek (struct CompressedFileSource *cfs,
1030 int64_t position,
1031 int whence)
1032 {
1033 uint64_t nposition;
1034 int64_t delta;
1035
1036 switch (whence)
1037 {
1038 case SEEK_CUR:
1039 if (cfs->fpos + position < 0)
1040 {
1041 /* underflow */
1042 LOG ("Invalid seek operation\n");
1043 return -1;
1044 }
1045 if ( (-1 != cfs->uncompressed_size) &&
1046 (cfs->fpos + position > cfs->uncompressed_size) )
1047 {
1048 LOG ("Invalid seek operation\n");
1049 return -1;
1050 }
1051 nposition = cfs->fpos + position;
1052 break;
1053 case SEEK_END:
1054 ASSERT (-1 != cfs->uncompressed_size);
1055 if (position > 0)
1056 {
1057 LOG ("Invalid seek operation\n");
1058 return -1;
1059 }
1060 if (cfs->uncompressed_size < -position)
1061 {
1062 LOG ("Invalid seek operation\n");
1063 return -1;
1064 }
1065 nposition = cfs->uncompressed_size + position;
1066 break;
1067 case SEEK_SET:
1068 if (position < 0)
1069 {
1070 LOG ("Invalid seek operation\n");
1071 return -1;
1072 }
1073 if ( (-1 != cfs->uncompressed_size) &&
1074 (cfs->uncompressed_size < position) )
1075 {
1076 LOG ("Invalid seek operation\n");
1077 return -1;
1078 }
1079 nposition = (uint64_t) position;
1080 break;
1081 default:
1082 LOG ("Invalid seek operation\n");
1083 return -1;
1084 }
1085 delta = nposition - cfs->fpos;
1086 if (delta < 0)
1087 {
1088 if (cfs->result_pos >= -delta)
1089 {
1090 cfs->result_pos += delta;
1091 cfs->fpos += delta;
1092 delta = 0;
1093 }
1094 else
1095 {
1096 if (-1 == cfs_reset_stream (cfs))
1097 {
1098 LOG ("Failed to restart compressed stream for seek operation\n");
1099 return -1;
1100 }
1101 delta = nposition;
1102 }
1103 }
1104 while (delta > 0)
1105 {
1106 char buf[COM_CHUNK_SIZE];
1107 size_t max;
1108 int64_t ret;
1109
1110 max = (sizeof (buf) > delta) ? delta : sizeof (buf);
1111 ret = cfs_read (cfs, buf, max);
1112 if (-1 == ret)
1113 {
1114 LOG ("Failed to read decompressed stream for seek operation\n");
1115 return -1;
1116 }
1117 if (0 == ret)
1118 {
1119 LOG (
1120 "Reached unexpected end of stream at %llu during seek operation to %llu (%d left)\n",
1121 (unsigned long long) cfs->fpos,
1122 (unsigned long long) nposition,
1123 delta);
1124 return -1;
1125 }
1126 ASSERT (ret <= delta);
1127 delta -= ret;
1128 }
1129 return cfs->fpos;
1130 }
1131
1132
1133 /**
1134 * Detect if we have compressed data on our hands.
1135 *
1136 * @param data pointer to a data buffer or NULL (in case fd is not -1)
1137 * @param fd a file to read data from, or -1 (if data is not NULL)
1138 * @param fsize size of data (if data is not NULL) or of file (if fd is not -1)
1139 * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression
1140 */
1141 static enum ExtractorCompressionType
1142 get_compression_type (struct BufferedFileDataSource *bfds)
1143 {
1144 unsigned char read_data[3];
1145
1146 if (0 != bfds_seek (bfds, 0, SEEK_SET))
1147 return COMP_TYPE_INVALID;
1148 if (sizeof (read_data) !=
1149 bfds_read (bfds, read_data, sizeof (read_data)))
1150 return COMP_TYPE_UNDEFINED;
1151
1152 #if HAVE_ZLIB
1153 if ( (bfds->fsize >= MIN_ZLIB_HEADER) &&
1154 (read_data[0] == 0x1f) &&
1155 (read_data[1] == 0x8b) &&
1156 (read_data[2] == 0x08) )
1157 return COMP_TYPE_ZLIB;
1158 #endif
1159 #if HAVE_LIBBZ2
1160 if ( (bfds->fsize >= MIN_BZ2_HEADER) &&
1161 (read_data[0] == 'B') &&
1162 (read_data[1] == 'Z') &&
1163 (read_data[2] == 'h'))
1164 return COMP_TYPE_BZ2;
1165 #endif
1166 return COMP_TYPE_INVALID;
1167 }
1168
1169
/**
 * Data source handle that is handed to the plugins.
 */
struct EXTRACTOR_Datasource
{

  /**
   * Buffered access to the raw file or memory region.
   */
  struct BufferedFileDataSource *bfds;

  /**
   * Decompression layer on top of 'bfds'; NULL if the data is
   * not compressed.
   */
  struct CompressedFileSource *cfs;

  /**
   * File descriptor we opened for the source, -1 if there is none.
   */
  int fd;
};
1191
1192
1193 /**
1194 * Create a datasource from a file on disk.
1195 *
1196 * @param filename name of the file on disk
1197 * @param proc metadata callback to call with meta data found upon opening
1198 * @param proc_cls callback cls
1199 * @return handle to the datasource, NULL on error
1200 */
1201 struct EXTRACTOR_Datasource *
1202 EXTRACTOR_datasource_create_from_file_ (const char *filename,
1203 EXTRACTOR_MetaDataProcessor proc,
1204 void *proc_cls)
1205 {
1206 struct BufferedFileDataSource *bfds;
1207 struct EXTRACTOR_Datasource *ds;
1208 enum ExtractorCompressionType ct;
1209 int fd;
1210 struct stat sb;
1211 int64_t fsize;
1212 int winmode = 0;
1213 #if WINDOWS
1214 winmode = O_BINARY;
1215 #endif
1216
1217 if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE | winmode)))
1218 {
1219 LOG_STRERROR_FILE ("open", filename);
1220 return NULL;
1221 }
1222 if ( (0 != fstat (fd, &sb)) ||
1223 (S_ISDIR (sb.st_mode)) )
1224 {
1225 if (! S_ISDIR (sb.st_mode))
1226 LOG_STRERROR_FILE ("fstat", filename);
1227 else
1228 LOG ("Skipping directory `%s'\n", filename);
1229 (void) close (fd);
1230 return NULL;
1231 }
1232 fsize = (int64_t) sb.st_size;
1233 if (0 == fsize)
1234 {
1235 (void) close (fd);
1236 return NULL;
1237 }
1238 bfds = bfds_new (NULL, fd, fsize);
1239 if (NULL == bfds)
1240 {
1241 (void) close (fd);
1242 return NULL;
1243 }
1244 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1245 {
1246 LOG_STRERROR ("malloc");
1247 bfds_delete (bfds);
1248 (void) close (fd);
1249 return NULL;
1250 }
1251 ds->bfds = bfds;
1252 ds->fd = fd;
1253 ds->cfs = NULL;
1254 ct = get_compression_type (bfds);
1255 if ( (COMP_TYPE_ZLIB == ct) ||
1256 (COMP_TYPE_BZ2 == ct) )
1257 {
1258 ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
1259 if (NULL == ds->cfs)
1260 {
1261 LOG ("Failed to initialize decompressor\n");
1262 bfds_delete (bfds);
1263 free (ds);
1264 (void) close (fd);
1265 return NULL;
1266 }
1267 }
1268 return ds;
1269 }
1270
1271
1272 /**
1273 * Create a datasource from a buffer in memory.
1274 *
1275 * @param buf data in memory
1276 * @param size number of bytes in 'buf'
1277 * @param proc metadata callback to call with meta data found upon opening
1278 * @param proc_cls callback cls
1279 * @return handle to the datasource
1280 */
1281 struct EXTRACTOR_Datasource *
1282 EXTRACTOR_datasource_create_from_buffer_ (const char *buf,
1283 size_t size,
1284 EXTRACTOR_MetaDataProcessor proc,
1285 void *proc_cls)
1286 {
1287 struct BufferedFileDataSource *bfds;
1288 struct EXTRACTOR_Datasource *ds;
1289 enum ExtractorCompressionType ct;
1290
1291 if (0 == size)
1292 return NULL;
1293 if (NULL == (bfds = bfds_new (buf, -1, size)))
1294 {
1295 LOG ("Failed to initialize buffer data source\n");
1296 return NULL;
1297 }
1298 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1299 {
1300 LOG_STRERROR ("malloc");
1301 bfds_delete (bfds);
1302 return NULL;
1303 }
1304 ds->bfds = bfds;
1305 ds->fd = -1;
1306 ds->cfs = NULL;
1307 ct = get_compression_type (bfds);
1308 if ( (COMP_TYPE_ZLIB == ct) ||
1309 (COMP_TYPE_BZ2 == ct) )
1310 {
1311 ds->cfs = cfs_new (bfds, size, ct, proc, proc_cls);
1312 if (NULL == ds->cfs)
1313 {
1314 LOG ("Failed to initialize decompressor\n");
1315 bfds_delete (bfds);
1316 free (ds);
1317 return NULL;
1318 }
1319 }
1320 return ds;
1321 }
1322
1323
1324 /**
1325 * Destroy a data source.
1326 *
1327 * @param ds source to destroy
1328 */
1329 void
1330 EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds)
1331 {
1332 if (NULL != ds->cfs)
1333 cfs_destroy (ds->cfs);
1334 bfds_delete (ds->bfds);
1335 if (-1 != ds->fd)
1336 (void) close (ds->fd);
1337 free (ds);
1338 }
1339
1340
1341 /**
1342 * Make 'size' bytes of data from the data source available at 'data'.
1343 *
1344 * @param cls must be a 'struct EXTRACTOR_Datasource'
1345 * @param data where the data should be copied to
1346 * @param size maximum number of bytes requested
1347 * @return number of bytes now available in data (can be smaller than 'size'),
1348 * -1 on error
1349 */
1350 ssize_t
1351 EXTRACTOR_datasource_read_ (void *cls,
1352 void *data,
1353 size_t size)
1354 {
1355 struct EXTRACTOR_Datasource *ds = cls;
1356
1357 if (NULL != ds->cfs)
1358 return cfs_read (ds->cfs, data, size);
1359 return bfds_read (ds->bfds, data, size);
1360 }
1361
1362
1363 /**
1364 * Seek in the datasource. Use 'SEEK_CUR' for whence and 'pos' of 0 to
1365 * obtain the current position in the file.
1366 *
1367 * @param cls must be a 'struct EXTRACTOR_Datasource'
1368 * @param pos position to seek (see 'man lseek')
1369 * @param whence how to see (absolute to start, relative, absolute to end)
1370 * @return new absolute position, UINT64_MAX on error (i.e. desired position
1371 * does not exist)
1372 */
1373 int64_t
1374 EXTRACTOR_datasource_seek_ (void *cls,
1375 int64_t pos,
1376 int whence)
1377 {
1378 struct EXTRACTOR_Datasource *ds = cls;
1379
1380 if (NULL != ds->cfs)
1381 {
1382 if ( (SEEK_END == whence) &&
1383 (-1 == ds->cfs->uncompressed_size) )
1384 {
1385 /* need to obtain uncompressed size */
1386 (void) EXTRACTOR_datasource_get_size_ (ds, 1);
1387 if (-1 == ds->cfs->uncompressed_size)
1388 return -1;
1389 }
1390 return cfs_seek (ds->cfs, pos, whence);
1391 }
1392 return bfds_seek (ds->bfds, pos, whence);
1393 }
1394
1395
1396 /**
1397 * Determine the overall size of the data source (after compression).
1398 *
1399 * @param cls must be a 'struct EXTRACTOR_Datasource'
1400 * @param force force computing the size if it is unavailable
1401 * @return overall file size, UINT64_MAX on error or unknown
1402 */
1403 int64_t
1404 EXTRACTOR_datasource_get_size_ (void *cls,
1405 int force)
1406 {
1407 struct EXTRACTOR_Datasource *ds = cls;
1408 char buf[32 * 1024];
1409 uint64_t pos;
1410
1411 if (NULL != ds->cfs)
1412 {
1413 if ( (force) &&
1414 (-1 == ds->cfs->uncompressed_size) )
1415 {
1416 pos = ds->cfs->fpos;
1417 while ( (-1 == ds->cfs->uncompressed_size) &&
1418 (-1 != cfs_read (ds->cfs, buf, sizeof (buf))) )
1419 ;
1420 if (-1 == cfs_seek (ds->cfs, pos, SEEK_SET))
1421 {
1422 LOG (
1423 "Serious problem, I moved the buffer to determine the file size but could not restore it...\n");
1424 return -1;
1425 }
1426 if (-1 == ds->cfs->uncompressed_size)
1427 return -1;
1428 }
1429 return ds->cfs->uncompressed_size;
1430 }
1431 return ds->bfds->fsize;
1432 }
1433
1434
1435 /* end of extractor_datasource.c */