"Fossies" - the Fresh Open Source Software Archive 
Member "libextractor-1.11/src/main/extractor.c" (30 Jan 2021, 19487 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "extractor.c" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
1.10_vs_1.11.
1 /*
2 This file is part of libextractor.
3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20
21 #include "platform.h"
22 #include "extractor.h"
23 #include <dirent.h>
24 #include <sys/types.h>
25 #include <signal.h>
26 #include <ltdl.h>
27 #include "extractor_datasource.h"
28 #include "extractor_ipc.h"
29 #include "extractor_logging.h"
30 #include "extractor_plugpath.h"
31 #include "extractor_plugins.h"
32
33
34 /**
35 * Size used for the shared memory segment.
36 */
37 #define DEFAULT_SHM_SIZE (16 * 1024)
38
39
40 /**
41 * Closure for #process_plugin_reply()
42 */
43 struct PluginReplyProcessor
44 {
45 /**
46 * Function to call if we receive meta data from the plugin.
47 */
48 EXTRACTOR_MetaDataProcessor proc;
49
50 /**
51 * Closure for @e proc.
52 */
53 void *proc_cls;
54
55 /**
56 * Are we done with processing this file? 0 to continue, 1 to terminate.
57 */
58 int file_finished;
59
60 };
61
62
63 /**
64 * Send an 'update' message to the plugin.
65 *
66 * @param plugin plugin to notify
67 * @param shm_off new offset for the SHM
68 * @param data_available number of bytes available in shm
69 * @param ds datastore backend we are using
70 */
71 static void
72 send_update_message (struct EXTRACTOR_PluginList *plugin,
73 int64_t shm_off,
74 size_t data_available,
75 struct EXTRACTOR_Datasource *ds)
76 {
77 struct UpdateMessage um;
78
79 um.opcode = MESSAGE_UPDATED_SHM;
80 um.reserved = 0;
81 um.reserved2 = 0;
82 um.shm_ready_bytes = (uint32_t) data_available;
83 um.shm_off = (uint64_t) shm_off;
84 um.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
85 if (sizeof (um) !=
86 EXTRACTOR_IPC_channel_send_ (plugin->channel,
87 &um,
88 sizeof (um)) )
89 {
90 LOG ("Failed to send UPDATED_SHM message to plugin\n");
91 EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
92 plugin->channel = NULL;
93 plugin->round_finished = 1;
94 }
95 }
96
97
98 /**
99 * Send a 'discard state' message to the plugin and mark it as finished
100 * for this round.
101 *
102 * @param plugin plugin to notify
103 */
104 static void
105 send_discard_message (struct EXTRACTOR_PluginList *plugin)
106 {
107 static unsigned char disc_msg = MESSAGE_DISCARD_STATE;
108
109 if (sizeof (disc_msg) !=
110 EXTRACTOR_IPC_channel_send_ (plugin->channel,
111 &disc_msg,
112 sizeof (disc_msg)) )
113 {
114 LOG ("Failed to send DISCARD_STATE message to plugin\n");
115 EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
116 plugin->channel = NULL;
117 plugin->round_finished = 1;
118 }
119 }
120
121
122 /**
123 * We had some serious trouble. Abort all channels.
124 *
125 * @param plugins list of plugins with channels to abort
126 */
127 static void
128 abort_all_channels (struct EXTRACTOR_PluginList *plugins)
129 {
130 struct EXTRACTOR_PluginList *pos;
131
132 for (pos = plugins; NULL != pos; pos = pos->next)
133 {
134 if (NULL == pos->channel)
135 continue;
136 EXTRACTOR_IPC_channel_destroy_ (pos->channel);
137 pos->channel = NULL;
138 }
139 }
140
141
142 /**
143 * Handler for a message from one of the plugins.
144 *
145 * @param cls closure with our 'struct PluginReplyProcessor'
146 * @param plugin plugin of the channel sending the message
147 * @param meta_type type of the meta data
148 * @param meta_format format of the meta data
149 * @param mime mime string send from the plugin
150 * @param value 'data' send from the plugin
151 * @param value_len number of bytes in 'value'
152 */
153 static void
154 process_plugin_reply (void *cls,
155 struct EXTRACTOR_PluginList *plugin,
156 enum EXTRACTOR_MetaType meta_type,
157 enum EXTRACTOR_MetaFormat meta_format,
158 const char *mime,
159 const void *value,
160 size_t value_len)
161 {
162 static unsigned char cont_msg = MESSAGE_CONTINUE_EXTRACTING;
163 struct PluginReplyProcessor *prp = cls;
164
165 if (0 != prp->file_finished)
166 {
167 /* client already aborted, ignore message, tell plugin about abort */
168 return;
169 }
170 if (0 != prp->proc (prp->proc_cls,
171 plugin->short_libname,
172 meta_type,
173 meta_format,
174 mime,
175 value,
176 value_len))
177 {
178 prp->file_finished = 1;
179 #if DEBUG
180 fprintf (stderr, "Sending ABRT\n");
181 #endif
182 send_discard_message (plugin);
183 return;
184 }
185 if (sizeof (cont_msg) !=
186 EXTRACTOR_IPC_channel_send_ (plugin->channel,
187 &cont_msg,
188 sizeof (cont_msg)) )
189 {
190 LOG ("Failed to send CONTINUE_EXTRACTING message to plugin\n");
191 EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
192 plugin->channel = NULL;
193 plugin->round_finished = 1;
194 }
195 }
196
197
198 /**
199 * Closure for the in-process callbacks.
200 */
201 struct InProcessContext
202 {
203 /**
204 * Current plugin.
205 */
206 struct EXTRACTOR_PluginList *plugin;
207
208 /**
209 * Data source to use.
210 */
211 struct EXTRACTOR_Datasource *ds;
212
213 /**
214 * Function to call with meta data.
215 */
216 EXTRACTOR_MetaDataProcessor proc;
217
218 /**
219 * Closure for @e proc.
220 */
221 void *proc_cls;
222
223 /**
224 * IO buffer.
225 */
226 char buf[DEFAULT_SHM_SIZE];
227
228 /**
229 * 0 to continue extracting, 1 if we are finished
230 */
231 int finished;
232 };
233
234
235 /**
236 * Obtain a pointer to up to @a size bytes of data from the file to process.
237 * Callback used for in-process plugins.
238 *
239 * @param cls a `struct InProcessContext`
240 * @param data pointer to set to the file data, set to NULL on error
241 * @param size maximum number of bytes requested
242 * @return number of bytes now available in data (can be smaller than @a size),
243 * -1 on error
244 */
245 static ssize_t
246 in_process_read (void *cls,
247 void **data,
248 size_t size)
249 {
250 struct InProcessContext *ctx = cls;
251 ssize_t ret;
252 size_t bsize;
253
254 bsize = sizeof (ctx->buf);
255 if (size < bsize)
256 bsize = size;
257 ret = EXTRACTOR_datasource_read_ (ctx->ds,
258 ctx->buf,
259 bsize);
260 if (-1 == ret)
261 *data = NULL;
262 else
263 *data = ctx->buf;
264 return ret;
265 }
266
267
268 /**
269 * Seek in the file. Use 'SEEK_CUR' for @a whence and @a pos of 0 to
270 * obtain the current position in the file.
271 * Callback used for in-process plugins.
272 *
273 * @param cls a 'struct InProcessContext'
274 * @param pos position to seek (see 'man lseek')
275 * @param whence how to see (absolute to start, relative, absolute to end)
276 * @return new absolute position, -1 on error (i.e. desired position
277 * does not exist)
278 */
279 static int64_t
280 in_process_seek (void *cls,
281 int64_t pos,
282 int whence)
283 {
284 struct InProcessContext *ctx = cls;
285
286 return EXTRACTOR_datasource_seek_ (ctx->ds,
287 pos,
288 whence);
289 }
290
291
292 /**
293 * Determine the overall size of the file.
294 * Callback used for in-process plugins.
295 *
296 * @param cls a `struct InProcessContext`
297 * @return overall file size, UINT64_MAX on error (i.e. IPC failure)
298 */
299 static uint64_t
300 in_process_get_size (void *cls)
301 {
302 struct InProcessContext *ctx = cls;
303
304 return (uint64_t) EXTRACTOR_datasource_get_size_ (ctx->ds, 0);
305 }
306
307
308 /**
309 * Type of a function that libextractor calls for each
310 * meta data item found.
311 * Callback used for in-process plugins.
312 *
313 * @param cls a 'struct InProcessContext'
314 * @param plugin_name name of the plugin that produced this value;
315 * special values can be used (i.e. '<zlib>' for zlib being
316 * used in the main libextractor library and yielding
317 * meta data).
318 * @param type libextractor-type describing the meta data
319 * @param format basic format information about data
320 * @param data_mime_type mime-type of data (not of the original file);
321 * can be NULL (if mime-type is not known)
322 * @param data actual meta-data found
323 * @param data_len number of bytes in data
324 * @return 0 to continue extracting, 1 to abort
325 */
326 static int
327 in_process_proc (void *cls,
328 const char *plugin_name,
329 enum EXTRACTOR_MetaType type,
330 enum EXTRACTOR_MetaFormat format,
331 const char *data_mime_type,
332 const char *data,
333 size_t data_len)
334 {
335 struct InProcessContext *ctx = cls;
336 int ret;
337
338 if (0 != ctx->finished)
339 return 1;
340 ret = ctx->proc (ctx->proc_cls,
341 plugin_name,
342 type,
343 format,
344 data_mime_type,
345 data,
346 data_len);
347 if (0 != ret)
348 ctx->finished = 1;
349 return ret;
350 }
351
352
353 /**
354 * Extract keywords using the given set of plugins.
355 *
356 * @param plugins the list of plugins to use
357 * @param shm shared memory object used by the plugins (NULL if
358 * all plugins are in-process)
359 * @param ds data to process
360 * @param proc function to call for each meta data item found
361 * @param proc_cls cls argument to @a proc
362 */
363 static void
364 do_extract (struct EXTRACTOR_PluginList *plugins,
365 struct EXTRACTOR_SharedMemory *shm,
366 struct EXTRACTOR_Datasource *ds,
367 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
368 {
369 unsigned int plugin_count;
370 unsigned int plugin_off;
371 struct EXTRACTOR_PluginList *pos;
372 struct StartMessage start;
373 struct EXTRACTOR_Channel *channel;
374 struct PluginReplyProcessor prp;
375 struct InProcessContext ctx;
376 struct EXTRACTOR_ExtractContext ec;
377 int64_t min_seek;
378 int64_t end;
379 ssize_t data_available;
380 ssize_t ready;
381 int done;
382 int have_in_memory;
383
384 plugin_count = 0;
385 for (pos = plugins; NULL != pos; pos = pos->next)
386 plugin_count++;
387 if (NULL != shm)
388 ready = EXTRACTOR_IPC_shared_memory_set_ (shm,
389 ds,
390 0,
391 DEFAULT_SHM_SIZE);
392 else
393 ready = 0;
394 if (-1 == ready)
395 return; /* failed to ready _any_ data!? */
396 have_in_memory = 0;
397 prp.file_finished = 0;
398 prp.proc = proc;
399 prp.proc_cls = proc_cls;
400
401 /* send 'start' message */
402 start.opcode = MESSAGE_EXTRACT_START;
403 start.reserved = 0;
404 start.reserved2 = 0;
405 start.shm_ready_bytes = (uint32_t) ready;
406 start.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
407 for (pos = plugins; NULL != pos; pos = pos->next)
408 {
409 if (EXTRACTOR_OPTION_IN_PROCESS == pos->flags)
410 have_in_memory = 1;
411 if ( (NULL != pos->channel) &&
412 (-1 == EXTRACTOR_IPC_channel_send_ (pos->channel,
413 &start,
414 sizeof (start)) ) )
415 {
416 LOG ("Failed to send EXTRACT_START message to plugin\n");
417 EXTRACTOR_IPC_channel_destroy_ (pos->channel);
418 pos->channel = NULL;
419 }
420 }
421 done = 0;
422 while (! done)
423 {
424 struct EXTRACTOR_Channel *channels[plugin_count];
425
426 /* calculate current 'channels' array */
427 plugin_off = 0;
428 for (pos = plugins; NULL != pos; pos = pos->next)
429 {
430 if (-1 == pos->seek_request)
431 {
432 /* channel is not seeking, must be running or done */
433 channels[plugin_off] = pos->channel;
434 }
435 else
436 {
437 /* not running this round, seeking! */
438 channels[plugin_off] = NULL;
439 }
440 plugin_off++;
441 }
442 /* give plugins chance to send us meta data, seek or finished messages */
443 if (-1 ==
444 EXTRACTOR_IPC_channel_recv_ (channels,
445 plugin_count,
446 &process_plugin_reply,
447 &prp))
448 {
449 /* serious problem in IPC; reset *all* channels */
450 LOG ("Failed to receive message from channels; full reset\n");
451 abort_all_channels (plugins);
452 break;
453 }
454
455 /* calculate minimum seek request (or set done=0 to continue here) */
456 done = 1;
457 min_seek = -1;
458 plugin_off = 0;
459 for (pos = plugins; NULL != pos; pos = pos->next)
460 {
461 plugin_off++;
462 if ( (1 == pos->round_finished) ||
463 (NULL == pos->channel) )
464 {
465 continue; /* inactive plugin */
466 }
467 if (-1 == pos->seek_request)
468 {
469 /* possibly more meta data at current position, at least
470 this plugin is still working on it... */
471 done = 0;
472 break;
473 }
474 if (-1 != pos->seek_request)
475 {
476 if (SEEK_END == pos->seek_whence)
477 {
478 /* convert distance from end to absolute position */
479 pos->seek_whence = 0;
480 end = EXTRACTOR_datasource_get_size_ (ds, 1);
481 if (pos->seek_request > end)
482 {
483 LOG ("Cannot seek to before the beginning of the file!\n");
484 pos->seek_request = 0;
485 }
486 else
487 {
488 pos->seek_request = end - pos->seek_request;
489 }
490 }
491 if ( (-1 == min_seek) ||
492 (min_seek > pos->seek_request) )
493 {
494 min_seek = pos->seek_request;
495 }
496 }
497 }
498 data_available = -1;
499 if ( (1 == done) &&
500 (-1 != min_seek) &&
501 (NULL != shm) )
502 {
503 /* current position done, but seek requested */
504 done = 0;
505 if (-1 ==
506 (data_available = EXTRACTOR_IPC_shared_memory_set_ (shm,
507 ds,
508 min_seek,
509 DEFAULT_SHM_SIZE)))
510 {
511 LOG ("Failed to seek; full reset\n");
512 abort_all_channels (plugins);
513 break;
514 }
515 }
516 /* if 'prp.file_finished', send 'abort' to plugins;
517 if not, send 'seek' notification to plugins in range */
518 for (pos = plugins; NULL != pos; pos = pos->next)
519 {
520 if (NULL == (channel = pos->channel))
521 {
522 /* Skipping plugin: channel down */
523 continue;
524 }
525 if ( (-1 != pos->seek_request) &&
526 (1 == prp.file_finished) )
527 {
528 send_discard_message (pos);
529 pos->round_finished = 1;
530 pos->seek_request = -1;
531 }
532 if ( (-1 != data_available) &&
533 (-1 != pos->seek_request) &&
534 (min_seek <= pos->seek_request) &&
535 ( (min_seek + data_available > pos->seek_request) ||
536 (min_seek == EXTRACTOR_datasource_get_size_ (ds, 0))) )
537 {
538 /* Notify plugin about seek to 'min_seek' */
539 send_update_message (pos,
540 min_seek,
541 data_available,
542 ds);
543 pos->seek_request = -1;
544 }
545 if (0 == pos->round_finished)
546 done = 0; /* can't be done, plugin still active */
547 }
548 }
549
550 if (0 == have_in_memory)
551 return;
552 /* run in-process plugins */
553 ctx.finished = 0;
554 ctx.ds = ds;
555 ctx.proc = proc;
556 ctx.proc_cls = proc_cls;
557 ec.cls = &ctx;
558 ec.read = &in_process_read;
559 ec.seek = &in_process_seek;
560 ec.get_size = &in_process_get_size;
561 ec.proc = &in_process_proc;
562 for (pos = plugins; NULL != pos; pos = pos->next)
563 {
564 if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
565 continue;
566 if (-1 == EXTRACTOR_plugin_load_ (pos))
567 continue;
568 ctx.plugin = pos;
569 ec.config = pos->plugin_options;
570 if (-1 == EXTRACTOR_datasource_seek_ (ds, 0, SEEK_SET))
571 {
572 LOG ("Failed to seek to 0 for in-memory plugins\n");
573 return;
574 }
575 pos->extract_method (&ec);
576 if (1 == ctx.finished)
577 break;
578 }
579 }
580
581
582 /**
583 * Extract keywords from a file using the given set of plugins.
584 * If needed, opens the file and loads its data (via mmap). Then
585 * decompresses it if the data is compressed. Finally runs the
586 * plugins on the (now possibly decompressed) data.
587 *
588 * @param plugins the list of plugins to use
589 * @param filename the name of the file, can be NULL if data is not NULL
590 * @param data data of the file in memory, can be NULL (in which
591 * case libextractor will open file) if filename is not NULL
592 * @param size number of bytes in data, ignored if data is NULL
593 * @param proc function to call for each meta data item found
594 * @param proc_cls cls argument to @a proc
595 */
596 void
597 EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
598 const char *filename,
599 const void *data,
600 size_t size,
601 EXTRACTOR_MetaDataProcessor proc,
602 void *proc_cls)
603 {
604 struct EXTRACTOR_Datasource *datasource;
605 struct EXTRACTOR_SharedMemory *shm;
606 struct EXTRACTOR_PluginList *pos;
607 int have_oop;
608
609 if (NULL == plugins)
610 return;
611 if (NULL == filename)
612 datasource = EXTRACTOR_datasource_create_from_buffer_ (data, size,
613 proc, proc_cls);
614 else
615 datasource = EXTRACTOR_datasource_create_from_file_ (filename,
616 proc, proc_cls);
617 if (NULL == datasource)
618 return;
619 shm = NULL;
620 have_oop = 0;
621 for (pos = plugins; NULL != pos; pos = pos->next)
622 {
623 if (NULL == shm)
624 shm = pos->shm;
625 if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
626 have_oop = 1;
627 pos->round_finished = 0;
628 }
629 if ( (NULL == shm) &&
630 (1 == have_oop) )
631 {
632 /* need to create shared memory segment */
633 shm = EXTRACTOR_IPC_shared_memory_create_ (DEFAULT_SHM_SIZE);
634 if (NULL == shm)
635 {
636 LOG ("Failed to setup IPC\n");
637 EXTRACTOR_datasource_destroy_ (datasource);
638 return;
639 }
640 }
641 for (pos = plugins; NULL != pos; pos = pos->next)
642 if ( (NULL == pos->channel) &&
643 (NULL != shm) &&
644 (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) )
645 {
646 if (NULL == pos->shm)
647 {
648 pos->shm = shm;
649 (void) EXTRACTOR_IPC_shared_memory_change_rc_ (shm, 1);
650 }
651 pos->channel = EXTRACTOR_IPC_channel_create_ (pos,
652 shm);
653 }
654 do_extract (plugins,
655 shm,
656 datasource,
657 proc,
658 proc_cls);
659 EXTRACTOR_datasource_destroy_ (datasource);
660 }
661
662
/**
 * Library constructor: binds the gettext text domain (if NLS is
 * enabled), initializes libltdl and, on Windows builds, sets up
 * plibc.  If libltdl reports errors, the Windows setup is skipped.
 */
void __attribute__ ((constructor))
EXTRACTOR_ltdl_init ()
{
#if ENABLE_NLS
  bindtextdomain (PACKAGE, LOCALEDIR);
#endif
  if (lt_dlinit () > 0)
  {
    /* a positive result means initialization failed */
#if DEBUG
    fprintf (stderr,
             _ ("Initialization of plugin mechanism failed: %s!\n"),
             lt_dlerror ());
#endif
    return;
  }
#if WINDOWS
  plibc_init_utf8 ("GNU", PACKAGE, 1);
  plibc_set_stat_size_size (sizeof (((struct stat *) 0)->st_size));
  plibc_set_stat_time_size (sizeof (((struct stat *) 0)->st_mtime));
#endif
}
690
691
/**
 * Library destructor: shuts down plibc on Windows builds and then
 * releases libltdl.
 */
void __attribute__ ((destructor))
EXTRACTOR_ltdl_fini ()
{
#if WINDOWS
  /* tear down in reverse order of EXTRACTOR_ltdl_init() */
  plibc_shutdown ();
#endif
  lt_dlexit ();
}
703
704
705 /* end of extractor.c */