"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/main/extractor.c" (30 Jan 2021, 19487 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "extractor.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.10_vs_1.11.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19  */
   20 
   21 #include "platform.h"
   22 #include "extractor.h"
   23 #include <dirent.h>
   24 #include <sys/types.h>
   25 #include <signal.h>
   26 #include <ltdl.h>
   27 #include "extractor_datasource.h"
   28 #include "extractor_ipc.h"
   29 #include "extractor_logging.h"
   30 #include "extractor_plugpath.h"
   31 #include "extractor_plugins.h"
   32 
   33 
   34 /**
   35  * Size used for the shared memory segment.
   36  */
   37 #define DEFAULT_SHM_SIZE (16 * 1024)
   38 
   39 
   40 /**
   41  * Closure for #process_plugin_reply()
   42  */
   43 struct PluginReplyProcessor
   44 {
   45   /**
   46    * Function to call if we receive meta data from the plugin.
   47    */
   48   EXTRACTOR_MetaDataProcessor proc;
   49 
   50   /**
   51    * Closure for @e proc.
   52    */
   53   void *proc_cls;
   54 
   55   /**
   56    * Are we done with processing this file? 0 to continue, 1 to terminate.
   57    */
   58   int file_finished;
   59 
   60 };
   61 
   62 
   63 /**
   64  * Send an 'update' message to the plugin.
   65  *
   66  * @param plugin plugin to notify
   67  * @param shm_off new offset for the SHM
   68  * @param data_available number of bytes available in shm
   69  * @param ds datastore backend we are using
   70  */
   71 static void
   72 send_update_message (struct EXTRACTOR_PluginList *plugin,
   73                      int64_t shm_off,
   74                      size_t data_available,
   75                      struct EXTRACTOR_Datasource *ds)
   76 {
   77   struct UpdateMessage um;
   78 
   79   um.opcode = MESSAGE_UPDATED_SHM;
   80   um.reserved = 0;
   81   um.reserved2 = 0;
   82   um.shm_ready_bytes = (uint32_t) data_available;
   83   um.shm_off = (uint64_t) shm_off;
   84   um.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
   85   if (sizeof (um) !=
   86       EXTRACTOR_IPC_channel_send_ (plugin->channel,
   87                                    &um,
   88                                    sizeof (um)) )
   89   {
   90     LOG ("Failed to send UPDATED_SHM message to plugin\n");
   91     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
   92     plugin->channel = NULL;
   93     plugin->round_finished = 1;
   94   }
   95 }
   96 
   97 
   98 /**
   99  * Send a 'discard state' message to the plugin and mark it as finished
  100  * for this round.
  101  *
  102  * @param plugin plugin to notify
  103  */
  104 static void
  105 send_discard_message (struct EXTRACTOR_PluginList *plugin)
  106 {
  107   static unsigned char disc_msg = MESSAGE_DISCARD_STATE;
  108 
  109   if (sizeof (disc_msg) !=
  110       EXTRACTOR_IPC_channel_send_ (plugin->channel,
  111                                    &disc_msg,
  112                                    sizeof (disc_msg)) )
  113   {
  114     LOG ("Failed to send DISCARD_STATE message to plugin\n");
  115     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
  116     plugin->channel = NULL;
  117     plugin->round_finished = 1;
  118   }
  119 }
  120 
  121 
  122 /**
  123  * We had some serious trouble.  Abort all channels.
  124  *
  125  * @param plugins list of plugins with channels to abort
  126  */
  127 static void
  128 abort_all_channels (struct EXTRACTOR_PluginList *plugins)
  129 {
  130   struct EXTRACTOR_PluginList *pos;
  131 
  132   for (pos = plugins; NULL != pos; pos = pos->next)
  133   {
  134     if (NULL == pos->channel)
  135       continue;
  136     EXTRACTOR_IPC_channel_destroy_ (pos->channel);
  137     pos->channel = NULL;
  138   }
  139 }
  140 
  141 
  142 /**
  143  * Handler for a message from one of the plugins.
  144  *
  145  * @param cls closure with our 'struct PluginReplyProcessor'
  146  * @param plugin plugin of the channel sending the message
  147  * @param meta_type type of the meta data
  148  * @param meta_format format of the meta data
  149  * @param mime mime string send from the plugin
  150  * @param value 'data' send from the plugin
  151  * @param value_len number of bytes in 'value'
  152  */
  153 static void
  154 process_plugin_reply (void *cls,
  155                       struct EXTRACTOR_PluginList *plugin,
  156                       enum EXTRACTOR_MetaType meta_type,
  157                       enum EXTRACTOR_MetaFormat meta_format,
  158                       const char *mime,
  159                       const void *value,
  160                       size_t value_len)
  161 {
  162   static unsigned char cont_msg = MESSAGE_CONTINUE_EXTRACTING;
  163   struct PluginReplyProcessor *prp = cls;
  164 
  165   if (0 != prp->file_finished)
  166   {
  167     /* client already aborted, ignore message, tell plugin about abort */
  168     return;
  169   }
  170   if (0 != prp->proc (prp->proc_cls,
  171                       plugin->short_libname,
  172                       meta_type,
  173                       meta_format,
  174                       mime,
  175                       value,
  176                       value_len))
  177   {
  178     prp->file_finished = 1;
  179 #if DEBUG
  180     fprintf (stderr, "Sending ABRT\n");
  181 #endif
  182     send_discard_message (plugin);
  183     return;
  184   }
  185   if (sizeof (cont_msg) !=
  186       EXTRACTOR_IPC_channel_send_ (plugin->channel,
  187                                    &cont_msg,
  188                                    sizeof (cont_msg)) )
  189   {
  190     LOG ("Failed to send CONTINUE_EXTRACTING message to plugin\n");
  191     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
  192     plugin->channel = NULL;
  193     plugin->round_finished = 1;
  194   }
  195 }
  196 
  197 
  198 /**
  199  * Closure for the in-process callbacks.
  200  */
  201 struct InProcessContext
  202 {
  203   /**
  204    * Current plugin.
  205    */
  206   struct EXTRACTOR_PluginList *plugin;
  207 
  208   /**
  209    * Data source to use.
  210    */
  211   struct EXTRACTOR_Datasource *ds;
  212 
  213   /**
  214    * Function to call with meta data.
  215    */
  216   EXTRACTOR_MetaDataProcessor proc;
  217 
  218   /**
  219    * Closure for @e proc.
  220    */
  221   void *proc_cls;
  222 
  223   /**
  224    * IO buffer.
  225    */
  226   char buf[DEFAULT_SHM_SIZE];
  227 
  228   /**
  229    * 0 to continue extracting, 1 if we are finished
  230    */
  231   int finished;
  232 };
  233 
  234 
  235 /**
  236  * Obtain a pointer to up to @a size bytes of data from the file to process.
  237  * Callback used for in-process plugins.
  238  *
  239  * @param cls a `struct InProcessContext`
  240  * @param data pointer to set to the file data, set to NULL on error
  241  * @param size maximum number of bytes requested
  242  * @return number of bytes now available in data (can be smaller than @a size),
  243  *         -1 on error
  244  */
  245 static ssize_t
  246 in_process_read (void *cls,
  247                  void **data,
  248                  size_t size)
  249 {
  250   struct InProcessContext *ctx = cls;
  251   ssize_t ret;
  252   size_t bsize;
  253 
  254   bsize = sizeof (ctx->buf);
  255   if (size < bsize)
  256     bsize = size;
  257   ret = EXTRACTOR_datasource_read_ (ctx->ds,
  258                                     ctx->buf,
  259                                     bsize);
  260   if (-1 == ret)
  261     *data = NULL;
  262   else
  263     *data = ctx->buf;
  264   return ret;
  265 }
  266 
  267 
  268 /**
  269  * Seek in the file.  Use 'SEEK_CUR' for @a whence and @a pos of 0 to
  270  * obtain the current position in the file.
  271  * Callback used for in-process plugins.
  272  *
  273  * @param cls a 'struct InProcessContext'
  274  * @param pos position to seek (see 'man lseek')
  275  * @param whence how to see (absolute to start, relative, absolute to end)
  276  * @return new absolute position, -1 on error (i.e. desired position
  277  *         does not exist)
  278  */
  279 static int64_t
  280 in_process_seek (void *cls,
  281                  int64_t pos,
  282                  int whence)
  283 {
  284   struct InProcessContext *ctx = cls;
  285 
  286   return EXTRACTOR_datasource_seek_ (ctx->ds,
  287                                      pos,
  288                                      whence);
  289 }
  290 
  291 
  292 /**
  293  * Determine the overall size of the file.
  294  * Callback used for in-process plugins.
  295  *
  296  * @param cls a `struct InProcessContext`
  297  * @return overall file size, UINT64_MAX on error (i.e. IPC failure)
  298  */
  299 static uint64_t
  300 in_process_get_size (void *cls)
  301 {
  302   struct InProcessContext *ctx = cls;
  303 
  304   return (uint64_t) EXTRACTOR_datasource_get_size_ (ctx->ds, 0);
  305 }
  306 
  307 
  308 /**
  309  * Type of a function that libextractor calls for each
  310  * meta data item found.
  311  * Callback used for in-process plugins.
  312  *
  313  * @param cls a 'struct InProcessContext'
  314  * @param plugin_name name of the plugin that produced this value;
  315  *        special values can be used (i.e. '&lt;zlib&gt;' for zlib being
  316  *        used in the main libextractor library and yielding
  317  *        meta data).
  318  * @param type libextractor-type describing the meta data
  319  * @param format basic format information about data
  320  * @param data_mime_type mime-type of data (not of the original file);
  321  *        can be NULL (if mime-type is not known)
  322  * @param data actual meta-data found
  323  * @param data_len number of bytes in data
  324  * @return 0 to continue extracting, 1 to abort
  325  */
  326 static int
  327 in_process_proc (void *cls,
  328                  const char *plugin_name,
  329                  enum EXTRACTOR_MetaType type,
  330                  enum EXTRACTOR_MetaFormat format,
  331                  const char *data_mime_type,
  332                  const char *data,
  333                  size_t data_len)
  334 {
  335   struct InProcessContext *ctx = cls;
  336   int ret;
  337 
  338   if (0 != ctx->finished)
  339     return 1;
  340   ret = ctx->proc (ctx->proc_cls,
  341                    plugin_name,
  342                    type,
  343                    format,
  344                    data_mime_type,
  345                    data,
  346                    data_len);
  347   if (0 != ret)
  348     ctx->finished = 1;
  349   return ret;
  350 }
  351 
  352 
  353 /**
  354  * Extract keywords using the given set of plugins.
  355  *
  356  * @param plugins the list of plugins to use
  357  * @param shm shared memory object used by the plugins (NULL if
  358  *        all plugins are in-process)
  359  * @param ds data to process
  360  * @param proc function to call for each meta data item found
  361  * @param proc_cls cls argument to @a proc
  362  */
  363 static void
  364 do_extract (struct EXTRACTOR_PluginList *plugins,
  365             struct EXTRACTOR_SharedMemory *shm,
  366             struct EXTRACTOR_Datasource *ds,
  367             EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
  368 {
  369   unsigned int plugin_count;
  370   unsigned int plugin_off;
  371   struct EXTRACTOR_PluginList *pos;
  372   struct StartMessage start;
  373   struct EXTRACTOR_Channel *channel;
  374   struct PluginReplyProcessor prp;
  375   struct InProcessContext ctx;
  376   struct EXTRACTOR_ExtractContext ec;
  377   int64_t min_seek;
  378   int64_t end;
  379   ssize_t data_available;
  380   ssize_t ready;
  381   int done;
  382   int have_in_memory;
  383 
  384   plugin_count = 0;
  385   for (pos = plugins; NULL != pos; pos = pos->next)
  386     plugin_count++;
  387   if (NULL != shm)
  388     ready = EXTRACTOR_IPC_shared_memory_set_ (shm,
  389                                               ds,
  390                                               0,
  391                                               DEFAULT_SHM_SIZE);
  392   else
  393     ready = 0;
  394   if (-1 == ready)
  395     return; /* failed to ready _any_ data!? */
  396   have_in_memory = 0;
  397   prp.file_finished = 0;
  398   prp.proc = proc;
  399   prp.proc_cls = proc_cls;
  400 
  401   /* send 'start' message */
  402   start.opcode = MESSAGE_EXTRACT_START;
  403   start.reserved = 0;
  404   start.reserved2 = 0;
  405   start.shm_ready_bytes = (uint32_t) ready;
  406   start.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
  407   for (pos = plugins; NULL != pos; pos = pos->next)
  408   {
  409     if (EXTRACTOR_OPTION_IN_PROCESS == pos->flags)
  410       have_in_memory = 1;
  411     if ( (NULL != pos->channel) &&
  412          (-1 == EXTRACTOR_IPC_channel_send_ (pos->channel,
  413                                              &start,
  414                                              sizeof (start)) ) )
  415     {
  416       LOG ("Failed to send EXTRACT_START message to plugin\n");
  417       EXTRACTOR_IPC_channel_destroy_ (pos->channel);
  418       pos->channel = NULL;
  419     }
  420   }
  421   done = 0;
  422   while (! done)
  423   {
  424     struct EXTRACTOR_Channel *channels[plugin_count];
  425 
  426     /* calculate current 'channels' array */
  427     plugin_off = 0;
  428     for (pos = plugins; NULL != pos; pos = pos->next)
  429     {
  430       if (-1 == pos->seek_request)
  431       {
  432         /* channel is not seeking, must be running or done */
  433         channels[plugin_off] = pos->channel;
  434       }
  435       else
  436       {
  437         /* not running this round, seeking! */
  438         channels[plugin_off] = NULL;
  439       }
  440       plugin_off++;
  441     }
  442     /* give plugins chance to send us meta data, seek or finished messages */
  443     if (-1 ==
  444         EXTRACTOR_IPC_channel_recv_ (channels,
  445                                      plugin_count,
  446                                      &process_plugin_reply,
  447                                      &prp))
  448     {
  449       /* serious problem in IPC; reset *all* channels */
  450       LOG ("Failed to receive message from channels; full reset\n");
  451       abort_all_channels (plugins);
  452       break;
  453     }
  454 
  455     /* calculate minimum seek request (or set done=0 to continue here) */
  456     done = 1;
  457     min_seek = -1;
  458     plugin_off = 0;
  459     for (pos = plugins; NULL != pos; pos = pos->next)
  460     {
  461       plugin_off++;
  462       if ( (1 == pos->round_finished) ||
  463            (NULL == pos->channel) )
  464       {
  465         continue;     /* inactive plugin */
  466       }
  467       if (-1 == pos->seek_request)
  468       {
  469         /* possibly more meta data at current position, at least
  470      this plugin is still working on it... */
  471         done = 0;
  472         break;
  473       }
  474       if (-1 != pos->seek_request)
  475       {
  476         if (SEEK_END == pos->seek_whence)
  477         {
  478           /* convert distance from end to absolute position */
  479           pos->seek_whence = 0;
  480           end = EXTRACTOR_datasource_get_size_ (ds, 1);
  481           if (pos->seek_request > end)
  482           {
  483             LOG ("Cannot seek to before the beginning of the file!\n");
  484             pos->seek_request = 0;
  485           }
  486           else
  487           {
  488             pos->seek_request = end - pos->seek_request;
  489           }
  490         }
  491         if ( (-1 == min_seek) ||
  492              (min_seek > pos->seek_request) )
  493         {
  494           min_seek = pos->seek_request;
  495         }
  496       }
  497     }
  498     data_available = -1;
  499     if ( (1 == done) &&
  500          (-1 != min_seek) &&
  501          (NULL != shm) )
  502     {
  503       /* current position done, but seek requested */
  504       done = 0;
  505       if (-1 ==
  506           (data_available = EXTRACTOR_IPC_shared_memory_set_ (shm,
  507                                                               ds,
  508                                                               min_seek,
  509                                                               DEFAULT_SHM_SIZE)))
  510       {
  511         LOG ("Failed to seek; full reset\n");
  512         abort_all_channels (plugins);
  513         break;
  514       }
  515     }
  516     /* if 'prp.file_finished', send 'abort' to plugins;
  517        if not, send 'seek' notification to plugins in range */
  518     for (pos = plugins; NULL != pos; pos = pos->next)
  519     {
  520       if (NULL == (channel = pos->channel))
  521       {
  522         /* Skipping plugin: channel down */
  523         continue;
  524       }
  525       if ( (-1 != pos->seek_request) &&
  526            (1 == prp.file_finished) )
  527       {
  528         send_discard_message (pos);
  529         pos->round_finished = 1;
  530         pos->seek_request = -1;
  531       }
  532       if ( (-1 != data_available) &&
  533            (-1 != pos->seek_request) &&
  534            (min_seek <= pos->seek_request) &&
  535            ( (min_seek + data_available > pos->seek_request) ||
  536              (min_seek == EXTRACTOR_datasource_get_size_ (ds, 0))) )
  537       {
  538         /* Notify plugin about seek to 'min_seek' */
  539         send_update_message (pos,
  540                              min_seek,
  541                              data_available,
  542                              ds);
  543         pos->seek_request = -1;
  544       }
  545       if (0 == pos->round_finished)
  546         done = 0; /* can't be done, plugin still active */
  547     }
  548   }
  549 
  550   if (0 == have_in_memory)
  551     return;
  552   /* run in-process plugins */
  553   ctx.finished = 0;
  554   ctx.ds = ds;
  555   ctx.proc = proc;
  556   ctx.proc_cls = proc_cls;
  557   ec.cls = &ctx;
  558   ec.read = &in_process_read;
  559   ec.seek = &in_process_seek;
  560   ec.get_size = &in_process_get_size;
  561   ec.proc = &in_process_proc;
  562   for (pos = plugins; NULL != pos; pos = pos->next)
  563   {
  564     if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
  565       continue;
  566     if (-1 == EXTRACTOR_plugin_load_ (pos))
  567       continue;
  568     ctx.plugin = pos;
  569     ec.config = pos->plugin_options;
  570     if (-1 == EXTRACTOR_datasource_seek_ (ds, 0, SEEK_SET))
  571     {
  572       LOG ("Failed to seek to 0 for in-memory plugins\n");
  573       return;
  574     }
  575     pos->extract_method (&ec);
  576     if (1 == ctx.finished)
  577       break;
  578   }
  579 }
  580 
  581 
  582 /**
  583  * Extract keywords from a file using the given set of plugins.
  584  * If needed, opens the file and loads its data (via mmap).  Then
  585  * decompresses it if the data is compressed.  Finally runs the
  586  * plugins on the (now possibly decompressed) data.
  587  *
  588  * @param plugins the list of plugins to use
  589  * @param filename the name of the file, can be NULL if data is not NULL
  590  * @param data data of the file in memory, can be NULL (in which
  591  *        case libextractor will open file) if filename is not NULL
  592  * @param size number of bytes in data, ignored if data is NULL
  593  * @param proc function to call for each meta data item found
  594  * @param proc_cls cls argument to @a proc
  595  */
  596 void
  597 EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
  598                    const char *filename,
  599                    const void *data,
  600                    size_t size,
  601                    EXTRACTOR_MetaDataProcessor proc,
  602                    void *proc_cls)
  603 {
  604   struct EXTRACTOR_Datasource *datasource;
  605   struct EXTRACTOR_SharedMemory *shm;
  606   struct EXTRACTOR_PluginList *pos;
  607   int have_oop;
  608 
  609   if (NULL == plugins)
  610     return;
  611   if (NULL == filename)
  612     datasource = EXTRACTOR_datasource_create_from_buffer_ (data, size,
  613                                                            proc, proc_cls);
  614   else
  615     datasource = EXTRACTOR_datasource_create_from_file_ (filename,
  616                                                          proc, proc_cls);
  617   if (NULL == datasource)
  618     return;
  619   shm = NULL;
  620   have_oop = 0;
  621   for (pos = plugins; NULL != pos; pos = pos->next)
  622   {
  623     if (NULL == shm)
  624       shm = pos->shm;
  625     if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
  626       have_oop = 1;
  627     pos->round_finished = 0;
  628   }
  629   if ( (NULL == shm) &&
  630        (1 == have_oop) )
  631   {
  632     /* need to create shared memory segment */
  633     shm = EXTRACTOR_IPC_shared_memory_create_ (DEFAULT_SHM_SIZE);
  634     if (NULL == shm)
  635     {
  636       LOG ("Failed to setup IPC\n");
  637       EXTRACTOR_datasource_destroy_ (datasource);
  638       return;
  639     }
  640   }
  641   for (pos = plugins; NULL != pos; pos = pos->next)
  642     if ( (NULL == pos->channel) &&
  643          (NULL != shm) &&
  644          (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) )
  645     {
  646       if (NULL == pos->shm)
  647       {
  648         pos->shm = shm;
  649         (void) EXTRACTOR_IPC_shared_memory_change_rc_ (shm, 1);
  650       }
  651       pos->channel = EXTRACTOR_IPC_channel_create_ (pos,
  652                                                     shm);
  653     }
  654   do_extract (plugins,
  655               shm,
  656               datasource,
  657               proc,
  658               proc_cls);
  659   EXTRACTOR_datasource_destroy_ (datasource);
  660 }
  661 
  662 
  663 /**
  664  * Initialize gettext and libltdl (and W32 if needed).
  665  */
  666 void __attribute__ ((constructor))
  667 EXTRACTOR_ltdl_init ()
  668 {
  669   int err;
  670 
  671 #if ENABLE_NLS
  672   bindtextdomain (PACKAGE, LOCALEDIR);
  673 #endif
  674   err = lt_dlinit ();
  675   if (err > 0)
  676   {
  677 #if DEBUG
  678     fprintf (stderr,
  679              _ ("Initialization of plugin mechanism failed: %s!\n"),
  680              lt_dlerror ());
  681 #endif
  682     return;
  683   }
  684 #if WINDOWS
  685   plibc_init_utf8 ("GNU", PACKAGE, 1);
  686   plibc_set_stat_size_size (sizeof (((struct stat *) 0)->st_size));
  687   plibc_set_stat_time_size (sizeof (((struct stat *) 0)->st_mtime));
  688 #endif
  689 }
  690 
  691 
  692 /**
  693  * Deinit.
  694  */
  695 void __attribute__ ((destructor))
  696 EXTRACTOR_ltdl_fini ()
  697 {
  698 #if WINDOWS
  699   plibc_shutdown ();
  700 #endif
  701   lt_dlexit ();
  702 }
  703 
  704 
  705 /* end of extractor.c */