"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/plugins/pdf_extractor.c" (30 Jan 2021, 5530 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pdf_extractor.c", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.5_vs_1.6.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2016 Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19  */
   20 /**
   21  * @file plugins/pdf_extractor.c
   22  * @brief plugin to support PDF files
   23  * @author Christian Grothoff
   24  *
   25  * PDF libraries today are a nightmare (TM).  So instead of doing the
   26  * fast thing and calling some library functions to parse the PDF,
   27  * we execute 'pdfinfo' and parse the output. Because that's 21st
   28  * century plumbing: nobody writes reasonable code anymore.
   29  */
   30 #include "platform.h"
   31 #include <extractor.h>
   32 #include <sys/types.h>
   33 #include <sys/wait.h>
   34 #include <signal.h>
   35 #include <unistd.h>
   36 
   37 /**
   38  * Entry in the mapping from control data to LE types.
   39  */
   40 struct Matches
   41 {
   42   /**
   43    * Key in the Pdfian control file.
   44    */
   45   const char *text;
   46 
   47   /**
   48    * Corresponding type in LE.
   49    */
   50   enum EXTRACTOR_MetaType type;
   51 };
   52 
   53 
   54 /**
   55  * Map from pdf-control entries to LE types.
   56  *
   57  * See output of 'pdfinfo'.
   58  */
   59 static struct Matches tmap[] = {
   60   {"Title",        EXTRACTOR_METATYPE_TITLE},
   61   {"Subject",      EXTRACTOR_METATYPE_SUBJECT},
   62   {"Keywords",     EXTRACTOR_METATYPE_KEYWORDS},
   63   {"Author",       EXTRACTOR_METATYPE_AUTHOR_NAME},
   64   {"Creator",      EXTRACTOR_METATYPE_CREATOR},
   65   {"Producer",     EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
   66   {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
   67   {"ModDate",      EXTRACTOR_METATYPE_MODIFICATION_DATE},
   68   {"PDF version",  EXTRACTOR_METATYPE_ENCODER_VERSION},
   69   {"Pages",        EXTRACTOR_METATYPE_PAGE_COUNT},
   70   {NULL, 0}
   71 };
   72 
   73 
   74 /**
   75  * Process the "stdout" file from pdfinfo.
   76  *
   77  * @param fout stdout of pdfinfo
   78  * @param proc function to call with meta data
   79  * @param proc_cls closure for @e proc
   80  */
   81 static void
   82 process_stdout (FILE *fout,
   83                 EXTRACTOR_MetaDataProcessor proc,
   84                 void *proc_cls)
   85 {
   86   unsigned int i;
   87   char line[1025];
   88   const char *psuffix;
   89   const char *colon;
   90 
   91   while (! feof (fout))
   92   {
   93     if (NULL == fgets (line, sizeof (line) - 1, fout))
   94       break;
   95     if (0 == strlen (line))
   96       continue;
   97     if ('\n' == line[strlen (line) - 1])
   98       line[strlen (line) - 1] = '\0';
   99     colon = strchr (line, (int) ':');
  100     if (NULL == colon)
  101       break;
  102     psuffix = colon + 1;
  103     while (isblank ((unsigned char) psuffix[0]))
  104       psuffix++;
  105     if (0 == strlen (psuffix))
  106       continue;
  107     for (i = 0; NULL != tmap[i].text; i++)
  108     {
  109       if (0 != strncasecmp (line,
  110                             tmap[i].text,
  111                             colon - line))
  112         continue;
  113       if (0 != proc (proc_cls,
  114                      "pdf",
  115                      tmap[i].type,
  116                      EXTRACTOR_METAFORMAT_UTF8,
  117                      "text/plain",
  118                      psuffix,
  119                      strlen (psuffix) + 1))
  120         return;
  121       break;
  122     }
  123   }
  124 }
  125 
  126 
  127 /**
  128  * Main entry method for the PDF extraction plugin.
  129  *
  130  * @param ec extraction context provided to the plugin
  131  */
  132 void
  133 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
  134 {
  135   uint64_t fsize;
  136   void *data;
  137   pid_t pid;
  138   int in[2];
  139   int out[2];
  140   FILE *fout;
  141   uint64_t pos;
  142 
  143   fsize = ec->get_size (ec->cls);
  144   if (fsize < 128)
  145     return;
  146   if (4 !=
  147       ec->read (ec->cls, &data, 4))
  148     return;
  149   if (0 != strncmp ("%PDF", data, 4))
  150     return;
  151   if (0 !=
  152       ec->seek (ec->cls, 0, SEEK_SET))
  153     return;
  154   if (0 != pipe (in))
  155     return;
  156   if (0 != pipe (out))
  157   {
  158     close (in[0]);
  159     close (in[1]);
  160     return;
  161   }
  162   pid = fork ();
  163   if (-1 == pid)
  164   {
  165     close (in[0]);
  166     close (in[1]);
  167     close (out[0]);
  168     close (out[1]);
  169     return;
  170   }
  171   if (0 == pid)
  172   {
  173     char *const args[] = {
  174       "pdfinfo",
  175       "-",
  176       NULL
  177     };
  178     /* am child, exec 'pdfinfo' */
  179     close (0);
  180     close (1);
  181     if ( (-1 == dup2 (in[0], 0)) ||
  182          (-1 == dup2 (out[1], 1)) )
  183       exit (1);
  184     close (in[0]);
  185     close (in[1]);
  186     close (out[0]);
  187     close (out[1]);
  188     execvp ("pdfinfo", args);
  189     exit (1);
  190   }
  191   /* am parent, send file */
  192   close (in[0]);
  193   close (out[1]);
  194   fout = fdopen (out[0], "r");
  195   if (NULL == fout)
  196   {
  197     close (in[1]);
  198     close (out[0]);
  199     kill (pid, SIGKILL);
  200     waitpid (pid, NULL, 0);
  201     return;
  202   }
  203   pos = 0;
  204   while (pos < fsize)
  205   {
  206     ssize_t got;
  207     size_t wpos;
  208 
  209     data = NULL;
  210     got = ec->read (ec->cls,
  211                     &data,
  212                     fsize - pos);
  213     if ( (-1 == got) ||
  214          (NULL == data) )
  215       break;
  216     wpos = 0;
  217     while (wpos < got)
  218     {
  219       ssize_t out;
  220 
  221       out = write (in[1], data + wpos, got - wpos);
  222       if (out <= 0)
  223         break;
  224       wpos += out;
  225     }
  226     if (wpos < got)
  227       break;
  228     pos += got;
  229   }
  230   close (in[1]);
  231   process_stdout (fout, ec->proc, ec->cls);
  232   fclose (fout);
  233   kill (pid, SIGKILL);
  234   waitpid (pid, NULL, 0);
  235 }
  236 
  237 
  238 /* end of pdf_extractor.c */