"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/plugins/dvi_extractor.c" (30 Jan 2021, 8709 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "dvi_extractor.c", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.6_vs_1.7.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2002, 2003, 2004, 2012, 2017, 2019 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19  */
   20 /**
   21  * @file plugins/dvi_extractor.c
   22  * @brief plugin to support DVI files (from LaTeX)
   23  * @author Christian Grothoff
   24  */
   25 #include "platform.h"
   26 #include "extractor.h"
   27 
   28 
   29 /**
   30  * Pair of a PostScipt prefix and the corresponding LE type.
   31  */
   32 struct Matches
   33 {
   34   /**
   35    * Prefix in the PS map.
   36    */
   37   const char *text;
   38 
   39   /**
   40    * Corresponding LE type.
   41    */
   42   enum EXTRACTOR_MetaType type;
   43 };
   44 
   45 
   46 /**
   47  * Map from PS names to LE types.
   48  */
   49 static struct Matches tmap[] = {
   50   { "/Title (",    EXTRACTOR_METATYPE_TITLE },
   51   { "/Subject (",  EXTRACTOR_METATYPE_SUBJECT },
   52   { "/Author (",   EXTRACTOR_METATYPE_AUTHOR_NAME },
   53   { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS },
   54   { "/Creator (",  EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
   55   { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
   56   { NULL, 0 }
   57 };
   58 
   59 
   60 /**
   61  * Parse a "ZZZ" tag.  Specifically, the data may contain a
   62  * postscript dictionary with metadata.
   63  *
   64  * @param data overall input stream
   65  * @param pos where in data is the zzz data
   66  * @param len how many bytes from 'pos' does the zzz data extend?
   67  * @param proc function to call with meta data found
   68  * @param proc_cls closure for proc
   69  * @return 0 to continue to extract, 1 to stop
   70  */
   71 static int
   72 parseZZZ (const char *data,
   73           size_t pos, size_t len,
   74           EXTRACTOR_MetaDataProcessor proc,
   75           void *proc_cls)
   76 {
   77   size_t slen;
   78   size_t end;
   79   unsigned int i;
   80 
   81   end = pos + len;
   82   slen = strlen ("ps:SDict begin [");
   83   if ( (len <= slen) ||
   84        (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) )
   85     return 0;
   86   pos += slen;
   87   while (pos < end)
   88   {
   89     for (i = 0; NULL != tmap[i].text; i++)
   90     {
   91       slen = strlen (tmap[i].text);
   92       if ( (pos + slen > end) ||
   93            (0 != strncmp (&data[pos], tmap[i].text, slen)) )
   94         continue;
   95       pos += slen;
   96       slen = pos;
   97       while ((slen < end) && (data[slen] != ')'))
   98         slen++;
   99       slen = slen - pos;
  100       {
  101         char value[slen + 1];
  102 
  103         value[slen] = '\0';
  104         memcpy (value, &data[pos], slen);
  105         if (0 != proc (proc_cls,
  106                        "dvi",
  107                        tmap[i].type,
  108                        EXTRACTOR_METAFORMAT_C_STRING,
  109                        "text/plain",
  110                        value,
  111                        slen + 1))
  112           return 1;
  113       }
  114       pos += slen + 1;
  115       break;
  116     }
  117     pos++;
  118   }
  119   return 0;
  120 }
  121 
  122 
  123 /**
  124  * Read 32-bit unsigned integer in big-endian format from 'data'.
  125  *
  126  * @param data pointer to integer (possibly unaligned)
  127  * @return 32-bit integer in host byte order
  128  */
  129 static uint32_t
  130 getIntAt (const void *data)
  131 {
  132   uint32_t p;
  133 
  134   memcpy (&p, data, 4);          /* ensure alignment! */
  135   return ntohl (p);
  136 }
  137 
  138 
  139 /**
  140  * Read 16-bit unsigned integer in big-endian format from 'data'.
  141  *
  142  * @param data pointer to integer (possibly unaligned)
  143  * @return 16-bit integer in host byte order
  144  */
  145 static uint16_t
  146 getShortAt (const void *data)
  147 {
  148   uint16_t p;
  149 
  150   memcpy (&p, data, sizeof (uint16_t));          /* ensure alignment! */
  151   return ntohs (p);
  152 }
  153 
  154 
  155 /**
  156  * Main entry method for the 'application/x-dvi' extraction plugin.
  157  *
  158  * @param ec extraction context provided to the plugin
  159  */
  160 void
  161 EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec)
  162 {
  163   unsigned int klen;
  164   uint32_t pos;
  165   uint32_t opos;
  166   unsigned int len;
  167   unsigned int pageCount;
  168   char pages[16];
  169   void *buf;
  170   unsigned char *data;
  171   uint64_t size;
  172   uint64_t off;
  173   ssize_t iret;
  174 
  175   if (40 >= (iret = ec->read (ec->cls, &buf, 1024)))
  176     return;
  177   data = buf;
  178   if ( (data[0] != 247) ||
  179        (data[1] != 2) )
  180     return;                /* cannot be DVI or unsupported version */
  181   klen = data[14];
  182   size = ec->get_size (ec->cls);
  183   if (size > 16 * 1024 * 1024)
  184     return; /* too large */
  185   if (klen + 15 > size)
  186     return; /* malformed klen */
  187   if (NULL == (data = malloc ((size_t) size)))
  188     return; /* out of memory */
  189   memcpy (data, buf, iret);
  190   off = iret;
  191   while (off < size)
  192   {
  193     if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024)))
  194     {
  195       free (data);
  196       return;
  197     }
  198     memcpy (&data[off], buf, iret);
  199     off += iret;
  200   }
  201   pos = size - 1;
  202   while ( (223 == data[pos]) &&
  203           (pos > 0) )
  204     pos--;
  205   if ( (2 != data[pos]) ||
  206        (pos < 40) )
  207     goto CLEANUP;
  208   pos--;
  209   pos -= 4;
  210   /* assert pos at 'post_post tag' */
  211   if (data[pos] != 249)
  212     goto CLEANUP;
  213   opos = pos;
  214   pos = getIntAt (&data[opos + 1]);
  215   if ( (pos + 25 > size) ||
  216        (pos + 25 < pos) )
  217     goto CLEANUP;
  218   /* assert pos at 'post' command */
  219   if (data[pos] != 248)
  220     goto CLEANUP;
  221   pageCount = 0;
  222   opos = pos;
  223   pos = getIntAt (&data[opos + 1]);
  224   while (1)
  225   {
  226     if (UINT32_MAX == pos)
  227       break;
  228     if ( (pos + 45 > size) ||
  229          (pos + 45 < pos) )
  230       goto CLEANUP;
  231     if (data[pos] != 139)       /* expect 'bop' */
  232       goto CLEANUP;
  233     pageCount++;
  234     opos = pos;
  235     pos = getIntAt (&data[opos + 41]);
  236     if (UINT32_MAX == pos)
  237       break;
  238     if (pos >= opos)
  239       goto CLEANUP;       /* invalid! */
  240   }
  241   /* ok, now we believe it's a dvi... */
  242   snprintf (pages,
  243             sizeof (pages),
  244             "%u",
  245             pageCount);
  246   if (0 != ec->proc (ec->cls,
  247                      "dvi",
  248                      EXTRACTOR_METATYPE_PAGE_COUNT,
  249                      EXTRACTOR_METAFORMAT_UTF8,
  250                      "text/plain",
  251                      pages,
  252                      strlen (pages) + 1))
  253     goto CLEANUP;
  254   if (0 != ec->proc (ec->cls,
  255                      "dvi",
  256                      EXTRACTOR_METATYPE_MIMETYPE,
  257                      EXTRACTOR_METAFORMAT_UTF8,
  258                      "text/plain",
  259                      "application/x-dvi",
  260                      strlen ("application/x-dvi") + 1))
  261     goto CLEANUP;
  262   {
  263     char comment[klen + 1];
  264 
  265     comment[klen] = '\0';
  266     memcpy (comment, &data[15], klen);
  267     if (0 != ec->proc (ec->cls,
  268                        "dvi",
  269                        EXTRACTOR_METATYPE_COMMENT,
  270                        EXTRACTOR_METAFORMAT_C_STRING,
  271                        "text/plain",
  272                        comment,
  273                        klen + 1))
  274       goto CLEANUP;
  275   }
  276   /* try to find PDF/ps special */
  277   pos = opos;
  278   while ( (size >= 100) &&
  279           (pos < size - 100) )
  280   {
  281     switch (data[pos])
  282     {
  283     case 139:                  /* begin page 'bop', we typically have to skip that one to
  284                                    find the zzz's */
  285       pos += 45;                /* skip bop */
  286       break;
  287     case 239:                  /* zzz1 */
  288       len = data[pos + 1];
  289       if ( (pos + 2 + len < size) &&
  290            (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc,
  291                            ec->cls)) )
  292         goto CLEANUP;
  293       pos += len + 2;
  294       break;
  295     case 240:                  /* zzz2 */
  296       len = getShortAt (&data[pos + 1]);
  297       if ( (pos + 3 + len < size) &&
  298            (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc,
  299                            ec->cls)) )
  300         goto CLEANUP;
  301       pos += len + 3;
  302       break;
  303     case 241:                  /* zzz3, who uses that? */
  304       len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3];
  305       if ( (pos + 4 + len < size) &&
  306            (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc,
  307                            ec->cls)) )
  308         goto CLEANUP;
  309       pos += len + 4;
  310       break;
  311     case 242:                  /* zzz4, hurray! */
  312       len = getIntAt (&data[pos + 1]);
  313       if ( (pos + 1 + len < size) &&
  314            (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc,
  315                            ec->cls)) )
  316         goto CLEANUP;
  317       pos += len + 5;
  318       break;
  319     default:                   /* unsupported opcode, abort scan */
  320       goto CLEANUP;
  321     }
  322   }
  323 CLEANUP:
  324   free (data);
  325 }
  326 
  327 
  328 /* end of dvi_extractor.c */