"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/plugins/odf_extractor.c" (30 Jan 2021, 9455 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "odf_extractor.c", see the Fossies "Dox" file reference documentation.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19 */
   20 /**
   21  * @file plugins/odf_extractor.c
   22  * @brief plugin to support ODF files
   23  * @author Christian Grothoff
   24  */
   25 #include "platform.h"
   26 #include <ctype.h>
   27 #include "extractor.h"
   28 #include "unzip.h"
   29 
   /**
    * Size, in bytes, of the buffers that receive the name of an
    * entry stored inside the ZIP archive.
    */
   #define MAXFILENAME 256

   /**
    * Name of the archive member that carries the document
    * meta-data in OO documents.
    */
   #define METAFILE "meta.xml"
   39 
   40 
   41 /**
   42  * Mapping from ODF meta data strings to LE types.
   43  */
   44 struct Matches
   45 {
   46   /**
   47    * ODF description.
   48    */
   49   const char *text;
   50 
   51   /**
   52    * Corresponding LE type.
   53    */
   54   enum EXTRACTOR_MetaType type;
   55 };
   56 
   57 
   58 /**
   59  * NULL-terminated map from ODF meta data strings to LE types.
   60  */
   61 static struct Matches tmap[] = {
   62   { "meta:generator",     EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
   63   { "meta:page-count",    EXTRACTOR_METATYPE_PAGE_COUNT },
   64   { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
   65   { "dc:date",            EXTRACTOR_METATYPE_UNKNOWN_DATE },
   66   { "dc:creator",         EXTRACTOR_METATYPE_CREATOR },
   67   { "dc:language",        EXTRACTOR_METATYPE_LANGUAGE },
   68   { "dc:title",           EXTRACTOR_METATYPE_TITLE },
   69   { "dc:description",     EXTRACTOR_METATYPE_DESCRIPTION },
   70   { "dc:subject",         EXTRACTOR_METATYPE_SUBJECT },
   71   { "meta:keyword",       EXTRACTOR_METATYPE_KEYWORDS },
   72   { "meta:user-defined meta:name=\"Info 1\"", EXTRACTOR_METATYPE_COMMENT },
   73   { "meta:user-defined meta:name=\"Info 2\"", EXTRACTOR_METATYPE_COMMENT },
   74   { "meta:user-defined meta:name=\"Info 3\"", EXTRACTOR_METATYPE_COMMENT },
   75   { "meta:user-defined meta:name=\"Info 4\"", EXTRACTOR_METATYPE_COMMENT },
   76   { NULL, 0 }
   77 };
   78 
   79 
   80 /**
   81  * Obtain the mimetype of the archive by reading the 'mimetype'
   82  * file of the ZIP.
   83  *
   84  * @param uf unzip context to extract the mimetype from
   85  * @return NULL if no mimetype could be found, otherwise the mime type
   86  */
   87 static char *
   88 libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile *uf)
   89 {
   90   char filename_inzip[MAXFILENAME];
   91   struct EXTRACTOR_UnzipFileInfo file_info;
   92   char *buf;
   93   size_t buf_size;
   94 
   95   if (EXTRACTOR_UNZIP_OK !=
   96       EXTRACTOR_common_unzip_go_find_local_file (uf,
   97                                                  "mimetype",
   98                                                  2))
   99     return NULL;
  100   if (EXTRACTOR_UNZIP_OK !=
  101       EXTRACTOR_common_unzip_get_current_file_info (uf,
  102                                                     &file_info,
  103                                                     filename_inzip,
  104                                                     sizeof (filename_inzip),
  105                                                     NULL,
  106                                                     0,
  107                                                     NULL,
  108                                                     0))
  109     return NULL;
  110   if (EXTRACTOR_UNZIP_OK !=
  111       EXTRACTOR_common_unzip_open_current_file (uf))
  112     return NULL;
  113   buf_size = file_info.uncompressed_size;
  114   if (buf_size > 1024)
  115   {
  116     /* way too large! */
  117     EXTRACTOR_common_unzip_close_current_file (uf);
  118     return NULL;
  119   }
  120   if (NULL == (buf = malloc (1 + buf_size)))
  121   {
  122     /* memory exhausted! */
  123     EXTRACTOR_common_unzip_close_current_file (uf);
  124     return NULL;
  125   }
  126   if (buf_size !=
  127       (size_t) EXTRACTOR_common_unzip_read_current_file (uf,
  128                                                          buf,
  129                                                          buf_size))
  130   {
  131     free (buf);
  132     EXTRACTOR_common_unzip_close_current_file (uf);
  133     return NULL;
  134   }
  135   /* found something */
  136   buf[buf_size] = '\0';
  137   while ( (0 < buf_size) &&
  138           isspace ( (unsigned char) buf[buf_size - 1]))
  139     buf[--buf_size] = '\0';
  140   if ('\0' == buf[0])
  141   {
  142     free (buf);
  143     buf = NULL;
  144   }
  145   EXTRACTOR_common_unzip_close_current_file (uf);
  146   return buf;
  147 }
  148 
  149 
  150 /**
  151  * Main entry method for the ODF extraction plugin.
  152  *
  153  * @param ec extraction context provided to the plugin
  154  */
  155 void
  156 EXTRACTOR_odf_extract_method (struct EXTRACTOR_ExtractContext *ec)
  157 {
  158   char filename_inzip[MAXFILENAME];
  159   struct EXTRACTOR_UnzipFile *uf;
  160   struct EXTRACTOR_UnzipFileInfo file_info;
  161   char *buf;
  162   char *pbuf;
  163   size_t buf_size;
  164   unsigned int i;
  165   char *mimetype;
  166 
  167   if (NULL == (uf = EXTRACTOR_common_unzip_open (ec)))
  168     return;
  169   if (NULL != (mimetype = libextractor_oo_getmimetype (uf)))
  170   {
  171     if (0 != ec->proc (ec->cls,
  172                        "odf",
  173                        EXTRACTOR_METATYPE_MIMETYPE,
  174                        EXTRACTOR_METAFORMAT_UTF8,
  175                        "text/plain",
  176                        mimetype,
  177                        strlen (mimetype) + 1))
  178     {
  179       EXTRACTOR_common_unzip_close (uf);
  180       free (mimetype);
  181       return;
  182     }
  183     free (mimetype);
  184   }
  185   if (EXTRACTOR_UNZIP_OK !=
  186       EXTRACTOR_common_unzip_go_find_local_file (uf,
  187                                                  METAFILE,
  188                                                  2))
  189   {
  190     /* metafile not found */
  191     EXTRACTOR_common_unzip_close (uf);
  192     return;
  193   }
  194   if (EXTRACTOR_UNZIP_OK !=
  195       EXTRACTOR_common_unzip_get_current_file_info (uf,
  196                                                     &file_info,
  197                                                     filename_inzip,
  198                                                     sizeof (filename_inzip),
  199                                                     NULL, 0, NULL, 0))
  200   {
  201     /* problems accessing metafile */
  202     EXTRACTOR_common_unzip_close (uf);
  203     return;
  204   }
  205   if (EXTRACTOR_UNZIP_OK !=
  206       EXTRACTOR_common_unzip_open_current_file (uf))
  207   {
  208     /* problems with unzip */
  209     EXTRACTOR_common_unzip_close (uf);
  210     return;
  211   }
  212 
  213   buf_size = file_info.uncompressed_size;
  214   if (buf_size > 128 * 1024)
  215   {
  216     /* too big to be meta-data! */
  217     EXTRACTOR_common_unzip_close_current_file (uf);
  218     EXTRACTOR_common_unzip_close (uf);
  219     return;
  220   }
  221   if (NULL == (buf = malloc (buf_size + 1)))
  222   {
  223     /* out of memory */
  224     EXTRACTOR_common_unzip_close_current_file (uf);
  225     EXTRACTOR_common_unzip_close (uf);
  226     return;
  227   }
  228   if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size))
  229   {
  230     EXTRACTOR_common_unzip_close_current_file (uf);
  231     goto CLEANUP;
  232   }
  233   EXTRACTOR_common_unzip_close_current_file (uf);
  234   /* we don't do "proper" parsing of the meta-data but rather use some heuristics
  235      to get values out that we understand */
  236   buf[buf_size] = '\0';
  237   /* printf("%s\n", buf); */
  238   /* try to find some of the typical OO xml headers */
  239   if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") !=
  240         NULL) ||
  241        (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") !=
  242         NULL) ||
  243        (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) )
  244   {
  245     /* accept as meta-data */
  246     for (i = 0; NULL  != tmap[i].text; i++)
  247     {
  248       char *spos;
  249       char *epos;
  250       char needle[256];
  251       int oc;
  252 
  253       pbuf = buf;
  254 
  255       while (1)
  256       {
  257         strcpy (needle, "<");
  258         strcat (needle, tmap[i].text);
  259         strcat (needle, ">");
  260         spos = strstr (pbuf, needle);
  261         if (NULL == spos)
  262         {
  263           strcpy (needle, tmap[i].text);
  264           strcat (needle, "=\"");
  265           spos = strstr (pbuf, needle);
  266           if (spos == NULL)
  267             break;
  268           spos += strlen (needle);
  269           epos = spos;
  270           while ( (epos[0] != '\0') &&
  271                   (epos[0] != '"') )
  272             epos++;
  273         }
  274         else
  275         {
  276           oc = 0;
  277           spos += strlen (needle);
  278           while ( (spos[0] != '\0') &&
  279                   ( (spos[0] == '<') ||
  280                     (oc > 0) ) )
  281           {
  282             if (spos[0] == '<')
  283               oc++;
  284             if (spos[0] == '>')
  285               oc--;
  286             spos++;
  287           }
  288           epos = spos;
  289           while ( (epos[0] != '\0') &&
  290                   (epos[0] != '<') &&
  291                   (epos[0] != '>') )
  292           {
  293             epos++;
  294           }
  295         }
  296         if (spos != epos)
  297         {
  298           char key[epos - spos + 1];
  299 
  300           memcpy (key, spos, epos - spos);
  301           key[epos - spos] = '\0';
  302           if (0 != ec->proc (ec->cls,
  303                              "odf",
  304                              tmap[i].type,
  305                              EXTRACTOR_METAFORMAT_UTF8,
  306                              "text/plain",
  307                              key,
  308                              epos - spos + 1))
  309             goto CLEANUP;
  310           pbuf = epos;
  311         }
  312         else
  313           break;
  314       }
  315     }
  316   }
  317 CLEANUP:
  318   free (buf);
  319   EXTRACTOR_common_unzip_close (uf);
  320 }
  321 
  322 
  323 /* end of odf_extractor.c */