libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and still experimental doxygen-generated source code documentation)  

odf_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19 */
20 /**
21  * @file plugins/odf_extractor.c
22  * @brief plugin to support ODF files
23  * @author Christian Grothoff
24  */
25 #include "platform.h"
26 #include <ctype.h>
27 #include "extractor.h"
28 #include "unzip.h"
29 
30 /**
31  * Size in bytes of the buffers that receive the name of an entry
    * inside the ZIP archive (the filename_inzip arrays declared in the
    * functions of this file).
32  */
33 #define MAXFILENAME 256
34 
35 /**
36  * Name of the archive entry holding the meta-data in OO documents;
    * this is the entry the main extraction method looks up and scans.
37  */
38 #define METAFILE "meta.xml"
39 
40 
41 /**
42  * Mapping from ODF meta data strings to LE types.
    * One entry of the tmap table defined in this file.
43  */
44 struct Matches
45 {
46  /**
47  * ODF description: the element/attribute name that is searched for
    * in the meta-data text.  A NULL value marks the end of the table.
48  */
49  const char *text;
50 
51  /**
52  * Corresponding LE type.
    *
    * NOTE(review): the member declaration itself (listing line 54) is
    * absent from this rendering.  The extraction code reads it as
    * tmap[i].type and the table initialises it with
    * EXTRACTOR_METATYPE_* values -- confirm the exact declaration
    * against the original source.
53  */
55 };
56 
57 
58 /**
59  * NULL-terminated map from ODF meta data strings to LE types.
    *
    * The extraction method walks this table until it reaches the entry
    * whose text is NULL.  For every entry it searches the meta-data
    * first for the text wrapped in '<' and '>' (element form) and, if
    * that is not found, for the text followed by '="' (attribute form).
    * The "meta:user-defined" entries carry their meta:name attribute in
    * the text so that the element-form search matches the whole opening
    * tag.
60  */
61 static struct Matches tmap[] = {
62  { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
63  { "meta:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
64  { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
65  { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
66  { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
67  { "dc:language", EXTRACTOR_METATYPE_LANGUAGE },
68  { "dc:title", EXTRACTOR_METATYPE_TITLE },
69  { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
70  { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
71  { "meta:keyword", EXTRACTOR_METATYPE_KEYWORDS },
72  { "meta:user-defined meta:name=\"Info 1\"", EXTRACTOR_METATYPE_COMMENT },
73  { "meta:user-defined meta:name=\"Info 2\"", EXTRACTOR_METATYPE_COMMENT },
74  { "meta:user-defined meta:name=\"Info 3\"", EXTRACTOR_METATYPE_COMMENT },
75  { "meta:user-defined meta:name=\"Info 4\"", EXTRACTOR_METATYPE_COMMENT },
 /* terminator: a NULL text ends the loop over this table */
76  { NULL, 0 }
77 };
78 
79 
80 /**
81  * Obtain the mimetype of the archive by reading the 'mimetype'
82  * file of the ZIP.
83  *
    * An entry whose uncompressed size exceeds 1024 bytes is rejected.
    * Trailing whitespace is stripped from the content; if nothing
    * remains, NULL is returned.  A non-NULL result comes from malloc()
    * and is released with free() by the caller in this file.
    *
84  * @param uf unzip context to extract the mimetype from
85  * @return NULL if no mimetype could be found, otherwise the mime type
86  */
87 static char *
 /* NOTE(review): listing lines 88, 96, 101, 111, 117, 123, 127, 132 and
    145 are absent from this rendering, so the line naming the function
    and the callees of the checks below are not visible here.  The
    page's index names EXTRACTOR_common_unzip_go_find_local_file,
    _get_current_file_info, _open_current_file, _read_current_file and
    _close_current_file -- confirm against the original source. */
89 {
90  char filename_inzip[MAXFILENAME];
91  struct EXTRACTOR_UnzipFileInfo file_info;
92  char *buf;
93  size_t buf_size;
94 
 /* look up the entry called "mimetype"; give up if the call fails */
95  if (EXTRACTOR_UNZIP_OK !=
97  "mimetype",
98  2))
99  return NULL;
 /* fetch the entry's info record (used below for its uncompressed size) */
100  if (EXTRACTOR_UNZIP_OK !=
102  &file_info,
103  filename_inzip,
104  sizeof (filename_inzip),
105  NULL,
106  0,
107  NULL,
108  0))
109  return NULL;
110  if (EXTRACTOR_UNZIP_OK !=
112  return NULL;
113  buf_size = file_info.uncompressed_size;
114  if (buf_size > 1024)
115  {
116  /* way too large! */
118  return NULL;
119  }
 /* one extra byte for the terminating NUL written further down */
120  if (NULL == (buf = malloc (1 + buf_size)))
121  {
122  /* memory exhausted! */
124  return NULL;
125  }
 /* anything other than a complete read of buf_size bytes is a failure */
126  if (buf_size !=
128  buf,
129  buf_size))
130  {
131  free (buf);
133  return NULL;
134  }
135  /* found something */
136  buf[buf_size] = '\0';
 /* strip trailing whitespace; the cast keeps the isspace() argument
    non-negative for bytes >= 0x80 */
137  while ( (0 < buf_size) &&
138  isspace ( (unsigned char) buf[buf_size - 1]))
139  buf[--buf_size] = '\0';
 /* nothing left after stripping: report "no mimetype" */
140  if ('\0' == buf[0])
141  {
142  free (buf);
143  buf = NULL;
144  }
146  return buf;
147 }
148 
149 
150 /**
151  * Main entry method for the ODF extraction plugin.
152  *
     * If a mimetype can be read from the archive it is passed to
     * ec->proc.  The METAFILE entry is then loaded into memory (entries
     * larger than 128 KiB are skipped) and, provided it contains one of
     * three known xmlns declarations, scanned with plain string searches
     * for the names listed in tmap.  Every non-empty value found is
     * handed to ec->proc with mime type "text/plain"; a non-zero return
     * from ec->proc ends the extraction.
     *
153  * @param ec extraction context provided to the plugin
154  */
155 void
 /* NOTE(review): listing lines 156, 173-174, 179, 186, 191, 195, 202,
    206, 209, 217-218, 224-225, 230, 233, 305 and 319 are absent from
    this rendering.  They include the line naming the function, some
    ec->proc arguments and, presumably, the close/cleanup calls on the
    early-return paths -- confirm against the original source. */
157 {
158  char filename_inzip[MAXFILENAME];
159  struct EXTRACTOR_UnzipFile *uf;
160  struct EXTRACTOR_UnzipFileInfo file_info;
161  char *buf;
162  char *pbuf;
163  size_t buf_size;
164  unsigned int i;
165  char *mimetype;
166 
167  if (NULL == (uf = EXTRACTOR_common_unzip_open (ec)))
168  return;
169  if (NULL != (mimetype = libextractor_oo_getmimetype (uf)))
170  {
 /* report the mimetype; the length passed includes the terminating NUL */
171  if (0 != ec->proc (ec->cls,
172  "odf",
175  "text/plain",
176  mimetype,
177  strlen (mimetype) + 1))
178  {
180  free (mimetype);
181  return;
182  }
183  free (mimetype);
184  }
185  if (EXTRACTOR_UNZIP_OK !=
187  METAFILE,
188  2))
189  {
190  /* metafile not found */
192  return;
193  }
194  if (EXTRACTOR_UNZIP_OK !=
196  &file_info,
197  filename_inzip,
198  sizeof (filename_inzip),
199  NULL, 0, NULL, 0))
200  {
201  /* problems accessing metafile */
203  return;
204  }
205  if (EXTRACTOR_UNZIP_OK !=
207  {
208  /* problems with unzip */
210  return;
211  }
212 
213  buf_size = file_info.uncompressed_size;
214  if (buf_size > 128 * 1024)
215  {
216  /* too big to be meta-data! */
219  return;
220  }
 /* one extra byte so the content can be NUL-terminated below */
221  if (NULL == (buf = malloc (buf_size + 1)))
222  {
223  /* out of memory */
226  return;
227  }
228  if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size))
229  {
231  goto CLEANUP;
232  }
234  /* we don't do "proper" parsing of the meta-data but rather use some heuristics
235  to get values out that we understand */
 /* terminate the buffer so the str*() searches below stay inside it */
236  buf[buf_size] = '\0';
237  /* printf("%s\n", buf); */
238  /* try to find some of the typical OO xml headers */
 /* any one of the three namespace declarations is sufficient */
239  if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") !=
240  NULL) ||
241  (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") !=
242  NULL) ||
243  (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) )
244  {
245  /* accept as meta-data */
246  for (i = 0; NULL != tmap[i].text; i++)
247  {
248  char *spos;
249  char *epos;
250  char needle[256];
251  int oc;
252 
 /* every table entry is searched from the start of the buffer */
253  pbuf = buf;
254 
 /* one iteration per occurrence of the current entry */
255  while (1)
256  {
 /* element form first: "<name>" */
257  strcpy (needle, "<");
258  strcat (needle, tmap[i].text);
259  strcat (needle, ">");
260  spos = strstr (pbuf, needle);
261  if (NULL == spos)
262  {
 /* fallback, attribute form: name="value"; the value runs up to the
    next double quote (or the end of the buffer) */
263  strcpy (needle, tmap[i].text);
264  strcat (needle, "=\"");
265  spos = strstr (pbuf, needle);
266  if (spos == NULL)
267  break;
268  spos += strlen (needle);
269  epos = spos;
270  while ( (epos[0] != '\0') &&
271  (epos[0] != '"') )
272  epos++;
273  }
274  else
275  {
 /* skip tags that directly follow the opening tag; oc counts '<'
    characters not yet matched by a '>' */
276  oc = 0;
277  spos += strlen (needle);
278  while ( (spos[0] != '\0') &&
279  ( (spos[0] == '<') ||
280  (oc > 0) ) )
281  {
282  if (spos[0] == '<')
283  oc++;
284  if (spos[0] == '>')
285  oc--;
286  spos++;
287  }
 /* the value is the text up to the next '<' or '>' */
288  epos = spos;
289  while ( (epos[0] != '\0') &&
290  (epos[0] != '<') &&
291  (epos[0] != '>') )
292  {
293  epos++;
294  }
295  }
296  if (spos != epos)
297  {
 /* NUL-terminated copy of the value in a variable-length array on the
    stack; its size is bounded by the 128 KiB limit on buf above */
298  char key[epos - spos + 1];
299 
300  memcpy (key, spos, epos - spos);
301  key[epos - spos] = '\0';
 /* the length passed includes the terminating NUL; a non-zero return
    ends the extraction */
302  if (0 != ec->proc (ec->cls,
303  "odf",
304  tmap[i].type,
306  "text/plain",
307  key,
308  epos - spos + 1))
309  goto CLEANUP;
 /* resume the search for this entry after the value just reported */
310  pbuf = epos;
311  }
312  else
 /* an empty value ends the search for this entry */
313  break;
314  }
315  }
316  }
317 CLEANUP:
318  free (buf);
320 }
321 
322 
323 /* end of odf_extractor.c */
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_UNKNOWN_DATE
Definition: extractor.h:195
@ EXTRACTOR_METATYPE_LANGUAGE
Definition: extractor.h:157
@ EXTRACTOR_METATYPE_COMMENT
Definition: extractor.h:131
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE
Definition: extractor.h:194
@ EXTRACTOR_METATYPE_CREATOR
Definition: extractor.h:189
@ EXTRACTOR_METATYPE_CREATION_DATE
Definition: extractor.h:196
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_PAGE_COUNT
Definition: extractor.h:141
@ EXTRACTOR_METATYPE_MIMETYPE
Definition: extractor.h:129
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
@ EXTRACTOR_METATYPE_DESCRIPTION
Definition: extractor.h:182
#define MAXFILENAME
Definition: odf_extractor.c:33
static char * libextractor_oo_getmimetype(struct EXTRACTOR_UnzipFile *uf)
Definition: odf_extractor.c:88
void EXTRACTOR_odf_extract_method(struct EXTRACTOR_ExtractContext *ec)
#define METAFILE
Definition: odf_extractor.c:38
static struct Matches tmap[]
Definition: odf_extractor.c:61
platform specifics
EXTRACTOR_MetaDataProcessor proc
Definition: extractor.h:525
const char * text
Definition: deb_extractor.c:77
enum EXTRACTOR_MetaType type
Definition: deb_extractor.c:82
int EXTRACTOR_common_unzip_open_current_file(struct EXTRACTOR_UnzipFile *file)
Definition: unzip.c:1259
ssize_t EXTRACTOR_common_unzip_read_current_file(struct EXTRACTOR_UnzipFile *file, void *buf, size_t len)
Definition: unzip.c:1038
int EXTRACTOR_common_unzip_go_find_local_file(struct EXTRACTOR_UnzipFile *file, const char *szFileName, int iCaseSensitivity)
Definition: unzip.c:969
int EXTRACTOR_common_unzip_get_current_file_info(struct EXTRACTOR_UnzipFile *file, struct EXTRACTOR_UnzipFileInfo *pfile_info, char *szFileName, uLong fileNameBufferSize, void *extraField, uLong extraFieldBufferSize, char *szComment, uLong commentBufferSize)
Definition: unzip.c:908
struct EXTRACTOR_UnzipFile * EXTRACTOR_common_unzip_open(struct EXTRACTOR_ExtractContext *ec)
Definition: unzip.c:1421
int EXTRACTOR_common_unzip_close(struct EXTRACTOR_UnzipFile *file)
Definition: unzip.c:854
int EXTRACTOR_common_unzip_close_current_file(struct EXTRACTOR_UnzipFile *file)
Definition: unzip.c:823
API to access ZIP archives.
#define EXTRACTOR_UNZIP_OK
Definition: unzip.h:37