libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

ole2_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2004, 2005, 2006, 2007, 2009, 2012, 2018 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19 
20  This code makes extensive use of libgsf
21  -- the Gnome Structured File Library
22  Copyright Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
23 
24  Part of this code was adapted from wordleaker.
25 */
26 /**
27  * @file plugins/ole2_extractor.c
28  * @brief plugin to support OLE2 (DOC, XLS, etc.) files
29  * @author Christian Grothoff
30  */
31 #include "platform.h"
32 #include "extractor.h"
33 #include "convert.h"
34 #include <glib-object.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <gsf/gsf-utils.h>
39 #include <gsf/gsf-input-impl.h>
40 #include <gsf/gsf-input-memory.h>
41 #include <gsf/gsf-impl-utils.h>
42 #include <gsf/gsf-infile.h>
43 #include <gsf/gsf-infile-msole.h>
44 #include <gsf/gsf-msole-utils.h>
45 
46 
47 /**
48  * Set to 1 to use our own GsfInput subclass which supports seeking
49  * and thus can handle very large files. Set to 0 to use the simple
50  * gsf in-memory buffer (which can only access the first ~16k) for
51  * debugging.
52  */
53 #define USE_LE_INPUT 1
54 
55 
56 /**
57  * Give the given UTF8 string to LE by calling 'proc'.
58  *
59  * @param proc callback to invoke
60  * @param proc_cls closure for proc
61  * @param phrase metadata string to pass; may include spaces
62  * just double-quotes or just a space in a double quote;
63  * in those cases, nothing should be done
64  * @param type meta data type to use
65  * @return if 'proc' returned 1, otherwise 0
66  */
67 static int
69  void *proc_cls,
70  const char *phrase,
72 {
73  char *tmp;
74  int ret;
75 
76  if (0 == strlen (phrase))
77  return 0;
78  if (0 == strcmp (phrase, "\"\""))
79  return 0;
80  if (0 == strcmp (phrase, "\" \""))
81  return 0;
82  if (0 == strcmp (phrase, " "))
83  return 0;
84  if (NULL == (tmp = strdup (phrase)))
85  return 0;
86 
87  while ( (strlen (tmp) > 0) &&
88  (isblank ((unsigned char) tmp [strlen (tmp) - 1])) )
89  tmp [strlen (tmp) - 1] = '\0';
90  ret = proc (proc_cls,
91  "ole2",
92  type,
94  "text/plain",
95  tmp,
96  strlen (tmp) + 1);
97  free (tmp);
98  return ret;
99 }
100 
101 
102 /**
103  * Entry in the map from OLE meta type strings
104  * to LE types.
105  */
106 struct Matches
107 {
108  /**
109  * OLE description.
110  */
111  const char *text;
112 
113  /**
114  * Corresponding LE type.
115  */
117 };
118 
119 
120 static struct Matches tmap[] = {
121  { "Title", EXTRACTOR_METATYPE_TITLE },
122  { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
123  { "Category", EXTRACTOR_METATYPE_SECTION },
124  { "Manager", EXTRACTOR_METATYPE_MANAGER },
125  { "Company", EXTRACTOR_METATYPE_COMPANY },
126  { "Subject", EXTRACTOR_METATYPE_SUBJECT },
127  { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
128  { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
129  { "Comments", EXTRACTOR_METATYPE_COMMENT },
130  { "Template", EXTRACTOR_METATYPE_TEMPLATE },
131  { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
133  { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
135  { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
136  { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE },
137  { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
138  { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
139  { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
140  { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
141  { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
142  { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
143  { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
144  { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
145  { "dc:title", EXTRACTOR_METATYPE_TITLE },
146  { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
147  { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
148  { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
149  { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
150  { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
151  { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
152  { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
153  { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
154  { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
155  { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
156  /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE }, */
157  /* { "gsf:security", EXTRACTOR_SECURITY }, */
158  /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
159  /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
160  /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
161  { NULL, 0 }
162 };
163 
164 
165 /**
166  * Closure for 'process_metadata'.
167  */
169 {
170  /**
171  * Function to call for meta data that was found.
172  */
174 
175  /**
176  * Closure for @e proc.
177  */
178  void *proc_cls;
179 
180  /**
181  * Return value; 0 to continue to extract, 1 if we are done
182  */
183  int ret;
184 };
185 
186 
187 /**
188  * Function invoked by 'gst_msole_metadata_read' with
189  * metadata found in the document.
190  *
191  * @param key 'const char *' describing the meta data
192  * @param value the UTF8 representation of the meta data
193  * @param user_data our 'struct ProcContext' (closure)
194  */
195 static void
196 process_metadata (gpointer key,
197  gpointer value,
198  gpointer user_data)
199 {
200  const char *type = key;
201  const GsfDocProp *prop = value;
202  struct ProcContext *pc = user_data;
203  const GValue *gval;
204  char *contents;
205  int pos;
206 
207  if ( (NULL == key) ||
208  (NULL == value) )
209  return;
210  if (0 != pc->ret)
211  return;
212  gval = gsf_doc_prop_get_val (prop);
213 
214  if (G_VALUE_TYPE (gval) == G_TYPE_STRING)
215  {
216  const char *gvals;
217 
218  gvals = g_value_get_string (gval);
219  if (NULL == gvals)
220  return;
221  contents = strdup (gvals);
222  }
223  else
224  {
225  /* convert other formats? */
226  contents = g_strdup_value_contents (gval);
227  }
228  if (NULL == contents)
229  return;
230  if (0 == strcmp (type,
231  "meta:generator"))
232  {
233  const char *mimetype = "application/vnd.ms-files";
234  struct
235  {
236  const char *v;
237  const char *m;
238  } mm[] = {
239  { "Microsoft Word", "application/msword" },
240  { "Microsoft Office Word", "application/msword" },
241  { "Microsoft Excel", "application/vnd.ms-excel" },
242  { "Microsoft Office Excel", "application/vnd.ms-excel" },
243  { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
244  { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint"},
245  { "Microsoft Project", "application/vnd.ms-project" },
246  { "Microsoft Visio", "application/vnd.visio" },
247  { "Microsoft Office", "application/vnd.ms-office" },
248  { NULL, NULL }
249  };
250  int i;
251 
252  for (i = 0; NULL != mm[i].v; i++)
253  if (0 == strncmp (value,
254  mm[i].v,
255  strlen (mm[i].v) + 1))
256  {
257  mimetype = mm[i].m;
258  break;
259  }
260  if (0 != add_metadata (pc->proc,
261  pc->proc_cls,
262  mimetype,
264  {
265  free (contents);
266  pc->ret = 1;
267  return;
268  }
269  }
270  for (pos = 0; NULL != tmap[pos].text; pos++)
271  if (0 == strcmp (tmap[pos].text,
272  type))
273  break;
274  if ( (NULL != tmap[pos].text) &&
275  (0 != add_metadata (pc->proc, pc->proc_cls,
276  contents,
277  tmap[pos].type)) )
278  {
279  free (contents);
280  pc->ret = 1;
281  return;
282  }
283  free (contents);
284 }
285 
286 
287 /**
288  * Function called on (Document)SummaryInformation OLE
289  * streams.
290  *
291  * @param in the input OLE stream
292  * @param proc function to call on meta data found
293  * @param proc_cls closure for proc
294  * @return 0 to continue to extract, 1 if we are done
295  */
296 static int
297 process (GsfInput *in,
299  void *proc_cls)
300 {
301  struct ProcContext pc;
302  GsfDocMetaData *sections;
303  GError *error;
304 
305  pc.proc = proc;
306  pc.proc_cls = proc_cls;
307  pc.ret = 0;
308  sections = gsf_doc_meta_data_new ();
309 #ifdef HAVE_GSF_DOC_META_DATA_READ_FROM_MSOLE
310  error = gsf_doc_meta_data_read_from_msole (sections, in);
311 #else
312  error = gsf_msole_metadata_read (in, sections);
313 #endif
314  if (NULL == error)
315  {
316  gsf_doc_meta_data_foreach (sections,
318  &pc);
319  }
320  else
321  {
322  g_error_free (error);
323  }
324  g_object_unref (G_OBJECT (sections));
325  return pc.ret;
326 }
327 
328 
329 /**
330  * Function called on SfxDocumentInfo OLE
331  * streams.
332  *
333  * @param src the input OLE stream
334  * @param proc function to call on meta data found
335  * @param proc_cls closure for proc
336  * @return 0 to continue to extract, 1 if we are done
337  */
338 static int
339 process_star_office (GsfInput *src,
341  void *proc_cls)
342 {
343  off_t size = gsf_input_size (src);
344 
345  if ( (size < 0x374) ||
346  (size > 4 * 1024 * 1024) ) /* == 0x375?? */
347  return 0;
348  {
349  char buf[size];
350 
351  gsf_input_read (src, size, (unsigned char*) buf);
352  if ( (buf[0] != 0x0F) ||
353  (buf[1] != 0x0) ||
354  (0 != strncmp (&buf[2],
355  "SfxDocumentInfo",
356  strlen ("SfxDocumentInfo"))) ||
357  (buf[0x11] != 0x0B) ||
358  (buf[0x13] != 0x00) || /* pw protected! */
359  (buf[0x12] != 0x00) )
360  return 0;
361  buf[0xd3] = '\0';
362  if ( (buf[0x94] + buf[0x93] > 0) &&
363  (0 != add_metadata (proc, proc_cls,
364  &buf[0x95],
366  return 1;
367  buf[0x114] = '\0';
368  if ( (buf[0xd5] + buf[0xd4] > 0) &&
369  (0 != add_metadata (proc, proc_cls,
370  &buf[0xd6],
372  return 1;
373  buf[0x215] = '\0';
374  if ( (buf[0x115] + buf[0x116] > 0) &&
375  (0 != add_metadata (proc, proc_cls,
376  &buf[0x117],
378  return 1;
379  buf[0x296] = '\0';
380  if ( (buf[0x216] + buf[0x217] > 0) &&
381  (0 != add_metadata (proc, proc_cls,
382  &buf[0x218],
384  return 1;
385  /* fixme: do timestamps,
386  mime-type, user-defined info's */
387  }
388  return 0;
389 }
390 
391 
392 /**
393  * We use "__" to translate using iso-639.
394  *
395  * @param a string to translate
396  * @return translated string
397  */
398 #define __(a) dgettext ("iso-639", a)
399 
400 
/**
 * Get the language string for the given language ID (lid)
 * value.
 *
 * Names wrapped in '__' are translated using the "iso-639" text
 * domain, names wrapped in '_' using the regular one.  The string
 * literals stay inside the translation macros so that the message
 * extraction tools can still find them.
 *
 * @param lid language id value
 * @return language string corresponding to the lid,
 *         NULL if the lid is not known
 */
static const char *
lid_to_language (unsigned int lid)
{
  switch (lid)
  {
  case 0x0400: return _ ("No Proofing");
  case 0x0401: return __ ("Arabic");
  case 0x0402: return __ ("Bulgarian");
  case 0x0403: return __ ("Catalan");
  case 0x0404: return _ ("Traditional Chinese");
  case 0x0804: return _ ("Simplified Chinese");
  case 0x0405: return __ ("Czech"); /* 0x0405 is Czech, not Chechen */
  case 0x0406: return __ ("Danish");
  case 0x0407: return __ ("German");
  case 0x0807: return _ ("Swiss German");
  case 0x0408: return __ ("Greek");
  case 0x0409: return _ ("U.S. English");
  case 0x0809: return _ ("U.K. English");
  case 0x0c09: return _ ("Australian English");
  case 0x040a: return _ ("Castilian Spanish");
  case 0x080a: return _ ("Mexican Spanish");
  case 0x040b: return __ ("Finnish");
  case 0x040c: return __ ("French");
  case 0x080c: return _ ("Belgian French");
  case 0x0c0c: return _ ("Canadian French");
  case 0x100c: return _ ("Swiss French");
  case 0x040d: return __ ("Hebrew");
  case 0x040e: return __ ("Hungarian");
  case 0x040f: return __ ("Icelandic");
  case 0x0410: return __ ("Italian");
  case 0x0810: return _ ("Swiss Italian");
  case 0x0411: return __ ("Japanese");
  case 0x0412: return __ ("Korean");
  case 0x0413: return __ ("Dutch");
  case 0x0813: return _ ("Belgian Dutch");
  case 0x0414: return _ ("Norwegian Bokmal");
  case 0x0814: return __ ("Norwegian Nynorsk");
  case 0x0415: return __ ("Polish");
  case 0x0416: return __ ("Brazilian Portuguese");
  case 0x0816: return __ ("Portuguese");
  case 0x0417: return _ ("Rhaeto-Romanic");
  case 0x0418: return __ ("Romanian");
  case 0x0419: return __ ("Russian");
  case 0x041a: return _ ("Croato-Serbian (Latin)");
  case 0x081a: return _ ("Serbo-Croatian (Cyrillic)");
  case 0x041b: return __ ("Slovak");
  case 0x041c: return __ ("Albanian");
  case 0x041d: return __ ("Swedish");
  case 0x041e: return __ ("Thai");
  case 0x041f: return __ ("Turkish");
  case 0x0420: return __ ("Urdu");
  case 0x0421: return __ ("Bahasa");
  case 0x0422: return __ ("Ukrainian");
  case 0x0423: return __ ("Byelorussian");
  case 0x0424: return __ ("Slovenian");
  case 0x0425: return __ ("Estonian");
  case 0x0426: return __ ("Latvian");
  case 0x0427: return __ ("Lithuanian");
  case 0x0429: return _ ("Farsi");
  case 0x042D: return __ ("Basque");
  case 0x042F: return __ ("Macedonian");
  case 0x0436: return __ ("Afrikaans");
  case 0x043E: return __ ("Malay"); /* 0x043E is Malay; Malayalam is 0x044C */
  default:     return NULL;
  }
}
533 
534 
535 /**
536  * Extract editing history from XTable stream.
537  *
538  * @param stream OLE stream to process
539  * @param lcSttbSavedBy length of the revision history in bytes
540  * @param fcSttbSavedBy offset of the revision history in the stream
541  * @param proc function to call on meta data found
542  * @param proc_cls closure for proc
543  * @return 0 to continue to extract, 1 if we are done
544  */
545 static int
546 history_extract (GsfInput *stream,
547  unsigned int lcbSttbSavedBy,
548  unsigned int fcSttbSavedBy,
550  void *proc_cls)
551 {
552  unsigned int where;
553  unsigned char *lbuffer;
554  unsigned int i;
555  unsigned int length;
556  char *author;
557  char *filename;
558  char *rbuf;
559  unsigned int nRev;
560  int ret;
561 
562  /* goto offset of revision information */
563  gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET);
564  if (gsf_input_remaining (stream) < lcbSttbSavedBy)
565  return 0;
566  if (NULL == (lbuffer = malloc (lcbSttbSavedBy)))
567  return 0;
568  /* read all the revision history */
569  gsf_input_read (stream, lcbSttbSavedBy, lbuffer);
570  /* there are n strings, so n/2 revisions (author & file) */
571  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
572  where = 6;
573  ret = 0;
574  for (i = 0; i < nRev; i++)
575  {
576  if (where >= lcbSttbSavedBy)
577  break;
578  length = lbuffer[where++];
579  if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
580  (where + 2 * length + 2 <= where) )
581  break;
582  author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
583  length * 2,
584  "UTF-16BE");
585  where += length * 2 + 1;
586  length = lbuffer[where++];
587  if ( (where + 2 * length >= lcbSttbSavedBy) ||
588  (where + 2 * length + 1 <= where) )
589  {
590  if (NULL != author)
591  free (author);
592  break;
593  }
594  filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
595  length * 2,
596  "UTF-16BE");
597  where += length * 2 + 1;
598  if ( (NULL != author) &&
599  (NULL != filename) )
600  {
601  size_t bsize;
602 
603  bsize = strlen (author) + strlen (filename) + 512;
604  if (NULL != (rbuf = malloc (bsize)))
605  {
606  int snret;
607 
608  snret = snprintf (rbuf,
609  bsize,
610  _ ("Revision #%u: Author `%s' worked on `%s'"),
611  i,
612  author,
613  filename);
614  if ( (-1 != snret) &&
615  (bsize > (size_t) snret) )
616  {
617  ret = add_metadata (proc,
618  proc_cls,
619  rbuf,
621  }
622  free (rbuf);
623  }
624  }
625  if (NULL != author)
626  free (author);
627  if (NULL != filename)
628  free (filename);
629  if (0 != ret)
630  break;
631  }
632  free (lbuffer);
633  return ret;
634 }
635 
636 
637 /* *************************** custom GSF input method ***************** */
638 
639 #define LE_TYPE_INPUT (le_input_get_type ())
640 #define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), \
641  LE_TYPE_INPUT, \
642  LeInput))
643 #define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), \
644  LE_TYPE_INPUT, \
645  LeInputClass))
646 #define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), \
647  LE_TYPE_INPUT))
648 #define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), \
649  LE_TYPE_INPUT))
650 #define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), \
651  LE_TYPE_INPUT, \
652  LeInputClass))
653 
654 /**
655  * Internal state of an "LeInput" object.
656  */
657 typedef struct _LeInputPrivate
658 {
659  /**
660  * Our extraction context.
661  */
664 
665 
666 /**
667  * Overall state of an "LeInput" object.
668  */
669 typedef struct _LeInput
670 {
671  /**
672  * Inherited state from parent (GsfInput).
673  */
674  GsfInput input;
675 
676  /*< private > */
677  /**
678  * Private state of the LeInput.
679  */
682 
683 
684 /**
685  * LeInput's class state.
686  */
687 typedef struct _LeInputClass
688 {
689  /**
690  * GsfInput is our parent class.
691  */
692  GsfInputClass parent_class;
693 
694  /* Padding for future expansion */
695  void (*_gtk_reserved1)(void);
696  void (*_gtk_reserved2)(void);
697  void (*_gtk_reserved3)(void);
698  void (*_gtk_reserved4)(void);
700 
701 
702 /**
703  * Constructor for LeInput objects.
704  *
705  * @param ec extraction context to use
706  * @return the LeInput, NULL on error
707  */
708 GsfInput *
710 
711 
712 /**
713  * Class initializer for the "LeInput" class.
714  *
715  * @param class class object to initialize
716  */
717 static void
719 
720 
721 /**
722  * Initialize internal state of fresh input object.
723  *
724  * @param input object to initialize
725  */
726 static void
727 le_input_init (LeInput *input);
728 
729 
730 /**
731  * Macro to create LeInput type definition and register the class.
732  */
734  GSF_INPUT_TYPE)
735 
736 
737 /**
738  * Duplicate input, leaving the new one at the same offset.
739  *
740  * @param input the input to duplicate
741  * @param err location for error reporting, can be NULL
742  * @return NULL on error (always)
743  */
744 static GsfInput *
745 le_input_dup (GsfInput * input,
746  GError * *err)
747 {
748  if (NULL != err)
749  *err = g_error_new (gsf_input_error_id (), 0,
750  "dup not supported on LeInput");
751  return NULL;
752 }
753 
754 
755 /**
756  * Read at least num_bytes. Does not change the current position if
757  * there is an error. Will only read if the entire amount can be
758  * read. Invalidates the buffer associated with previous calls to
759  * gsf_input_read.
760  *
761  * @param input
762  * @param num_bytes
763  * @param optional_buffer
764  * @return buffer where num_bytes data are available, or NULL on error
765  */
766 static const guint8 *
767 le_input_read (GsfInput *input,
768  size_t num_bytes,
769  guint8 *optional_buffer)
770 {
771  LeInput *li = LE_INPUT (input);
772  struct EXTRACTOR_ExtractContext *ec;
773  void *buf;
774  uint64_t old_off;
775  ssize_t ret;
776 
777  ec = li->priv->ec;
778  old_off = ec->seek (ec->cls, 0, SEEK_CUR);
779  if (num_bytes
780  != (ret = ec->read (ec->cls,
781  &buf,
782  num_bytes)))
783  {
784  /* we don't support partial reads;
785  most other GsfInput implementations in this case
786  allocate some huge temporary buffer just to avoid
787  the partial read; we might need to do that as well!? */
788  ec->seek (ec->cls, SEEK_SET, old_off);
789  return NULL;
790  }
791  if (NULL != optional_buffer)
792  {
793  memcpy (optional_buffer, buf, num_bytes);
794  return optional_buffer;
795  }
796  return buf;
797 }
798 
799 
800 /**
801  * Move the current location in an input stream
802  *
803  * @param input stream to seek
804  * @param offset target offset
805  * @param whence determines to what the offset is relative to
806  * @return TRUE on error
807  */
808 static gboolean
809 le_input_seek (GsfInput *input,
810  gsf_off_t offset,
811  GSeekType whence)
812 {
813  LeInput *li = LE_INPUT (input);
814  struct EXTRACTOR_ExtractContext *ec;
815  int w;
816  int64_t ret;
817 
818  ec = li->priv->ec;
819  switch (whence)
820  {
821  case G_SEEK_SET:
822  w = SEEK_SET;
823  break;
824  case G_SEEK_CUR:
825  w = SEEK_CUR;
826  break;
827  case G_SEEK_END:
828  w = SEEK_END;
829  break;
830  default:
831  return TRUE;
832  }
833  if (-1 ==
834  (ret = ec->seek (ec->cls,
835  offset,
836  w)))
837  return TRUE;
838  return FALSE;
839 }
840 
841 
842 /**
843  * Class initializer for the "LeInput" class.
844  *
845  * @param class class object to initialize
846  */
847 static void
849 {
850  GsfInputClass *input_class;
851 
852  input_class = (GsfInputClass *) class;
853  input_class->Dup = le_input_dup;
854  input_class->Read = le_input_read;
855  input_class->Seek = le_input_seek;
856  g_type_class_add_private (class, sizeof (LeInputPrivate));
857 }
858 
859 
860 /**
861  * Initialize internal state of fresh input object.
862  *
863  * @param input object to initialize
864  */
865 static void
867 {
868  LeInputPrivate *priv;
869 
870  input->priv =
871  G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
873  priv = input->priv;
874  priv->ec = NULL;
875 }
876 
877 
878 /**
879  * Creates a new LeInput object.
880  *
881  * @param ec extractor context to wrap
882  * @return NULL on error
883  */
884 GsfInput *
886 {
887  LeInput *input;
888 
889  input = g_object_new (LE_TYPE_INPUT, NULL);
890  gsf_input_set_size (GSF_INPUT (input),
891  ec->get_size (ec->cls));
892  gsf_input_seek_emulate (GSF_INPUT (input),
893  0);
894  input->input.name = NULL;
895  input->input.container = NULL;
896  input->priv->ec = ec;
897 
898  return GSF_INPUT (input);
899 }
900 
901 
902 /* *********************** end of custom GSF input method ************* */
903 
904 
905 /**
906  * Main entry method for the OLE2 extraction plugin.
907  *
908  * @param ec extraction context provided to the plugin
909  */
910 void
912 {
913  GsfInput *input;
914  GsfInfile *infile;
915  GsfInput *src;
916  const char *name;
917  unsigned int i;
918  unsigned int lcb;
919  unsigned int fcb;
920  const unsigned char *data512;
921  unsigned int lid;
922  const char *lang;
923  int ret;
924  void *data;
925  uint64_t fsize;
926  ssize_t data_size;
927 
928  fsize = ec->get_size (ec->cls);
929  if (fsize < 512 + 898)
930  {
931  /* File too small for OLE2 */
932  return; /* can hardly be OLE2 */
933  }
934  if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
935  {
936  /* Failed to read minimum file size to buffer */
937  return;
938  }
939  data512 = (const unsigned char*) data + 512;
940  lid = data512[6] + (data512[7] << 8);
941  if ( (NULL != (lang = lid_to_language (lid))) &&
942  (0 != (ret = add_metadata (ec->proc, ec->cls,
943  lang,
945  return;
946  lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16)
947  + (data512[729] << 24);
948  fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16)
949  + (data512[725] << 24);
950  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
951  {
952  /* seek failed!? */
953  return;
954  }
955 #if USE_LE_INPUT
956  if (NULL == (input = le_input_new (ec)))
957  {
958  fprintf (stderr, "le_input_new failed\n");
959  return;
960  }
961 #else
962  input = gsf_input_memory_new ((const guint8 *) data,
963  data_size,
964  FALSE);
965 #endif
966  if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
967  {
968  g_object_unref (G_OBJECT (input));
969  return;
970  }
971  ret = 0;
972  for (i = 0; i<gsf_infile_num_children (infile); i++)
973  {
974  if (0 != ret)
975  break;
976  if (NULL == (name = gsf_infile_name_by_index (infile, i)))
977  continue;
978  src = NULL;
979  if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
980  (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
981  (NULL != (src = gsf_infile_child_by_index (infile, i))) )
982  ret = process (src,
983  ec->proc,
984  ec->cls);
985  if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
986  (NULL != (src = gsf_infile_child_by_index (infile, i))) )
987  ret = process_star_office (src,
988  ec->proc,
989  ec->cls);
990  if (NULL != src)
991  g_object_unref (G_OBJECT (src));
992  }
993  if (0 != ret)
994  goto CLEANUP;
995 
996  if (lcb < 6)
997  goto CLEANUP;
998  for (i = 0; i<gsf_infile_num_children (infile); i++)
999  {
1000  if (ret != 0)
1001  break;
1002  if (NULL == (name = gsf_infile_name_by_index (infile, i)))
1003  continue;
1004  if ( ( (0 == strcmp (name, "1Table")) ||
1005  (0 == strcmp (name, "0Table")) ) &&
1006  (NULL != (src = gsf_infile_child_by_index (infile, i))) )
1007  {
1008  ret = history_extract (src,
1009  lcb,
1010  fcb,
1011  ec->proc, ec->cls);
1012  g_object_unref (G_OBJECT (src));
1013  }
1014  }
1015 CLEANUP:
1016  g_object_unref (G_OBJECT (infile));
1017  g_object_unref (G_OBJECT (input));
1018 }
1019 
1020 
1021 /**
1022  * Custom log function we give to GSF to disable logging.
1023  *
1024  * @param log_domain unused
1025  * @param log_level unused
1026  * @param message unused
1027  * @param user_data unused
1028  */
1029 static void
1030 nolog (const gchar *log_domain,
1031  GLogLevelFlags log_level,
1032  const gchar *message,
1033  gpointer user_data)
1034 {
1035  /* do nothing */
1036 }
1037 
1038 
1039 /**
1040  * OLE2 plugin constructor. Initializes glib and gsf, in particular
1041  * gsf logging is disabled.
1042  */
1043 void __attribute__ ((constructor))
1045 {
1046 #if ! GLIB_CHECK_VERSION (2, 35, 0)
1047  g_type_init ();
1048 #endif
1049 #ifdef HAVE_GSF_INIT
1050  gsf_init ();
1051 #endif
1052  /* disable logging -- thanks, Jody! */
1053  g_log_set_handler ("libgsf:msole",
1054  G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,
1055  &nolog, NULL);
1056 }
1057 
1058 
/**
 * OLE2 plugin destructor.  Shuts gsf down again if it was
 * initialized by the constructor.
 */
void __attribute__ ((destructor))
ole2_ltdl_fini (void)
{
#ifdef HAVE_GSF_INIT
  gsf_shutdown ();
#endif
}
1069 
1070 
1071 /* end of ole2_extractor.c */
char * EXTRACTOR_common_convert_to_utf8(const char *input, size_t len, const char *charset)
Definition: convert.c:39
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_SECTION
Definition: extractor.h:212
@ EXTRACTOR_METATYPE_UNKNOWN_DATE
Definition: extractor.h:195
@ EXTRACTOR_METATYPE_MANAGER
Definition: extractor.h:273
@ EXTRACTOR_METATYPE_TEMPLATE
Definition: extractor.h:271
@ EXTRACTOR_METATYPE_LAST_PRINTED
Definition: extractor.h:198
@ EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE
Definition: extractor.h:258
@ EXTRACTOR_METATYPE_FORMAT
Definition: extractor.h:190
@ EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE
Definition: extractor.h:205
@ EXTRACTOR_METATYPE_REVISION_NUMBER
Definition: extractor.h:274
@ EXTRACTOR_METATYPE_AUTHOR_NAME
Definition: extractor.h:143
@ EXTRACTOR_METATYPE_LAST_SAVED_BY
Definition: extractor.h:199
@ EXTRACTOR_METATYPE_WORD_COUNT
Definition: extractor.h:267
@ EXTRACTOR_METATYPE_LANGUAGE
Definition: extractor.h:157
@ EXTRACTOR_METATYPE_COMMENT
Definition: extractor.h:131
@ EXTRACTOR_METATYPE_PARAGRAPH_COUNT
Definition: extractor.h:266
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE
Definition: extractor.h:194
@ EXTRACTOR_METATYPE_CREATOR
Definition: extractor.h:189
@ EXTRACTOR_METATYPE_CREATION_DATE
Definition: extractor.h:196
@ EXTRACTOR_METATYPE_COMPANY
Definition: extractor.h:272
@ EXTRACTOR_METATYPE_EDITING_CYCLES
Definition: extractor.h:201
@ EXTRACTOR_METATYPE_LINE_COUNT
Definition: extractor.h:265
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_MODIFICATION_DATE
Definition: extractor.h:197
@ EXTRACTOR_METATYPE_REVISION_HISTORY
Definition: extractor.h:203
@ EXTRACTOR_METATYPE_PAGE_COUNT
Definition: extractor.h:141
@ EXTRACTOR_METATYPE_MIMETYPE
Definition: extractor.h:129
@ EXTRACTOR_METATYPE_CHARACTER_COUNT
Definition: extractor.h:268
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
@ EXTRACTOR_METATYPE_DESCRIPTION
Definition: extractor.h:182
const char * name
enum EXTRACTOR_MetaType type
static void process_metadata(gpointer key, gpointer value, gpointer user_data)
#define LE_INPUT(obj)
static int process(GsfInput *in, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
struct _LeInputClass LeInputClass
static void le_input_init(LeInput *input)
#define __(a)
static void nolog(const gchar *log_domain, GLogLevelFlags log_level, const gchar *message, gpointer user_data)
GsfInput * le_input_new(struct EXTRACTOR_ExtractContext *ec)
struct _LeInput LeInput
void ole2_ltdl_fini()
struct _LeInputPrivate LeInputPrivate
static const guint8 * le_input_read(GsfInput *input, size_t num_bytes, guint8 *optional_buffer)
void ole2_ltdl_init()
static int history_extract(GsfInput *stream, unsigned int lcbSttbSavedBy, unsigned int fcSttbSavedBy, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
static void le_input_class_init(LeInputClass *class)
static gboolean le_input_seek(GsfInput *input, gsf_off_t offset, GSeekType whence)
static const char * lid_to_language(unsigned int lid)
#define LE_TYPE_INPUT
GSF_CLASS(LeInput, le_input, le_input_class_init, le_input_init, GSF_INPUT_TYPE)
static int process_star_office(GsfInput *src, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
static struct Matches tmap[]
static int add_metadata(EXTRACTOR_MetaDataProcessor proc, void *proc_cls, const char *phrase, enum EXTRACTOR_MetaType type)
void EXTRACTOR_ole2_extract_method(struct EXTRACTOR_ExtractContext *ec)
platform specifics
#define _(a)
Definition: platform.h:32
int64_t(* seek)(void *cls, int64_t pos, int whence)
Definition: extractor.h:509
uint64_t(* get_size)(void *cls)
Definition: extractor.h:520
EXTRACTOR_MetaDataProcessor proc
Definition: extractor.h:525
ssize_t(* read)(void *cls, void **data, size_t size)
Definition: extractor.h:494
const char * text
Definition: deb_extractor.c:77
enum EXTRACTOR_MetaType type
Definition: deb_extractor.c:82
EXTRACTOR_MetaDataProcessor proc
void(* _gtk_reserved3)(void)
void(* _gtk_reserved2)(void)
void(* _gtk_reserved4)(void)
void(* _gtk_reserved1)(void)
GsfInputClass parent_class
struct EXTRACTOR_ExtractContext * ec
GsfInput input
LeInputPrivate * priv