libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

dvi_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2002, 2003, 2004, 2012, 2017, 2019 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19  */
20 /**
21  * @file plugins/dvi_extractor.c
22  * @brief plugin to support DVI files (from LaTeX)
23  * @author Christian Grothoff
24  */
25 #include "platform.h"
26 #include "extractor.h"
27 
28 
29 /**
30  * Pair of a PostScipt prefix and the corresponding LE type.
31  */
32 struct Matches
33 {
34  /**
35  * Prefix in the PS map.
36  */
37  const char *text;
38 
39  /**
40  * Corresponding LE type.
41  */
43 };
44 
45 
46 /**
47  * Map from PS names to LE types.
48  */
49 static struct Matches tmap[] = {
50  { "/Title (", EXTRACTOR_METATYPE_TITLE },
51  { "/Subject (", EXTRACTOR_METATYPE_SUBJECT },
52  { "/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME },
53  { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS },
55  { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
56  { NULL, 0 }
57 };
58 
59 
60 /**
61  * Parse a "ZZZ" tag. Specifically, the data may contain a
62  * postscript dictionary with metadata.
63  *
64  * @param data overall input stream
65  * @param pos where in data is the zzz data
66  * @param len how many bytes from 'pos' does the zzz data extend?
67  * @param proc function to call with meta data found
68  * @param proc_cls closure for proc
69  * @return 0 to continue to extract, 1 to stop
70  */
71 static int
72 parseZZZ (const char *data,
73  size_t pos, size_t len,
75  void *proc_cls)
76 {
77  size_t slen;
78  size_t end;
79  unsigned int i;
80 
81  end = pos + len;
82  slen = strlen ("ps:SDict begin [");
83  if ( (len <= slen) ||
84  (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) )
85  return 0;
86  pos += slen;
87  while (pos < end)
88  {
89  for (i = 0; NULL != tmap[i].text; i++)
90  {
91  slen = strlen (tmap[i].text);
92  if ( (pos + slen > end) ||
93  (0 != strncmp (&data[pos], tmap[i].text, slen)) )
94  continue;
95  pos += slen;
96  slen = pos;
97  while ((slen < end) && (data[slen] != ')'))
98  slen++;
99  slen = slen - pos;
100  {
101  char value[slen + 1];
102 
103  value[slen] = '\0';
104  memcpy (value, &data[pos], slen);
105  if (0 != proc (proc_cls,
106  "dvi",
107  tmap[i].type,
109  "text/plain",
110  value,
111  slen + 1))
112  return 1;
113  }
114  pos += slen + 1;
115  break;
116  }
117  pos++;
118  }
119  return 0;
120 }
121 
122 
/**
 * Decode the four bytes starting at 'data' as an unsigned 32-bit
 * integer stored in big-endian (network) byte order.
 *
 * @param data address of the first of the four bytes; no particular
 *        alignment is required
 * @return the decoded 32-bit value in host byte order
 */
static uint32_t
getIntAt (const void *data)
{
  const unsigned char *bytes = data;

  /* assemble byte by byte, most significant byte first */
  return ((uint32_t) bytes[0] << 24)
         | ((uint32_t) bytes[1] << 16)
         | ((uint32_t) bytes[2] << 8)
         | (uint32_t) bytes[3];
}
137 
138 
/**
 * Decode the two bytes starting at 'data' as an unsigned 16-bit
 * integer stored in big-endian (network) byte order.
 *
 * @param data address of the first of the two bytes; no particular
 *        alignment is required
 * @return the decoded 16-bit value in host byte order
 */
static uint16_t
getShortAt (const void *data)
{
  const unsigned char *bytes = data;

  /* high byte first, then low byte */
  return (uint16_t) (((unsigned int) bytes[0] << 8)
                     | (unsigned int) bytes[1]);
}
153 
154 
155 /**
156  * Main entry method for the 'application/x-dvi' extraction plugin.
157  *
158  * @param ec extraction context provided to the plugin
159  */
160 void
162 {
163  unsigned int klen;
164  uint32_t pos;
165  uint32_t opos;
166  unsigned int len;
167  unsigned int pageCount;
168  char pages[16];
169  void *buf;
170  unsigned char *data;
171  uint64_t size;
172  uint64_t off;
173  ssize_t iret;
174 
175  if (40 >= (iret = ec->read (ec->cls, &buf, 1024)))
176  return;
177  data = buf;
178  if ( (data[0] != 247) ||
179  (data[1] != 2) )
180  return; /* cannot be DVI or unsupported version */
181  klen = data[14];
182  size = ec->get_size (ec->cls);
183  if (size > 16 * 1024 * 1024)
184  return; /* too large */
185  if (klen + 15 > size)
186  return; /* malformed klen */
187  if (NULL == (data = malloc ((size_t) size)))
188  return; /* out of memory */
189  memcpy (data, buf, iret);
190  off = iret;
191  while (off < size)
192  {
193  if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024)))
194  {
195  free (data);
196  return;
197  }
198  memcpy (&data[off], buf, iret);
199  off += iret;
200  }
201  pos = size - 1;
202  while ( (223 == data[pos]) &&
203  (pos > 0) )
204  pos--;
205  if ( (2 != data[pos]) ||
206  (pos < 40) )
207  goto CLEANUP;
208  pos--;
209  pos -= 4;
210  /* assert pos at 'post_post tag' */
211  if (data[pos] != 249)
212  goto CLEANUP;
213  opos = pos;
214  pos = getIntAt (&data[opos + 1]);
215  if ( (pos + 25 > size) ||
216  (pos + 25 < pos) )
217  goto CLEANUP;
218  /* assert pos at 'post' command */
219  if (data[pos] != 248)
220  goto CLEANUP;
221  pageCount = 0;
222  opos = pos;
223  pos = getIntAt (&data[opos + 1]);
224  while (1)
225  {
226  if (UINT32_MAX == pos)
227  break;
228  if ( (pos + 45 > size) ||
229  (pos + 45 < pos) )
230  goto CLEANUP;
231  if (data[pos] != 139) /* expect 'bop' */
232  goto CLEANUP;
233  pageCount++;
234  opos = pos;
235  pos = getIntAt (&data[opos + 41]);
236  if (UINT32_MAX == pos)
237  break;
238  if (pos >= opos)
239  goto CLEANUP; /* invalid! */
240  }
241  /* ok, now we believe it's a dvi... */
242  snprintf (pages,
243  sizeof (pages),
244  "%u",
245  pageCount);
246  if (0 != ec->proc (ec->cls,
247  "dvi",
250  "text/plain",
251  pages,
252  strlen (pages) + 1))
253  goto CLEANUP;
254  if (0 != ec->proc (ec->cls,
255  "dvi",
258  "text/plain",
259  "application/x-dvi",
260  strlen ("application/x-dvi") + 1))
261  goto CLEANUP;
262  {
263  char comment[klen + 1];
264 
265  comment[klen] = '\0';
266  memcpy (comment, &data[15], klen);
267  if (0 != ec->proc (ec->cls,
268  "dvi",
271  "text/plain",
272  comment,
273  klen + 1))
274  goto CLEANUP;
275  }
276  /* try to find PDF/ps special */
277  pos = opos;
278  while ( (size >= 100) &&
279  (pos < size - 100) )
280  {
281  switch (data[pos])
282  {
283  case 139: /* begin page 'bop', we typically have to skip that one to
284  find the zzz's */
285  pos += 45; /* skip bop */
286  break;
287  case 239: /* zzz1 */
288  len = data[pos + 1];
289  if ( (pos + 2 + len < size) &&
290  (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc,
291  ec->cls)) )
292  goto CLEANUP;
293  pos += len + 2;
294  break;
295  case 240: /* zzz2 */
296  len = getShortAt (&data[pos + 1]);
297  if ( (pos + 3 + len < size) &&
298  (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc,
299  ec->cls)) )
300  goto CLEANUP;
301  pos += len + 3;
302  break;
303  case 241: /* zzz3, who uses that? */
304  len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3];
305  if ( (pos + 4 + len < size) &&
306  (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc,
307  ec->cls)) )
308  goto CLEANUP;
309  pos += len + 4;
310  break;
311  case 242: /* zzz4, hurray! */
312  len = getIntAt (&data[pos + 1]);
313  if ( (pos + 1 + len < size) &&
314  (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc,
315  ec->cls)) )
316  goto CLEANUP;
317  pos += len + 5;
318  break;
319  default: /* unsupported opcode, abort scan */
320  goto CLEANUP;
321  }
322  }
323 CLEANUP:
324  free (data);
325 }
326 
327 
328 /* end of dvi_extractor.c */
static uint16_t getShortAt(const void *data)
void EXTRACTOR_dvi_extract_method(struct EXTRACTOR_ExtractContext *ec)
static int parseZZZ(const char *data, size_t pos, size_t len, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
Definition: dvi_extractor.c:72
static struct Matches tmap[]
Definition: dvi_extractor.c:49
static uint32_t getIntAt(const void *data)
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_C_STRING
Definition: extractor.h:113
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE
Definition: extractor.h:258
@ EXTRACTOR_METATYPE_AUTHOR_NAME
Definition: extractor.h:143
@ EXTRACTOR_METATYPE_COMMENT
Definition: extractor.h:131
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE
Definition: extractor.h:194
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_PAGE_COUNT
Definition: extractor.h:141
@ EXTRACTOR_METATYPE_MIMETYPE
Definition: extractor.h:129
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
enum EXTRACTOR_MetaType type
platform specifics
uint64_t(* get_size)(void *cls)
Definition: extractor.h:520
EXTRACTOR_MetaDataProcessor proc
Definition: extractor.h:525
ssize_t(* read)(void *cls, void **data, size_t size)
Definition: extractor.h:494
const char * text
Definition: deb_extractor.c:77
enum EXTRACTOR_MetaType type
Definition: deb_extractor.c:82