"Fossies" - the Fresh Open Source Software Archive 
Member "libextractor-1.11/src/plugins/dvi_extractor.c" (30 Jan 2021, 8709 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "dvi_extractor.c" see the
Fossies "Dox" file reference documentation and the last
Fossies "Diffs" side-by-side code changes report:
1.6_vs_1.7.
1 /*
2 This file is part of libextractor.
3 Copyright (C) 2002, 2003, 2004, 2012, 2017, 2019 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20 /**
21 * @file plugins/dvi_extractor.c
22 * @brief plugin to support DVI files (from LaTeX)
23 * @author Christian Grothoff
24 */
25 #include "platform.h"
26 #include "extractor.h"
27
28
29 /**
30 * Pair of a PostScipt prefix and the corresponding LE type.
31 */
32 struct Matches
33 {
34 /**
35 * Prefix in the PS map.
36 */
37 const char *text;
38
39 /**
40 * Corresponding LE type.
41 */
42 enum EXTRACTOR_MetaType type;
43 };
44
45
46 /**
47 * Map from PS names to LE types.
48 */
49 static struct Matches tmap[] = {
50 { "/Title (", EXTRACTOR_METATYPE_TITLE },
51 { "/Subject (", EXTRACTOR_METATYPE_SUBJECT },
52 { "/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME },
53 { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS },
54 { "/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
55 { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
56 { NULL, 0 }
57 };
58
59
60 /**
61 * Parse a "ZZZ" tag. Specifically, the data may contain a
62 * postscript dictionary with metadata.
63 *
64 * @param data overall input stream
65 * @param pos where in data is the zzz data
66 * @param len how many bytes from 'pos' does the zzz data extend?
67 * @param proc function to call with meta data found
68 * @param proc_cls closure for proc
69 * @return 0 to continue to extract, 1 to stop
70 */
71 static int
72 parseZZZ (const char *data,
73 size_t pos, size_t len,
74 EXTRACTOR_MetaDataProcessor proc,
75 void *proc_cls)
76 {
77 size_t slen;
78 size_t end;
79 unsigned int i;
80
81 end = pos + len;
82 slen = strlen ("ps:SDict begin [");
83 if ( (len <= slen) ||
84 (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) )
85 return 0;
86 pos += slen;
87 while (pos < end)
88 {
89 for (i = 0; NULL != tmap[i].text; i++)
90 {
91 slen = strlen (tmap[i].text);
92 if ( (pos + slen > end) ||
93 (0 != strncmp (&data[pos], tmap[i].text, slen)) )
94 continue;
95 pos += slen;
96 slen = pos;
97 while ((slen < end) && (data[slen] != ')'))
98 slen++;
99 slen = slen - pos;
100 {
101 char value[slen + 1];
102
103 value[slen] = '\0';
104 memcpy (value, &data[pos], slen);
105 if (0 != proc (proc_cls,
106 "dvi",
107 tmap[i].type,
108 EXTRACTOR_METAFORMAT_C_STRING,
109 "text/plain",
110 value,
111 slen + 1))
112 return 1;
113 }
114 pos += slen + 1;
115 break;
116 }
117 pos++;
118 }
119 return 0;
120 }
121
122
/**
 * Decode the 32-bit unsigned big-endian integer stored in the four
 * bytes starting at 'data'.
 *
 * @param data pointer to the first of four bytes (no alignment needed)
 * @return the decoded value in host byte order
 */
static uint32_t
getIntAt (const void *data)
{
  const unsigned char *bytes = data;

  /* assemble byte-wise, most significant byte first */
  return ((uint32_t) bytes[0] << 24)
         | ((uint32_t) bytes[1] << 16)
         | ((uint32_t) bytes[2] << 8)
         | (uint32_t) bytes[3];
}
137
138
/**
 * Decode the 16-bit unsigned big-endian integer stored in the two
 * bytes starting at 'data'.
 *
 * @param data pointer to the first of two bytes (no alignment needed)
 * @return the decoded value in host byte order
 */
static uint16_t
getShortAt (const void *data)
{
  const unsigned char *bytes = data;

  /* high byte first, then low byte */
  return (uint16_t) (((unsigned int) bytes[0] << 8)
                     | (unsigned int) bytes[1]);
}
153
154
155 /**
156 * Main entry method for the 'application/x-dvi' extraction plugin.
157 *
158 * @param ec extraction context provided to the plugin
159 */
160 void
161 EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec)
162 {
163 unsigned int klen;
164 uint32_t pos;
165 uint32_t opos;
166 unsigned int len;
167 unsigned int pageCount;
168 char pages[16];
169 void *buf;
170 unsigned char *data;
171 uint64_t size;
172 uint64_t off;
173 ssize_t iret;
174
175 if (40 >= (iret = ec->read (ec->cls, &buf, 1024)))
176 return;
177 data = buf;
178 if ( (data[0] != 247) ||
179 (data[1] != 2) )
180 return; /* cannot be DVI or unsupported version */
181 klen = data[14];
182 size = ec->get_size (ec->cls);
183 if (size > 16 * 1024 * 1024)
184 return; /* too large */
185 if (klen + 15 > size)
186 return; /* malformed klen */
187 if (NULL == (data = malloc ((size_t) size)))
188 return; /* out of memory */
189 memcpy (data, buf, iret);
190 off = iret;
191 while (off < size)
192 {
193 if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024)))
194 {
195 free (data);
196 return;
197 }
198 memcpy (&data[off], buf, iret);
199 off += iret;
200 }
201 pos = size - 1;
202 while ( (223 == data[pos]) &&
203 (pos > 0) )
204 pos--;
205 if ( (2 != data[pos]) ||
206 (pos < 40) )
207 goto CLEANUP;
208 pos--;
209 pos -= 4;
210 /* assert pos at 'post_post tag' */
211 if (data[pos] != 249)
212 goto CLEANUP;
213 opos = pos;
214 pos = getIntAt (&data[opos + 1]);
215 if ( (pos + 25 > size) ||
216 (pos + 25 < pos) )
217 goto CLEANUP;
218 /* assert pos at 'post' command */
219 if (data[pos] != 248)
220 goto CLEANUP;
221 pageCount = 0;
222 opos = pos;
223 pos = getIntAt (&data[opos + 1]);
224 while (1)
225 {
226 if (UINT32_MAX == pos)
227 break;
228 if ( (pos + 45 > size) ||
229 (pos + 45 < pos) )
230 goto CLEANUP;
231 if (data[pos] != 139) /* expect 'bop' */
232 goto CLEANUP;
233 pageCount++;
234 opos = pos;
235 pos = getIntAt (&data[opos + 41]);
236 if (UINT32_MAX == pos)
237 break;
238 if (pos >= opos)
239 goto CLEANUP; /* invalid! */
240 }
241 /* ok, now we believe it's a dvi... */
242 snprintf (pages,
243 sizeof (pages),
244 "%u",
245 pageCount);
246 if (0 != ec->proc (ec->cls,
247 "dvi",
248 EXTRACTOR_METATYPE_PAGE_COUNT,
249 EXTRACTOR_METAFORMAT_UTF8,
250 "text/plain",
251 pages,
252 strlen (pages) + 1))
253 goto CLEANUP;
254 if (0 != ec->proc (ec->cls,
255 "dvi",
256 EXTRACTOR_METATYPE_MIMETYPE,
257 EXTRACTOR_METAFORMAT_UTF8,
258 "text/plain",
259 "application/x-dvi",
260 strlen ("application/x-dvi") + 1))
261 goto CLEANUP;
262 {
263 char comment[klen + 1];
264
265 comment[klen] = '\0';
266 memcpy (comment, &data[15], klen);
267 if (0 != ec->proc (ec->cls,
268 "dvi",
269 EXTRACTOR_METATYPE_COMMENT,
270 EXTRACTOR_METAFORMAT_C_STRING,
271 "text/plain",
272 comment,
273 klen + 1))
274 goto CLEANUP;
275 }
276 /* try to find PDF/ps special */
277 pos = opos;
278 while ( (size >= 100) &&
279 (pos < size - 100) )
280 {
281 switch (data[pos])
282 {
283 case 139: /* begin page 'bop', we typically have to skip that one to
284 find the zzz's */
285 pos += 45; /* skip bop */
286 break;
287 case 239: /* zzz1 */
288 len = data[pos + 1];
289 if ( (pos + 2 + len < size) &&
290 (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc,
291 ec->cls)) )
292 goto CLEANUP;
293 pos += len + 2;
294 break;
295 case 240: /* zzz2 */
296 len = getShortAt (&data[pos + 1]);
297 if ( (pos + 3 + len < size) &&
298 (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc,
299 ec->cls)) )
300 goto CLEANUP;
301 pos += len + 3;
302 break;
303 case 241: /* zzz3, who uses that? */
304 len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3];
305 if ( (pos + 4 + len < size) &&
306 (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc,
307 ec->cls)) )
308 goto CLEANUP;
309 pos += len + 4;
310 break;
311 case 242: /* zzz4, hurray! */
312 len = getIntAt (&data[pos + 1]);
313 if ( (pos + 1 + len < size) &&
314 (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc,
315 ec->cls)) )
316 goto CLEANUP;
317 pos += len + 5;
318 break;
319 default: /* unsupported opcode, abort scan */
320 goto CLEANUP;
321 }
322 }
323 CLEANUP:
324 free (data);
325 }
326
327
328 /* end of dvi_extractor.c */