w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

pdfparse.c
Go to the documentation of this file.
1 /* $Header$
2  This is dvipdfm, a DVI to PDF translator.
3  Copyright (C) 1998, 1999 by Mark A. Wicks
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program; if not, write to the Free Software
17  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 
19  The author may be contacted via the e-mail address
20 
21  mwicks@kettering.edu
22 */
23 
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <ctype.h>
28 #include <string.h>
29 #include "system.h"
30 #include "mem.h"
31 #include "mfileio.h"
32 #include "numbers.h"
33 #include "dvi.h"
34 #include "pdfparse.h"
35 #include "pdfspecial.h"
36 #include "pdfobj.h"
37 #include "pdfdoc.h"
38 #include "pdfdev.h"
39 
40 #define verbose 0
41 #define debug 0
42 
#define DUMP_LIMIT 50
/* Print an excerpt of the input buffer [start, end) to stderr, bracketed
   by "-->" and "<--".  At most DUMP_LIMIT characters are shown; "..." is
   appended when that limit is reached. */
void dump(char *start, char *end)
{
  char *p = start;
  int shown = 0;

  fprintf (stderr, "\nCurrent input buffer is ");
  fprintf (stderr, "-->");
  /* Count the characters rather than comparing against start+DUMP_LIMIT:
     forming that pointer is undefined when the buffer holds fewer than
     DUMP_LIMIT bytes. */
  while (p < end && shown < DUMP_LIMIT) {
    fprintf (stderr, "%c", *(p++));
    shown++;
  }
  if (shown == DUMP_LIMIT)
    fprintf (stderr, "...\n");
  fprintf (stderr, "<--\n");
}
55 
/* Advance *start past white space and '%' comments, never moving beyond
   end.  A '%' comment is skipped through the end of its line by
   skip_line(). */
void skip_white (char **start, char *end)
{
  while (*start < end) {
    if (**start == '%')
      skip_line (start, end);
    /* The cast keeps isspace() defined for bytes >= 0x80 when plain char
       is signed. */
    else if (isspace ((unsigned char) **start))
      (*start)++;
    else
      break;
  }
}
66 
/* Move *start to the beginning of the next line (or to end if there is
   none).  A line terminator is "\r", "\n" or "\r\n"; only one terminator
   is consumed. */
void skip_line (char **start, char *end)
{
  char *cur = *start;

  /* Note: PDF spec says that all platforms must end line with '\n'
     after a "stream" keyword */
  for (; cur < end; cur++) {
    if (*cur == '\n' || *cur == '\r')
      break;
  }
  if (cur < end && *cur == '\r')
    cur++;
  if (cur < end && *cur == '\n')
    cur++;
  *start = cur;
}
79 
/* Report anything other than white space or comments that remains in
   [*start, end) after an object has been parsed.  The leftover input is
   echoed to stderr through dump(). */
void parse_crap (char **start, char *end)
{
  /* Trailing blanks and '%' comments are not leftovers worth reporting. */
  skip_white(start, end);
  if (*start != end) {
    fprintf (stderr, "\nCrap left over after object!!\n");
    dump(*start, end);
  }
}
88 
/* Return 1 when s consists only of decimal digits, optionally preceded by
   a single '-', and 0 otherwise.  As before, the empty string and a lone
   "-" are accepted (no character in them fails the test). */
int is_an_int(const char *s)
{
  size_t i;
  const size_t len = strlen(s);   /* hoisted out of the loop condition */

  for (i = 0; i < len; i++) {
    if (i == 0 && s[i] == '-')
      continue;
    /* Cast so that isdigit() is defined for bytes >= 0x80. */
    if (!isdigit ((unsigned char) s[i]))
      return 0;
  }
  return 1;
}
100 
/* Return 1 when s looks like a decimal number: an optional leading '-',
   digits, and at most one '.'.  Returns 0 otherwise.  As before, strings
   with no digits at all ("", "-", ".") are accepted. */
int is_a_number(const char *s)
{
  size_t i;
  const size_t len = strlen(s);   /* hoisted out of the loop condition */
  int period = 0;                 /* set once a '.' has been seen */

  for (i = 0; i < len; i++) {
    if (s[i] == '-' && i == 0)
      continue;
    if (s[i] == '.' && !period) {
      period = 1;
      continue;
    }
    /* Cast so that isdigit() is defined for bytes >= 0x80. */
    if (!isdigit ((unsigned char) s[i]))
      return 0;
  }
  return 1;
}
116 
117 
118 pdf_obj *parse_pdf_dict (char **start, char *end)
119 {
120  pdf_obj *result, *tmp1, *tmp2;
121  char *save = *start;
122  skip_white(start, end);
123  if (*((*start)++) != '<' ||
124  *((*start)++) != '<') {
125  *start = save;
126  dump (*start, end);
127  return NULL;
128  }
129  result = pdf_new_dict ();
130  skip_white(start, end);
131  while (*start < end &&
132  **start != '>') {
133  if ((tmp1 = parse_pdf_name (start, end)) == NULL) {
135  {
136  *start = save;
137  dump (*start, end);
138  return NULL;
139  }
140  };
141  if ((tmp2 = parse_pdf_object (start, end)) == NULL) {
144  {
145  *start = save;
146  dump (*start, end);
147  return NULL;
148  }
149  }
150  pdf_add_dict (result, tmp1, tmp2);
151  skip_white(start, end);
152  }
153  if (*start >= end) {
155  *start = save;
156  dump (*start, end);
157  return NULL;
158  }
159  if (*((*start)++) == '>' &&
160  *((*start)++) == '>') {
161  return result;
162  } else {
164  fprintf (stderr, "\nDictionary object ended prematurely\n");
165  *start = save;
166  dump (*start, end);
167  return NULL;
168  }
169 }
170 
172 {
173  pdf_obj *result, *tmp1;
174 #ifdef MEM_DEBUG
175 MEM_START
176 #endif
177  skip_white(start, end);
178  if (*((*start)++) != '[')
179  return NULL;
180  result = pdf_new_array ();
181  skip_white(start, end);
182  while (*start < end &&
183  **start != ']') {
184  if ((tmp1 = parse_pdf_object (start, end)) == NULL) {
186  return NULL;
187  };
189  skip_white(start, end);
190  }
191  if (*start >= end) {
193  fprintf (stderr, "\nArray ended prematurely\n");
194  return NULL;
195  }
196  (*start)++;
197 #ifdef MEM_DEBUG
198 MEM_END
199 #endif
200  return result;
201 }
202 
203 char *parse_number (char **start, char *end)
204 {
205  char *number, *save;
206 #ifdef MEM_DEBUG
207 MEM_START
208 #endif
209  skip_white(start, end);
210  save = *start;
211  if (*start < end && (**start == '+' || **start == '-')) {
212  *start += 1;
213  }
214  while (*start < end &&
215  isdigit(**start))
216  (*start)++;
217  if (*start < end && **start == '.') {
218  (*start)++;
219  while (*start < end &&
220  isdigit(**start))
221  (*start)++;
222  }
223  if (*start > save) {
224  number = NEW ((*start-save)+1, char);
225  memcpy (number, save, (*start-save));
226  number[*start-save] = 0;
227  return number;
228  }
229  *start = save;
230 #ifdef MEM_DEBUG
231 MEM_END
232 #endif
233  return NULL;
234 }
235 
236 char *parse_unsigned (char **start, char *end)
237 {
238  char *number, *save;
239 #ifdef MEM_DEBUG
240 MEM_START
241 #endif
242  skip_white(start, end);
243  save = *start;
244  while (*start < end &&
245  isdigit(**start))
246  (*start)++;
247  if (*start > save) {
248  number = NEW ((*start-save)+1, char);
249  memcpy (number, save, (*start-save));
250  number[*start-save] = 0;
251  return number;
252  }
253  *start = save;
254 #ifdef MEM_DEBUG
255 MEM_END
256 #endif
257  return NULL;
258 }
259 
260 static char *parse_gen_ident (char **start, char *end, char *valid_chars)
261 {
262  char *ident, *save;
263  save = *start;
264  skip_white(start, end);
265  while (*start < end && strchr (valid_chars, **start))
266  (*start)++;
267  if (save == *start)
268  return NULL;
269  ident = NEW (*start-save+1, char);
270  memcpy (ident, save, *start-save);
271  ident[*start-save] = 0;
272  return ident;
273 }
274 
/* Scan an identifier made of the characters listed below (letters,
   digits and most punctuation, including '=' but not '/').  The result
   and the NULL case are those of parse_gen_ident(). */
char *parse_ident (char **start, char *end)
{
  static char ident_chars[] =
    "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";

  return parse_gen_ident (start, end, ident_chars);
}
281 
/* Scan a value identifier.  Its character set is the one listed below:
   it contains '/' but, unlike the set used by parse_ident(), no '='.
   The result and the NULL case are those of parse_gen_ident(). */
char *parse_val_ident (char **start, char *end)
{
  static char value_chars[] =
    "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";

  return parse_gen_ident (start, end, value_chars);
}
288 
/* Scan a C-like identifier: letters, digits, '_' and '@' only.  The
   result and the NULL case are those of parse_gen_ident(). */
char *parse_c_ident (char **start, char *end)
{
  static char c_ident_chars[] =
    "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";

  return parse_gen_ident (start, end, c_ident_chars);
}
295 
296 char *parse_opt_ident(char **start, char *end)
297 {
298  if (*start >= end || (**start) != '@')
299  return NULL;
300  (*start)++;
301  return parse_ident(start, end);
302 }
303 
304 
305 pdf_obj *parse_pdf_name (char **start, char *end)
306 {
307  pdf_obj *result;
308  char *name;
309  skip_white(start, end);
310  if (**start != '/') {
311  fprintf (stderr, "\nPDF Name expected and not found.\n");
312  dump(*start, end);
313  return NULL;
314  }
315  (*start)++;
316  if ((name = parse_ident(start, end)) != NULL) {
318  RELEASE (name);
319  return result;
320  }
321  return NULL;
322 }
323 
/* Parse a named reference "@name" from [*start, end).

   Returns the identifier following '@' as produced by parse_ident(), or
   NULL when the input (after white space) does not begin with '@'; that
   case is reported on stderr together with a dump of the input. */
char *parse_pdf_reference(char **start, char *end)
{
  skip_white (start, end);
  /* Check the bound before looking at the character. */
  if (*start >= end || **start != '@') {
    fprintf (stderr, "\nPDF reference expected and not found.\n");
    dump(*start, end);
    return NULL;
  }
  (*start)++;
  return parse_ident(start, end);
}
335 
337 {
338  skip_white (start, end);
339  if (end-*start > strlen ("true") &&
340  !strncmp (*start, "true", strlen("true"))) {
341  *start += strlen("true");
342  return pdf_new_boolean (1);
343  }
344  if (end - *start > strlen ("false") &&
345  !strncmp (*start, "false", strlen("false"))) {
346  *start += strlen("false");
347  return pdf_new_boolean (0);
348  }
349  return NULL;
350 }
351 
352 pdf_obj *parse_pdf_null (char **start, char *end)
353 {
354  char *save = *start;
355  char *ident;
356  skip_white (start, end);
358  if (!strcmp (ident, "null")) {
359  RELEASE(ident);
360  return pdf_new_null();
361  }
362  *start = save;
363  fprintf (stderr, "\nNot a valid object\n");
364  dump(*start, end);
365  return NULL;
366 }
367 
368 static pdf_obj *parse_pdf_number (char **start, char *end)
369 {
370  char *number;
371  pdf_obj *result;
372  skip_white(start, end);
373  if ((number = parse_number(start, end)) != NULL) {
375  RELEASE (number);
376  return result;
377  }
378  return NULL;
379 }
380 
/* Convert one hexadecimal digit character to its value (0-15).  Both
   letter cases are accepted; any other character yields 0. */
int xtod (char c)
{
  int value = 0;

  if ('0' <= c && c <= '9')
    value = c - '0';
  else if ('A' <= c && c <= 'F')
    value = 10 + (c - 'A');
  else if ('a' <= c && c <= 'f')
    value = 10 + (c - 'a');
  return value;
}
391 
393 {
394  pdf_obj *result;
395  char *save;
396  unsigned char *string = NULL;
397  int strlength;
398  skip_white (start, end);
399  if (*start == end || *((*start)++) != '<')
400  return NULL;
401  save = *start;
402  string = NEW ((end - *start)/2+2, unsigned char); /* A little excess here */
403  strlength = 0;
404  while (*start < end && **start != '>') {
405  string[strlength] = xtod(**start) * 16;
406  (*start) += 1;
407  if (*start < end && **start != '>') {
408  string[strlength] += xtod(**start);
409  (*start) += 1;
410  }
411  skip_white (start, end);
412  strlength += 1;
413  }
414  if (*start < end) {
415  *start += 1;
416  result = pdf_new_string (string, strlength);
417  } else {
418  result = NULL;
419  }
420  if (string)
421  RELEASE(string);
422  return result;
423 }
424 
425 pdf_obj *parse_pdf_string (char **start, char *end)
426 {
427  pdf_obj *result;
428  int balance = 0;
429  char *save;
430  unsigned char *string;
431  int strlength;
432  skip_white(start, end);
433  save = *start;
434  if (*start == end || **start != '(') {
435  return NULL;
436  }
437  ++(*start);
438  string = NEW (end - *start, unsigned char);
439  strlength = 0;
440  balance = 0;
441  while (*start < end &&
442  (**start != ')' || balance > 0)) {
443  if (**start == '\\')
444  switch (*(++(*start))) {
445  case 'n':
446  string[strlength++] = '\n';
447  (*start)++;
448  break;
449  case 'r':
450  string[strlength++] = '\r';
451  (*start)++;
452  break;
453  case 't':
454  string[strlength++] = '\t';
455  (*start)++;
456  break;
457  case 'b':
458  string[strlength++] = '\b';
459  (*start)++;
460  break;
461  default:
462  if (isdigit(**start)) {
463  int i;
464  string[strlength] = 0;
465  for (i=0; i<3; i++)
466  string[strlength] = string[strlength]*8 + (*((*start)++)-'0');
467  strlength+= 1;
468  } else {
469  string[strlength++] = *((*start)++);
470  }
471  }
472  else {
473  if (**start == '(')
474  balance += 1;
475  if (**start == ')')
476  balance -= 1;
477  string[strlength++] = *((*start)++);
478  }
479  }
480  if (*start >= end) {
481  fprintf (stderr, "\nString object ended prematurely\n");
482  dump (save, *start);
483  return NULL;
484  }
485  (*start)++;
486  result = pdf_new_string (string, strlength);
487  RELEASE (string);
488  return result;
489 }
490 
491 char *parse_c_string (char **start, char *end)
492 {
493  char *string, *save;
494  int strlength;
495  skip_white(start, end);
496  save = *start;
497  if (*start == end || **start != '"') {
498  return NULL;
499  }
500  ++(*start);
501  string = NEW (end - *start, char);
502  strlength = 0;
503  while (*start < end && (**start != '"')) {
504  if (**start == '\\')
505  switch (*(++(*start))) {
506  case '"':
507  string[strlength++] = '"';
508  (*start)++;
509  break;
510  case 'n':
511  string[strlength++] = '\n';
512  (*start)++;
513  break;
514  case 'r':
515  string[strlength++] = '\r';
516  (*start)++;
517  break;
518  case 't':
519  string[strlength++] = '\t';
520  (*start)++;
521  break;
522  case 'b':
523  string[strlength++] = '\b';
524  (*start)++;
525  break;
526  default:
527  if (isdigit(**start)) {
528  int i;
529  string[strlength] = 0;
530  for (i=0; i<3; i++)
531  string[strlength] = string[strlength]*8 + (*((*start)++)-'0');
532  strlength+= 1;
533  } else {
534  string[strlength++] = *((*start)++);
535  }
536  }
537  else {
538  string[strlength++] = *((*start)++);
539  }
540  string[strlength]=0;
541  }
542  if (*start >= end) {
543  fprintf (stderr, "\nString ended prematurely\n");
544  dump (save, *start);
545  return NULL;
546  }
547  string[strlength] = 0;
548  (*start)++;
549  return string;
550 }
551 
552 static pdf_obj *parse_pdf_stream (char **start, char *end, pdf_obj
553  *dict)
554 {
555  pdf_obj *result, *new_dict, *tmp1, *length_obj;
556  unsigned long length;
557  if (pdf_lookup_dict(dict, "F")) {
558  fprintf (stderr, "File streams not implemented (yet)");
559  return NULL;
560  }
561  if ((tmp1 = pdf_lookup_dict(dict, "Length")) == NULL) {
562  fprintf (stderr, "No length specified");
563  return NULL;
564  }
565  length = pdf_number_value (length_obj = pdf_deref_obj (tmp1));
566  pdf_release_obj (length_obj);
567  skip_white(start, end);
568  skip_line(start, end);
569  result = pdf_new_stream(0);
570  new_dict = pdf_stream_dict(result);
571  pdf_merge_dict (new_dict, dict);
572  pdf_release_obj (dict);
573  pdf_add_stream (result, *start, length);
574  *start += length;
575  skip_white(start, end);
576  if (*start+strlen("endstream") > end ||
577  strncmp(*start, "endstream", strlen("endstream"))) {
578  fprintf (stderr, "\nendstream not found\n");
579  return NULL;
580  }
581  *start += strlen("endstream");
582  return result;
583 }
584 
585 pdf_obj *parse_pdf_object (char **start, char *end)
586 {
587  pdf_obj *result, *tmp1=NULL, *tmp2=NULL;
588  char *save = *start;
589  char *position2;
590  skip_white(start, end);
591  if (*start >= end)
592  return NULL;
593  switch (**start) {
594  case '<':
595  /* Check for those troublesome strings starting with '<' */
596  if (*start+1 < end && *(*start+1) != '<') {
597  result = parse_pdf_hex_string (start, end);
598  break;
599  }
600  result = parse_pdf_dict (start, end);
601  skip_white(start, end);
602  if (end - *start > strlen("stream") &&
603  !strncmp(*start, "stream", strlen("stream"))) {
604  result = parse_pdf_stream (start, end, result);
605  }
606  /* Check for stream */
607  break;
608  case '(':
609  result = parse_pdf_string(start, end);
610  break;
611  case '[':
612  result = parse_pdf_array(start, end);
613  break;
614  case '/':
615  result = parse_pdf_name(start, end);
616  break;
617  case '@':
618  result = get_reference(start, end);
619  break;
620  case 't':
621  case 'f':
622  result = parse_pdf_boolean(start, end);
623  break;
624  default:
625  /* This is a bit of a hack, but PDF doesn't easily allow you to
626  tell a number from an indirect object reference with some
627  serious looking ahead */
628 
629  if (*start < end &&
630  (isdigit(**start) || **start == '+' || **start == '-' || **start == '.')) {
632  tmp2 = NULL;
633  /* This could be a # # R type reference. We can't be sure unless
634  we look ahead for the second number and the 'R' */
635  skip_white(start, end);
636  position2 = *start;
637  if (*start < end && isdigit(**start)) {
638  tmp2 = parse_pdf_number(start, end);
639  } else
640  tmp2 = NULL;
641  skip_white(start, end);
642  if (tmp1 != NULL && tmp2 != NULL && *start < end && *((*start)++) == 'R') {
643  result = pdf_new_ref ((unsigned long) pdf_number_value (tmp1),
644  (int) pdf_number_value (tmp2));
646  pdf_release_obj (tmp2);
647  break;
648  }
649  /* Following checks if we got two numbers, but not 'r' */
650  if (tmp1 != NULL && tmp2 != NULL) {
651  pdf_release_obj (tmp2);
652  *start = position2;
653  }
654  result = tmp1;
655  break;
656  }
657  if (*start < end && **start == 'n') {
659  break;
660  }
661  result = NULL;
662  break;
663  }
664  if (result == NULL) {
665  fprintf (stderr, "\nExpecting an object, but didn't find one");
666  *start = save;
667  dump(*start, end);
668  }
669  return result;
670 }
671 
672 void parse_key_val (char **start, char *end, char **key, char **val)
673 {
674  *key = NULL;
675  *val = NULL;
676  skip_white (start, end);
677  if ((*key = parse_c_ident (start, end))) {
678  skip_white (start, end);
679  if (*start < end && **start == '=')
680  {
681  (*start) += 1;
682  skip_white (start, end);
683  if (*start < end) switch (**start) {
684  case '"':
685  *val = parse_c_string (start, end);
686  break;
687  default:
689  }
690  }
691  }
692 }
#define name
#define n
Definition: t4ht.c:1290
#define b
Definition: jpegint.h:372
int strcmp()
Definition: coll.cpp:143
mpz_t * f
Definition: gen-fib.c:34
#define s
Definition: afcover.h:80
#define c(n)
Definition: gpos-common.c:150
#define a(n)
Definition: gpos-common.c:148
#define strchr
Definition: gsftopk.c:59
#define memcpy(d, s, n)
Definition: gsftopk.c:64
#define NULL
Definition: ftobjs.h:61
small capitals from c petite p
Definition: afcover.h:72
small capitals from c petite p scientific i
Definition: afcover.h:80
#define NEW
Definition: gdkanji.c:77
double atof(const char *)
#define RELEASE(p)
Definition: mem.h:39
pdf_obj * pdf_new_array(void)
Definition: pdfobj.c:1421
static pdf_obj * pdf_new_ref(pdf_out *p, pdf_obj *object)
Definition: pdfobj.c:3671
pdf_obj * pdf_new_name(const char *name)
Definition: pdfobj.c:1330
pdf_obj * pdf_new_number(double value)
Definition: pdfobj.c:1076
double pdf_number_value(pdf_obj *object)
Definition: pdfobj.c:1119
void pdf_release_obj(pdf_obj *object)
Definition: pdfobj.c:3217
pdf_obj * pdf_new_boolean(char value)
Definition: pdfobj.c:1034
void pdf_add_array(pdf_obj *array, pdf_obj *object)
Definition: pdfobj.c:1511
pdf_obj * pdf_new_dict(void)
Definition: pdfobj.c:1645
pdf_obj * pdf_new_null(void)
Definition: pdfobj.c:1017
int pdf_add_dict(pdf_obj *dict, pdf_obj *key, pdf_obj *value)
Definition: pdfobj.c:1680
char * parse_number(const char **start, const char *end)
Definition: pdfparse.c:138
char * parse_val_ident(const char **start, const char *end)
Definition: pdfparse.c:204
void skip_white(const char **start, const char *end)
Definition: pdfparse.c:104
void dump(const char *start, const char *end)
Definition: pdfparse.c:74
pdf_obj * parse_pdf_array(const char **pp, const char *endptr, pdf_file *pf)
Definition: pdfparse.c:982
pdf_obj * parse_pdf_dict(const char **pp, const char *endptr, pdf_file *pf)
Definition: pdfparse.c:976
char * parse_opt_ident(const char **start, const char *end)
Definition: pdfparse.c:213
char * parse_unsigned(const char **start, const char *end)
Definition: pdfparse.c:161
pdf_obj * parse_pdf_object(const char **pp, const char *endptr, pdf_file *pf)
Definition: pdfparse.c:988
char * parse_ident(const char **start, const char *end)
Definition: pdfparse.c:195
pdf_obj * parse_pdf_name(const char **pp, const char *endptr)
Definition: pdfparse.c:327
pdf_obj * parse_pdf_null(const char **pp, const char *endptr)
Definition: pdfparse.c:390
pdf_obj * parse_pdf_number(const char **pp, const char *endptr)
Definition: pdfparse.c:224
pdf_obj * parse_pdf_boolean(const char **pp, const char *endptr)
Definition: pdfparse.c:365
#define MEM_END
Definition: mem.h:45
#define MEM_START
Definition: mem.h:44
static pdf_obj * tmp1
Definition: pdfdoc.c:76
pdf_obj * parse_pdf_hex_string(char **start, char *end)
Definition: pdfparse.c:392
#define DUMP_LIMIT
Definition: pdfparse.c:43
char * parse_c_ident(char **start, char *end)
Definition: pdfparse.c:289
int is_a_number(const char *s)
Definition: pdfparse.c:101
int is_an_int(const char *s)
Definition: pdfparse.c:89
void skip_line(char **start, char *end)
Definition: pdfparse.c:67
static char * parse_gen_ident(char **start, char *end, char *valid_chars)
Definition: pdfparse.c:260
char * parse_c_string(char **start, char *end)
Definition: pdfparse.c:491
void parse_key_val(char **start, char *end, char **key, char **val)
Definition: pdfparse.c:672
char * parse_pdf_reference(char **start, char *end)
Definition: pdfparse.c:324
void parse_crap(char **start, char *end)
Definition: pdfparse.c:80
int xtod(char c)
Definition: pdfparse.c:381
#define fprintf
Definition: mendex.h:64
long tell()
int strncmp()
#define isdigit(c)
Definition: snprintf.c:177
real to[600]
Definition: pmxab.c:87
int r
Definition: ppmqvga.c:68
#define isspace(ch)
Definition: utype.h:87
Definition: pdfobj.c:63
Definition: dvips.h:235
Definition: strexpr.c:21
static unsigned char * save
Definition: t1disasm.c:278
int number
Definition: t1part.c:207
*job_name strlen((char *) job_name) - 4)
#define key
Definition: tex2xindy.c:753
char ident[]
Definition: ttf2pk.c:40
@ start
Definition: preamble.c:52
#define end(cp)
Definition: zic.c:71