"Fossies" - the Fresh Open Source Software Archive 
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
1 /* Copyright (C) 2002-2005 Ghostgum Software Pty Ltd. All rights reserved.
2
3 This software is provided AS-IS with no warranty, either express or
4 implied.
5
6 This software is distributed under licence and may not be copied,
7 modified or distributed except as expressly authorised under the terms
8 of the licence contained in the file LICENCE in this distribution.
9
10 For more information about licensing, please refer to
11 http://www.ghostgum.com.au/ or contact Ghostsgum Software Pty Ltd,
12 218 Gallaghers Rd, Glen Waverley VIC 3150, AUSTRALIA,
13 Fax +61 3 9886 6616.
14 */
15
16 /* $Id: cpdfscan.c,v 1.7 2005/06/10 09:39:24 ghostgum Exp $ */
17 /* PDF scanner */
18
19 /* This is a rudimentary PDF scanner, intended to get
20 * the page count, and for each page the Rotate, MediaBox
21 * and CropBox.
22 */
23
24 #ifdef DEMO_PDFSCAN
25 # include <windows.h>
26 # include <stdio.h>
27 # include <stdarg.h>
28 # include <string.h>
29 # include <ctype.h>
30 # ifdef _MSC_VER
31 # define vsnprintf _vsnprintf
32 # endif
33 # define csfopen fopen
34 # define cslen strlen
35 #else
36 # include "common.h"
37 # include <ctype.h>
38 #endif
39
40 #include "cpdfscan.h"
41
42
43 /* Limitations.
44 *
45 * We currently load the entire xref table. To minimise memory
46 * we could instead keep a list of xref blocks, and do random
47 * access within those.
48 *
49 * Memory management is very simple. We just keep a linked
50 * list of allocated blocks for composite objects.
51 * We empty the stack, and free all PDF objects and composite
52 * objects before returning to the caller.
53 * We don't bother doing garbage collection.
54 */
55
56
/* Bookkeeping node for one heap block that backs a composite object
 * (name, string, array or dict).  The nodes form a singly linked list
 * so that every block can be released in a single pass.
 */
typedef struct PDFMEM_s {
    void *ptr;                  /* the allocated payload */
    int len;                    /* size of the payload in bytes */
    struct PDFMEM_s *next;      /* following node, NULL at the tail */
} PDFMEM;
66
/* Value types known to the token scanner and to object references.
 * The numeric values are also used as indices into rtype_string[],
 * so the two lists must stay in step.
 */
typedef enum rtype_e {
    invalidtype = 0,
    marktype    = 1,
    nulltype    = 2,
    booltype    = 3,    /* value.boolval */
    integertype = 4,    /* value.intval */
    realtype    = 5,    /* value.realval */
    nametype    = 6,    /* value.nameval */
    stringtype  = 7,    /* value.strval */
    arraytype   = 8,    /* value.arrayval */
    dicttype    = 9,    /* value.dictval */
    optype      = 10,   /* value.opval */
    streamtype  = 11,   /* value.streamval */
    objtype     = 12,   /* value.objval */
    commenttype = 13
} rtype;
84
/* Printable name of each rtype value, indexed by the enum value. */
const char *rtype_string[] = {
    "invalidtype",      /*  0 */
    "marktype",         /*  1 */
    "nulltype",         /*  2 */
    "booltype",         /*  3 */
    "integertype",      /*  4 */
    "realtype",         /*  5 */
    "nametype",         /*  6 */
    "stringtype",       /*  7 */
    "arraytype",        /*  8 */
    "dicttype",         /*  9 */
    "optype",           /* 10 */
    "streamtype",       /* 11 */
    "objtype",          /* 12 */
    "commenttype"       /* 13 */
};
90
91 /* A reference contains a simple object, or a pointer to
92 * a composite object.
93 */
94 typedef struct ref_s ref;
95 struct ref_s {
96 rtype type;
97 int rsize;
98 union value_u {
99 /* simple */
100 void *voidval;
101 BOOL boolval;
102 int intval;
103 float realval;
104 /* composite */
105 char *nameval;
106 char *strval;
107 ref *arrayval;
108 ref *dictval;
109 char *opval;
110 /* simple */
111 unsigned long streamval;
112 int objval;
113 } value;
114 };
115
116 /* Cross reference table entry */
117 typedef struct PDFXREF_s {
118 unsigned long offset;
119 int generation;
120 BOOL used;
121 } PDFXREF;
122
123 struct PDFSCAN_s {
124 void *handle;
125 int (*print_fn)(void *handle, const char *ptr, int len);
126 TCHAR filename[1024];
127 FILE *file;
128 char *buf;
129 int buflen; /* length of allocated buf */
130 int len; /* #bytes currently in buf */
131 int offset; /* file offset to start of buf */
132 int begin; /* offset in buf to start of token */
133 int end; /* offset in buf to end of token */
134 rtype token_type; /* token type */
135 BOOL instream; /* In a stream, looking for endstream */
136 unsigned long xref_offset; /* offset to xref table */
137 PDFXREF *xref;
138 int xref_len;
139
140 /* Object numbers obtained during pdf_scan_open() */
141 int root; /* root object reference */
142 int info; /* document info dicionary reference */
143 int pages; /* Pages dictionary reference */
144 int page_count; /* number of pages */
145
146 /* Cached page media */
147 int pagenum;
148 int rotate;
149 PDFBBOX mediabox;
150 PDFBBOX cropbox;
151
152 /* memory allocation */
153 PDFMEM *memory_head;
154 PDFMEM *memory_tail;
155
156 /* operand stack */
157 ref *ostack;
158 int ostack_idx; /* index to top of ostack */
159 int ostack_len; /* Initially 512 */
160 int ostack_maxlen; /* maximum depth of ostack */
161
162 /* objects in memory */
163 /* This contains pairs of integer & reference */
164 ref *objs;
165 int objs_count; /* count of loaded objects */
166 int objs_len; /* length of objs */
167 int objs_maxlen; /* maximum number entries in objs */
168 };
169
/* Origin selector for pdf_scan_seek() */
typedef enum PDFSEEK_e {
    PDFSEEK_CUR = 0,
    PDFSEEK_END = 1,
    PDFSEEK_SET = 2
} PDFSEEK;
175
176
177 /* Prototypes */
178 static int pdf_scan_next_token(PDFSCAN *ps);
179 static int pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev);
180 static int pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset);
181
182 static void clear_stack(PDFSCAN *ps);
183 static void clear_objs(PDFSCAN *ps);
184 static void pdf_scan_freeall(PDFSCAN *ps);
185 static void pdf_scan_cleanup(PDFSCAN *ps);
186 static int pdf_scan_open_file(PDFSCAN *ps);
187
188
189 /*****************************************************************/
190 /* text message output */
191
192 static int
193 pdf_scan_write(PDFSCAN *ps, const char *str, int len)
194 {
195 if (ps != NULL)
196 fwrite(str, 1, len, stdout);
197 else
198 (*ps->print_fn)(ps->handle, str, len);
199 return len;
200 }
201
202 static int
203 pdf_scan_msgf(PDFSCAN *ps, const char *fmt, ...)
204 {
205 va_list args;
206 int count;
207 char buf[2048];
208 va_start(args,fmt);
209 count = vsnprintf(buf, sizeof(buf), fmt, args);
210 pdf_scan_write(ps, buf, count);
211 va_end(args);
212 return count;
213 }
214
215 /*****************************************************************/
216 /* memory allocation */
217
218 static void
219 pdf_scan_cleanup(PDFSCAN *ps)
220 {
221 if (ps->file)
222 fclose(ps->file);
223 ps->file = NULL;
224 clear_stack(ps);
225 clear_objs(ps);
226 pdf_scan_freeall(ps);
227 }
228
229 static void *pdf_scan_alloc(PDFSCAN *ps, const void *ptr, int len)
230 {
231 void *data;
232 PDFMEM *mem = (PDFMEM *)malloc(sizeof(PDFMEM));
233 if (mem == NULL)
234 return NULL;
235
236 data = malloc(len);
237 if (data == NULL) {
238 free(mem);
239 return NULL;
240 }
241
242 mem->ptr = data;
243 mem->next = NULL;
244 mem->len = len;
245 memcpy(data, ptr, len);
246
247 if (ps->memory_tail) {
248 ps->memory_tail->next = mem;
249 ps->memory_tail = mem;
250 }
251 else
252 ps->memory_head = ps->memory_tail = mem;
253 return data;
254 }
255
256 /* free all name/string/array/dict memory */
257 static void
258 pdf_scan_freeall(PDFSCAN *ps)
259 {
260 PDFMEM *memnext;
261 PDFMEM *mem = ps->memory_head;
262 while (mem) {
263 memnext = mem->next;
264 free(mem->ptr);
265 free(mem);
266 mem = memnext;
267 }
268 ps->memory_head = ps->memory_tail = NULL;
269 }
270
271 /*****************************************************************/
272 /* Token checks */
273
274 static BOOL is_optoken(PDFSCAN *ps, const char *str)
275 {
276 return (ps->token_type == optype) &&
277 (ps->end-ps->begin == (int)strlen(str)) &&
278 (memcmp(ps->buf+ps->begin, str, ps->end-ps->begin) == 0);
279 }
280
281 static int
282 type_check(PDFSCAN *ps, rtype type)
283 {
284 if (ps->token_type == type)
285 return 0;
286
287 pdf_scan_msgf(ps, "Error at offset %ld. Expecting %s and found %s\n",
288 ps->offset + ps->begin,
289 rtype_string[(int)type],
290 rtype_string[(int)ps->token_type]);
291 pdf_scan_msgf(ps, "Token is \042");
292 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
293 pdf_scan_msgf(ps, "\042\n");
294 return -1;
295 }
296
297 static int
298 op_check(PDFSCAN *ps, const char *str)
299 {
300 int code = type_check(ps, optype);
301 if (code)
302 return code;
303
304 if (!is_optoken(ps, str)) {
305 pdf_scan_msgf(ps,
306 "Error at offset %ld. Expecting \042%s\042 and found \042",
307 ps->offset + ps->begin, str);
308 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
309 pdf_scan_msgf(ps, "\042\n");
310 code = -1;
311 }
312 return code;
313 }
314
315 /*****************************************************************/
316 /* stack */
317
318 const ref invalidref = {invalidtype, 0, {NULL}};
319 const ref markref = {marktype, 0, {NULL}};
320
321 /* Push item, return depth of stack */
322 /* >0 is success, <=0 is failure */
323 static int push_stack(PDFSCAN *ps, ref r)
324 {
325 int idx;
326 if (ps->ostack_idx + 1 >= ps->ostack_len) {
327 /* increase stack size */
328 ref *newstack;
329 int newlen = ps->ostack_len + 256;
330 if (newlen > ps->ostack_maxlen) {
331 pdf_scan_msgf(ps, "push_stack: stack overflow\n");
332 return 0;
333 }
334 newstack = (ref *)malloc(newlen * sizeof(ref));
335 if (newstack == NULL) {
336 pdf_scan_msgf(ps, "push_stack: Out of memory\n");
337 return 0;
338 }
339 memcpy(newstack, ps->ostack, ps->ostack_len * sizeof(ref));
340 free(ps->ostack);
341 ps->ostack = newstack;
342 ps->ostack_len = newlen;
343 }
344 idx = ++(ps->ostack_idx);
345 ps->ostack[idx] = r;
346 return idx;
347 }
348
349 static ref pop_stack(PDFSCAN *ps)
350 {
351 if (ps->ostack_idx <= 0) {
352 pdf_scan_msgf(ps, "pop_stack: stack underflow\n");
353 return invalidref;
354 }
355 return ps->ostack[ps->ostack_idx--];
356 }
357
358 static void clear_stack(PDFSCAN *ps)
359 {
360 ps->ostack_idx = 0;
361 }
362
363 static ref index_stack(PDFSCAN *ps, int n)
364 {
365 if (n < 0) {
366 pdf_scan_msgf(ps, "index_stack: index must not be negative\n");
367 return invalidref;
368 }
369 if (ps->ostack_idx <= n) {
370 pdf_scan_msgf(ps, "index_stack: stack isn't that deep\n");
371 return invalidref;
372 }
373 return ps->ostack[ps->ostack_idx-n];
374 }
375
376 static ref top_stack(PDFSCAN *ps)
377 {
378 if (ps->ostack_idx <= 0) {
379 pdf_scan_msgf(ps, "top_stack: stack is empty\n");
380 return invalidref;
381 }
382 return ps->ostack[ps->ostack_idx];
383 }
384
385 /*****************************************************************/
386 /* references */
387
388
389 static ref make_int(int value)
390 {
391 ref r;
392 r.type = integertype;
393 r.rsize = 0;
394 r.value.intval = value;
395 return r;
396 }
397
398 static ref make_string(PDFSCAN *ps, const char *str, int len)
399 {
400 ref r;
401 r.type = stringtype;
402 r.rsize = len;
403 r.value.strval = pdf_scan_alloc(ps, str, len);
404 if (r.value.strval == NULL)
405 return invalidref;
406 return r;
407 }
408
409 static ref make_name(PDFSCAN *ps, const char *str, int len)
410 {
411 ref r;
412 r.type = nametype;
413 r.rsize = len;
414 r.value.nameval = pdf_scan_alloc(ps, str, len);
415 if (r.value.nameval == NULL)
416 return invalidref;
417 return r;
418 }
419
420 static BOOL nameref_equals(ref *r, const char *name)
421 {
422 int len = (int)strlen(name);
423 if (r->type != nametype)
424 return FALSE;
425 if (r->rsize != len)
426 return FALSE;
427 return (memcmp(r->value.nameval, name, len) == 0);
428 }
429
430 /* Get a reference from a dictionary */
431 /* Return the result, but don't push it */
432 static ref dict_get(PDFSCAN *ps, const char *name)
433 {
434 int namelen = (int)strlen(name);
435 ref dict = top_stack(ps);
436 ref *r;
437 int dictlen;
438 int i;
439 if (dict.type == invalidtype)
440 return invalidref;
441 dictlen = dict.rsize * 2;
442 for (i = 0; i<dictlen; i+=2) {
443 r = &dict.value.dictval[i];
444 if ((r->rsize == namelen) && (r->type == nametype) &&
445 (memcmp(r->value.nameval, name, namelen) ==0))
446 return dict.value.dictval[i+1];
447 }
448 return invalidref;
449 }
450
451 /* convert the items on the stack to an array on the stack */
452 static ref array_to_mark(PDFSCAN *ps)
453 {
454 ref r;
455 ref *array;
456 int n = ps->ostack_idx;
457 int len;
458 while ((n>0) && (ps->ostack[n].type != marktype))
459 n--;
460 if (n == 0) {
461 pdf_scan_msgf(ps, "array_to_mark: no mark on stack\n");
462 return invalidref;
463 }
464 len = ps->ostack_idx - n;
465 r.type = arraytype;
466 r.rsize = len;
467 r.value.arrayval = NULL;
468 if (len) {
469 array = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
470 if (array)
471 r.value.arrayval = array;
472 else
473 return invalidref;
474 }
475 ps->ostack_idx -= len + 1;
476 push_stack(ps, r);
477 return r;
478 }
479
480 /* convert the items on the stack to a dictionary on the stack */
481 static ref dict_to_mark(PDFSCAN *ps)
482 {
483 ref r;
484 ref *dict;
485 int n = ps->ostack_idx;
486 int len;
487 while ((n>0) && (ps->ostack[n].type != marktype))
488 n--;
489 if (n == 0) {
490 pdf_scan_msgf(ps, "dict_to_mark: no mark on stack\n");
491 return invalidref;
492 }
493 len = ps->ostack_idx - n;
494 if (len & 1) {
495 pdf_scan_msgf(ps, "dict_to_mark: must have name/value pairs\n");
496 return invalidref;
497 }
498 r.type = dicttype;
499 r.rsize = len/2;
500 r.value.arrayval = NULL;
501 if (len) {
502 dict = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
503 if (dict)
504 r.value.arrayval = dict;
505 else
506 return invalidref;
507 }
508 ps->ostack_idx -= len + 1;
509 push_stack(ps, r);
510 return r;
511 }
512
513 /*****************************************************************/
514
515 /* Push reference from a token */
516 static ref push_token(PDFSCAN *ps)
517 {
518 ref r;
519 int len = ps->end - ps->begin;
520 const char *p = ps->buf + ps->begin;
521 r.type = ps->token_type;
522 r.rsize = 0;
523 r.value.voidval = NULL;
524 switch(r.type) {
525 case invalidtype:
526 break;
527 case marktype:
528 break;
529 case nulltype:
530 break;
531 case booltype:
532 if ((len == 4) && (memcmp(p, "true", 4)==0))
533 r.value.boolval = TRUE;
534 else if ((len == 5) && (memcmp(p, "true", 5)==0))
535 r.value.boolval = FALSE;
536 else
537 r = invalidref;
538 break;
539 case integertype:
540 { char buf[64];
541 if (len > (int)sizeof(buf)-1)
542 r = invalidref;
543 else {
544 memcpy(buf, p, len);
545 buf[len] = '\0';
546 r.value.intval = atoi(buf);
547 }
548 }
549 break;
550 case realtype:
551 { char buf[64];
552 if (len > (int)sizeof(buf)-1)
553 r = invalidref;
554 else {
555 memcpy(buf, p, len);
556 buf[len] = '\0';
557 r.value.realval = (float)atof(buf);
558 }
559 }
560 break;
561 case nametype:
562 r = make_name(ps, p+1, len-1);
563 break;
564 case stringtype:
565 r = make_string(ps, p, len);
566 break;
567 case streamtype:
568 case commenttype:
569 case objtype:
570 case optype:
571 case arraytype:
572 case dicttype:
573 /* Can't push these from a token */
574 /* These are made by operators like stream, R, ], >> */
575 return invalidref;
576 default:
577 r.type = invalidtype;
578 break;
579 }
580 push_stack(ps, r);
581 return r;
582 }
583
584 /* Process known operators */
585 static int process_op(PDFSCAN *ps)
586 {
587 ref r;
588 if (ps->token_type != optype)
589 return 1; /* not an op */
590 if (is_optoken(ps, "R")) {
591 /* convert "n 0 R" to an indirect reference */
592 ref r1 = index_stack(ps, 1);
593 r = top_stack(ps);
594 if ((r.type == integertype) && (r1.type == integertype)) {
595 r.type = objtype;
596 r.rsize = r.value.intval;
597 r.value.intval = r1.value.intval;
598 pop_stack(ps);
599 pop_stack(ps);
600 push_stack(ps, r);
601 }
602 }
603 else if (is_optoken(ps, "]")) {
604 array_to_mark(ps);
605 }
606 else if (is_optoken(ps, ">>")) {
607 dict_to_mark(ps);
608 }
609 else if (is_optoken(ps, "null")) {
610 r.type = nulltype;
611 r.rsize = 0;
612 r.value.voidval = NULL;
613 push_stack(ps, r);
614 }
615 else if (is_optoken(ps, "obj")) {
616 pdf_scan_msgf(ps, "ignoring obj token\n");
617 /* ignore */
618 }
619 else if (is_optoken(ps, "endobj")) {
620 pdf_scan_msgf(ps, "ignoring endobj token\n");
621 /* ignore */
622 }
623 else if (is_optoken(ps, "stream")) {
624 /* stream object contains offset to start of stream */
625 r.type = streamtype;
626 r.rsize = 0;
627 r.value.streamval = ps->offset + ps->end;
628 push_stack(ps, r);
629 /* Now skip over stream */
630 pdf_scan_next_token(ps);
631 }
632 else {
633 pdf_scan_msgf(ps, "process_op: unrecognised operator \042");
634 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
635 pdf_scan_msgf(ps, "\042\n");
636 return -1;
637 }
638 return 0;
639 }
640
641 /*****************************************************************/
642 /* Debugging and error messages */
643
644 #ifdef NOTUSED
645
646 /* Print a reference, returning number of characters written */
647 static int
648 print_ref(PDFSCAN *ps, ref *r)
649 {
650 int n = 0;
651 switch(r->type) {
652 case invalidtype:
653 n = pdf_scan_msgf(ps, "--invalid--");
654 break;
655 case marktype:
656 n = pdf_scan_msgf(ps, "--mark--");
657 break;
658 case nulltype:
659 n = pdf_scan_msgf(ps, "--null--");
660 break;
661 case booltype:
662 n = pdf_scan_msgf(ps, "%s", r->value.boolval ? "true" : "false");
663 break;
664 case integertype:
665 n = pdf_scan_msgf(ps, "%d", r->value.intval);
666 break;
667 case realtype:
668 n = pdf_scan_msgf(ps, "%g", r->value.realval);
669 break;
670 case nametype:
671 n = pdf_scan_write(ps, "/", 1);
672 pdf_scan_write(ps, r->value.nameval, r->rsize);
673 break;
674 case stringtype:
675 n = pdf_scan_write(ps, "(", 1);
676 n += pdf_scan_write(ps, r->value.strval, r->rsize);
677 n += pdf_scan_write(ps, ")", 1);
678 break;
679 case streamtype:
680 n = pdf_scan_msgf(ps, "--stream:%d--", r->value.streamval);
681 break;
682 case commenttype:
683 n = pdf_scan_msgf(ps, "--comment--");
684 break;
685 case objtype:
686 n = pdf_scan_msgf(ps, "--obj:%d--", r->value.objval);
687 break;
688 case optype:
689 n = pdf_scan_msgf(ps, "--op:");
690 n += pdf_scan_write(ps, r->value.opval, r->rsize);
691 n += pdf_scan_write(ps, "--", 2);
692 break;
693 case arraytype:
694 n = pdf_scan_msgf(ps, "--array:%d--", r->rsize);
695 break;
696 case dicttype:
697 n = pdf_scan_msgf(ps, "--dict:%d--", r->rsize);
698 break;
699 default:
700 n = pdf_scan_msgf(ps, "--unknown--");
701 break;
702 }
703 return n;
704 }
705
706 /* print a reference, expanding array and dict */
707 static int
708 print_ref_expand(PDFSCAN *ps, ref *r)
709 {
710 int i;
711 int n = 0;;
712 if (r->type == arraytype) {
713 n += pdf_scan_msgf(ps, "[ ");
714 for (i=0; i<r->rsize; i++) {
715 n += print_ref(ps, &r->value.arrayval[i]);
716 n += pdf_scan_msgf(ps, " ");
717 }
718 n += pdf_scan_msgf(ps, "]");
719 }
720 else if (r->type == dicttype) {
721 n += pdf_scan_msgf(ps, "<< ");
722 for (i=0; i<r->rsize; i++) {
723 n += print_ref(ps, &r->value.dictval[i+i]);
724 n += pdf_scan_msgf(ps, " ");
725 n += print_ref(ps, &r->value.dictval[i+i+1]);
726 n += pdf_scan_msgf(ps, " ");
727 }
728 n += pdf_scan_msgf(ps, ">>");
729 }
730 else
731 n += print_ref(ps, r);
732 return n;
733 }
734
735 static void
736 print_stack(PDFSCAN *ps)
737 {
738 int i, n=ps->ostack_idx;
739 int col = 0;
740 pdf_scan_msgf(ps, "Stack: ");
741 for (i=1; i<=n; i++) {
742 col += print_ref(ps, &ps->ostack[i]);
743 if (col > 70) {
744 pdf_scan_msgf(ps, "\n");
745 col = 0;
746 }
747 else
748 col += pdf_scan_msgf(ps, " ");
749 }
750 pdf_scan_msgf(ps, "\n");
751 }
752
753 static void
754 print_stack_expand(PDFSCAN *ps)
755 {
756 int i, n=ps->ostack_idx;
757 pdf_scan_msgf(ps, "Stack:\n");
758 for (i=1; i<=n; i++) {
759 pdf_scan_msgf(ps, "%2d: ", i);
760 print_ref_expand(ps, &ps->ostack[i]);
761 pdf_scan_msgf(ps, "\n");
762 }
763 }
764
765 static void pdf_scan_print_allocated(PDFSCAN *ps)
766 {
767 int count = 0;
768 int len = 0;
769 PDFMEM *mem = ps->memory_head;
770 while (mem) {
771 len += sizeof(PDFMEM);
772 len += mem->len;
773 count++;
774 mem = mem->next;
775 }
776 pdf_scan_msgf(ps, "Allocated memory %d bytes in %d objects\n",
777 len, count);
778 }
779
780 #endif
781
782 /*****************************************************************/
783 /* object reading and cache */
784
785 static int obj_add(PDFSCAN *ps, int objnum, ref objref)
786 {
787 if (ps->objs_count + 2 >= ps->objs_len) {
788 /* allocate more space */
789 ref *newobjs;
790 int newlen = ps->objs_len + 256;
791 if (newlen > ps->objs_maxlen) {
792 pdf_scan_msgf(ps, "obj_add: too many objects to cache\n");
793 return 0;
794 }
795 newobjs = (ref *)malloc(newlen * sizeof(ref));
796 if (newobjs == NULL) {
797 pdf_scan_msgf(ps, "obj_add: Out of memory\n");
798 return 0;
799 }
800 memcpy(newobjs, ps->objs, ps->objs_len * sizeof(ref));
801 free(ps->objs);
802 ps->objs = newobjs;
803 ps->objs_len = newlen;
804 }
805 ps->objs[ps->objs_count++] = make_int(objnum);
806 ps->objs[ps->objs_count++] = objref;
807 return ps->objs_count;
808 }
809
810 static ref obj_find(PDFSCAN *ps, int objnum)
811 {
812 int i;
813 for (i=0; i<ps->objs_count; i+=2) {
814 if (objnum == ps->objs[i].value.intval)
815 return ps->objs[i+1];
816 }
817 return invalidref;
818 }
819
820 static void clear_objs(PDFSCAN *ps)
821 {
822 ps->objs_count = 0;
823 }
824
825 /*****************************************************************/
826 /* token parsing */
827
/* Return non-zero if ch is one of the white space characters
 * NUL, tab, line feed, form feed, carriage return or space.
 */
static int is_white(char ch)
{
    switch (ch) {
      case '\0':
      case '\t':
      case '\n':
      case '\f':
      case '\r':
      case ' ':
        return 1;
      default:
        return 0;
    }
}
833
/* Return non-zero if ch is one of the delimiter characters
 * ( ) < > [ ] { } / %
 */
static int is_delimiter(char ch)
{
    static const char delimiters[10] = {
        '(', ')', '<', '>', '[', ']', '{', '}', '/', '%'
    };
    return memchr(delimiters, ch, sizeof(delimiters)) != NULL;
}
842
843
844 /* Scan next token from buffer, returning token type and offset to begin
845 * and end of token.
846 * Return 0 if OK, 1 if no token or not enough data, -1 on error
847 */
848 static int pdf_scan_token(const char *buf, int buflen,
849 rtype *ttype, int *tbegin, int *tend)
850 {
851 int code = -1;
852 int i = 0;
853 rtype type;
854 int begin, end;
855 *ttype = type = invalidtype;
856 *tbegin = begin = 0;
857 *tend = end = 0;
858 while ((i < buflen) && is_white(buf[i]))
859 i++;
860 if (i == buflen)
861 return 1;
862
863 begin = i;
864 if (buf[i] == '%') {
865 while (i < buflen) {
866 if ((buf[i] == '\n') || (buf[i] == '\r')) {
867 type = commenttype;
868 end = i;
869 code = 0;
870 break;
871 }
872 i++;
873 }
874 if (i >= buflen)
875 code = 1;
876
877 }
878 else if (buf[i] == '(') {
879 /* string */
880 int pcount = 0;
881 type = stringtype;
882 i++;
883 while (i < buflen) {
884 if (buf[i] == '\\')
885 i++;
886 else if (buf[i] == '(')
887 pcount++;
888 else if (buf[i] == ')') {
889 if (pcount <= 0) {
890 end = i+1;
891 code = 0;
892 break;
893 }
894 else
895 pcount--;
896 }
897 i++;
898 }
899 if (i >= buflen)
900 code = 1;
901 }
902 else if (buf[i] == '<') {
903 i++;
904 if (i >= buflen) {
905 code = 1;
906 }
907 else if (buf[i] == '<') {
908 /* marktype */
909 end = i+1;
910 type = marktype;
911 code = 0;
912 }
913 else {
914 /* hexadecimal string */
915 type = stringtype;
916 while (i < buflen) {
917 if (buf[i] == '>') {
918 end = i+1;
919 code = 0;
920 break;
921 }
922 i++;
923 }
924 if (i >= buflen)
925 code = 1;
926 }
927 }
928 else if (buf[i] == '[') {
929 code = 0;
930 end = i+1;
931 type = marktype;
932 }
933 else if (buf[i] == '/') {
934 /* name */
935 type = nametype;
936 i++;
937 while (i < buflen) {
938 if (is_white(buf[i]) || is_delimiter(buf[i])) {
939 end = i;
940 code = 0;
941 break;
942 }
943 i++;
944 }
945 if (i >= buflen)
946 code = 1;
947 }
948 else if (is_delimiter(buf[i])) {
949 /* skip over delimiter */
950 if (buf[i] == '>') {
951 i++;
952 if (i < buflen) {
953 if (buf[i] == '>') {
954 type = optype;
955 end = i+1;
956 code = 0;
957 }
958 else
959 code = -1;
960 }
961 }
962 else {
963 type = optype;
964 end = i+1;
965 code = 0;
966 }
967 if (i >= buflen)
968 code = 1;
969 }
970 else {
971 /* First assume that it is an op */
972 type = optype;
973 while (i < buflen) {
974 if (is_white(buf[i]) || is_delimiter(buf[i])) {
975 end = i;
976 code = 0;
977 break;
978 }
979 i++;
980 }
981 if (i >= buflen)
982 code = 1;
983
984 /* try to convert it into a bool */
985 if ((code == 0) && (type == optype)) {
986 if ((end - begin == 4) &&
987 (memcmp(buf+begin, "true", 4) == 0)) {
988 type = booltype;
989 }
990 else if ((end - begin == 5) &&
991 (memcmp(buf+begin, "false", 5) == 0)) {
992 type = booltype;
993 }
994 }
995
996 /* try to convert it into an integer */
997 if ((code == 0) && (type == optype)) {
998 int j;
999 char ch;
1000 BOOL isreal = FALSE;
1001 BOOL isnum = TRUE;
1002 for (j=begin; j<end; j++) {
1003 ch = buf[j];
1004 if (ch == '.')
1005 isreal = TRUE;
1006 if (!((ch == '-') || (ch == '+') || (ch == '.') ||
1007 isdigit((int)ch)))
1008 isnum = FALSE;
1009 }
1010 if (isnum) {
1011 if (isreal)
1012 type = realtype;
1013 else
1014 type = integertype;
1015 }
1016 }
1017 }
1018
1019 *ttype = type;
1020 *tbegin = begin;
1021 *tend = end;
1022 return code;
1023 }
1024
1025 /*****************************************************************/
1026
1027 static void pdf_scan_finish(PDFSCAN *ps)
1028 {
1029 if (ps->file) {
1030 fclose(ps->file);
1031 ps->file = NULL;
1032 }
1033 if (ps->buf) {
1034 free(ps->buf);
1035 ps->buf = NULL;
1036 }
1037 ps->buflen = 0;
1038 if (ps->xref) {
1039 free(ps->xref);
1040 ps->xref = NULL;
1041 }
1042 ps->xref_len = 0;
1043 if (ps->ostack) {
1044 free(ps->ostack);
1045 ps->ostack = NULL;
1046 }
1047 ps->ostack_len = 0;
1048 ps->ostack_idx = 0;
1049
1050 if (ps->objs) {
1051 free(ps->objs);
1052 ps->objs = NULL;
1053 }
1054 ps->objs_len = 0;
1055 ps->objs_count = 0;
1056 memset(ps, 0, sizeof(PDFSCAN));
1057 }
1058
1059 static int pdf_scan_open_file(PDFSCAN *ps)
1060 {
1061 ps->file = csfopen(ps->filename, TEXT("rb"));
1062 if (ps->file == NULL)
1063 return -1;
1064 return 0;
1065 }
1066
1067 static int pdf_scan_init(PDFSCAN *ps, const TCHAR *name)
1068 {
1069 int len = (int)(cslen(name)+1) * sizeof(TCHAR);
1070 if (len > (int)sizeof(ps->filename))
1071 return -1;
1072 memcpy(ps->filename, name, len);
1073 if (pdf_scan_open_file(ps) != 0)
1074 return -1;
1075 ps->buflen = 256;
1076 ps->buf = (char *)malloc(ps->buflen);
1077 if (ps->buf == NULL) {
1078 pdf_scan_finish(ps);
1079 return -2;
1080 }
1081 ps->ostack_maxlen = 4096;
1082 ps->ostack_len = 256;
1083 ps->ostack_idx = 0; /* empty */
1084 ps->ostack = (ref *)malloc(ps->ostack_len * sizeof(ref));
1085 if (ps->ostack == NULL) {
1086 pdf_scan_finish(ps);
1087 return -2;
1088 }
1089 /* make first item on stack invalid */
1090 ps->ostack[0].type = invalidtype;
1091 ps->ostack[0].rsize = 0;
1092 ps->ostack[0].value.voidval = NULL;
1093
1094 /* object cache */
1095 ps->objs_maxlen = 1024;
1096 ps->objs_len = 256;
1097 ps->objs_count = 0; /* empty */
1098 ps->objs = (ref *)malloc(ps->objs_len * sizeof(ref));
1099 if (ps->objs == NULL) {
1100 pdf_scan_finish(ps);
1101 return -2;
1102 }
1103
1104 ps->pagenum = -1; /* no cached media info yet */
1105
1106 return 0;
1107 }
1108
1109 static int pdf_scan_seek(PDFSCAN *ps, long offset, PDFSEEK whence)
1110 {
1111 int code = -1;
1112 switch (whence) {
1113 case PDFSEEK_CUR:
1114 offset = ps->offset + ps->end + offset;
1115 case PDFSEEK_SET:
1116 ps->begin = ps->end = ps->len = 0;
1117 code = fseek(ps->file, offset, SEEK_SET);
1118 ps->offset = offset;
1119 break;
1120 case PDFSEEK_END:
1121 code = fseek(ps->file, 0, SEEK_END);
1122 ps->begin = ps->end = ps->len = 0;
1123 ps->offset = ftell(ps->file);
1124 break;
1125 }
1126 return code;
1127 }
1128
1129 /* Read next token from PDF file */
1130 /* Return 0 if OK, or -1 if EOF, -2 if error */
1131 /* Set *token_type to token type */
1132 static int pdf_scan_next_token(PDFSCAN *ps)
1133 {
1134 int code = 0;
1135 int count;
1136 rtype type=invalidtype;
1137 int begin=0, end=0;
1138
1139 do {
1140 if ((code == 1) && ps->end) {
1141 /* move characters to front of buffer */
1142 if (ps->len - ps->end)
1143 memmove(ps->buf, ps->buf+ps->end, ps->len - ps->end);
1144 ps->offset += ps->end;
1145 ps->len = ps->len - ps->end;
1146 ps->begin = 0;
1147 ps->end = 0;
1148 }
1149
1150 if ((code == 1) && (ps->len >= ps->buflen)) {
1151 /* increase buffer size */
1152 char *newbuf;
1153 int newbuflen = 2 * ps->buflen;
1154 newbuf = (char *)malloc(newbuflen);
1155 if (newbuf) {
1156 memcpy(newbuf, ps->buf, ps->buflen);
1157 free(ps->buf);
1158 ps->buf = newbuf;
1159 ps->buflen = newbuflen;
1160 }
1161 else {
1162 pdf_scan_msgf(ps, "Out of memory in pdf_scan_next_token\n");
1163 pdf_scan_msgf(ps, "Tried to realloc %d to %d\n",
1164 ps->buflen, newbuflen);
1165 code = -2;
1166 break;
1167 }
1168 }
1169
1170 if ((code == 1) || (ps->len == 0)) {
1171 count = (int)fread(ps->buf+ps->len, 1, ps->buflen-ps->len,
1172 ps->file);
1173 if (count == 0) {
1174 pdf_scan_msgf(ps, "EOF in pdf_scan_next_token\n");
1175 code = -1;
1176 break;
1177 }
1178 ps->len += count;
1179 }
1180
1181 while (ps->instream) {
1182 /* We are in a stream. Keep reading until we find
1183 * the endstream. This isn't robust. It can be fooled
1184 * by "endstream" occuring within a stream.
1185 */
1186 while ((ps->end < ps->len) && (ps->buf[ps->end] != 'e'))
1187 ps->end++;
1188 /* look for endstream */
1189 if (ps->end + 9 >= ps->len) {
1190 code = 1; /* need more */
1191 break;
1192 }
1193 if (memcmp(ps->buf+ps->end, "endstream", 9) == 0)
1194 ps->instream = FALSE;
1195 else
1196 ps->end++;
1197 }
1198 if (!ps->instream)
1199 code = pdf_scan_token(ps->buf+ps->end, ps->len - ps->end,
1200 &type, &begin, &end);
1201 } while (code == 1);
1202
1203
1204 if (code == 0) {
1205 /* got a token */
1206 ps->begin = ps->end + begin;
1207 ps->end = ps->end + end;
1208 ps->token_type = type;
1209
1210 if ((type == optype) && (ps->end-ps->begin == 6) &&
1211 (memcmp(ps->buf+ps->begin, "stream", 6) == 0))
1212 ps->instream = TRUE;
1213 }
1214
1215 return code;
1216 }
1217
1218 /*****************************************************************/
1219 /* Reading %%EOF, xref, trailer */
1220
/* Find the start of the last line in str[0..len-1].
 *
 * Any end-of-line characters at the very end are skipped first.  The
 * search then runs backwards to the preceding end-of-line character
 * and the index of the character following it is returned.
 * Returns -1 if no end-of-line character precedes the line (which
 * includes a line starting at index 0), or if len is not positive.
 *
 * The len check prevents a read of str[-1] for an empty range.
 */
static int
previous_line(const char *str, int len)
{
    int i;

    if (len <= 0)
        return -1;

    i = len-1;
    /* first skip over EOL */
    while ((i > 0) && ((str[i]=='\r') || (str[i]=='\n')))
        i--;
    /* then over the line itself */
    while ((i > 0) && !((str[i]=='\r') || (str[i]=='\n')))
        i--;
    if (!((str[i]=='\r') || (str[i]=='\n')))
        return -1; /* didn't find a line */
    return i+1;
}
1234
1235 static int
1236 pdf_scan_find_xref(PDFSCAN *ps)
1237 {
1238 char buf[4096];
1239 int i, j;
1240 int code = -1;
1241 int count;
1242 pdf_scan_seek(ps, 0, PDFSEEK_END);
1243 count = min((int)sizeof(buf), ps->offset);
1244 pdf_scan_seek(ps, -count, PDFSEEK_CUR);
1245 count = (int)fread(buf, 1, sizeof(buf), ps->file);
1246 pdf_scan_seek(ps, 0, PDFSEEK_SET);
1247 if (count == 0)
1248 return -1;
1249 i = count - 5;
1250 while (i > 0) {
1251 /* Find %%EOF */
1252 if (memcmp(buf+i, "%%EOF", 5) == 0) {
1253 code = 0;
1254 break;
1255 }
1256 i--;
1257 }
1258 if (i == 0) {
1259 pdf_scan_msgf(ps, "Failed to find %%EOF\n");
1260 code = -1;
1261 }
1262 if (code == 0) {
1263 /* Look for xref table offset */
1264 j = previous_line(buf, i);
1265 if (j >= 0)
1266 ps->xref_offset = atol(buf+j);
1267 else
1268 code = -1;
1269 i = j;
1270 if (ps->xref_offset == 0)
1271 code = -1;
1272 if (code != 0)
1273 pdf_scan_msgf(ps, "Failed to find cross reference table\n");
1274 }
1275
1276 if (code == 0) {
1277 /* Look for "startxref" */
1278 j = previous_line(buf, i);
1279 if (j >= 0) {
1280 if (memcmp(buf+j, "startxref", 9) != 0)
1281 code = -1;
1282 }
1283 else {
1284 code = -1;
1285 }
1286 if (code != 0)
1287 pdf_scan_msgf(ps, "Failed to find startxref\n");
1288 }
1289 return code;
1290 }
1291
1292 /* Read a cross reference table */
1293 /* This is called for each cross reference table */
1294 static int
1295 pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset)
1296 {
1297 int code;
1298 int i;
1299 int first = 0;
1300 int count = 0;
1301 unsigned long prev = 0;
1302 unsigned long offset = 0;
1303 int generation = 0;
1304 BOOL used = FALSE;
1305 pdf_scan_seek(ps, xref_offset, PDFSEEK_SET);
1306 code = pdf_scan_next_token(ps);
1307 if (code == 0)
1308 code = op_check(ps, "xref");
1309 while (code == 0) {
1310 code = pdf_scan_next_token(ps);
1311 if ((code == 0) && is_optoken(ps, "trailer"))
1312 break; /* finished this xref table */
1313 if (code == 0) {
1314 first = atoi(ps->buf + ps->begin);
1315 code = pdf_scan_next_token(ps);
1316 }
1317 if (code == 0) {
1318 count = atoi(ps->buf + ps->begin);
1319 }
1320 if (code == 0) {
1321 /* make sure there is enough space in the table */
1322 if (first + count > ps->xref_len) {
1323 int len = (first + count) * sizeof(PDFXREF);
1324 PDFXREF *newxref = (PDFXREF *)malloc(len);
1325 if (newxref) {
1326 memset(newxref, 0, len);
1327 memcpy(newxref, ps->xref, ps->xref_len * sizeof(PDFXREF));
1328 free(ps->xref);
1329 ps->xref = newxref;
1330 ps->xref_len = first + count;
1331 }
1332 else {
1333 pdf_scan_msgf(ps, "pdf_scan_read_xref: out of memory\n");
1334 code = -2;
1335 break;
1336 }
1337 }
1338 }
1339 for (i=first; i<first+count; i++) {
1340 code = pdf_scan_next_token(ps);
1341 if (code == 0) {
1342 offset = atol(ps->buf+ps->begin);
1343 code = pdf_scan_next_token(ps);
1344 }
1345 if (code == 0) {
1346 generation = atoi(ps->buf+ps->begin);
1347 code = pdf_scan_next_token(ps);
1348 }
1349 if (code == 0) {
1350 if (is_optoken(ps, "n"))
1351 used = TRUE;
1352 else if (is_optoken(ps, "f"))
1353 used = FALSE;
1354 else
1355 code = -1;
1356 }
1357 /* We don't deal correctly with generation.
1358 * We assume that the first xref table that marks an
1359 * object as used is the definitive reference.
1360 */
1361 if (code == 0) {
1362 if (!(ps->xref[i].used)) {
1363 ps->xref[i].offset = offset;
1364 ps->xref[i].generation = generation;
1365 ps->xref[i].used = used;
1366 }
1367 }
1368 }
1369 }
1370
1371 if (code == 0) {
1372 code = pdf_scan_read_trailer(ps, &prev);
1373 if ((code == 0) && prev && prev != ps->xref_offset) {
1374 /* read older xref and trailer */
1375 code = pdf_scan_read_xref(ps, prev);
1376 }
1377 }
1378
1379 return code;
1380 }
1381
1382 /* Read a trailer */
1383 static int
1384 pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev)
1385 {
1386 int code = 0;
1387 ref p;
1388 code = pdf_scan_next_token(ps);
1389 if ((code == 0) && (ps->token_type != marktype))
1390 code = -1;
1391 push_token(ps);
1392 while (code == 0) {
1393 code = pdf_scan_next_token(ps);
1394 if (code != 0)
1395 break;
1396 if (is_optoken(ps, "startxref")) {
1397 if (ps->root == 0) {
1398 p = dict_get(ps, "Root");
1399 if (p.type == objtype)
1400 ps->root = p.value.objval;
1401 else {
1402 pdf_scan_msgf(ps,
1403 "trailer /Root requires indirect reference\n");
1404 code = -1;
1405 }
1406 }
1407 p = dict_get(ps, "Prev");
1408 if (p.type == integertype)
1409 *prev = p.value.intval;
1410 else if (p.type != invalidtype) {
1411 code = -1;
1412 pdf_scan_msgf(ps, "trailer /Prev requires integer\n");
1413 }
1414 break;
1415 }
1416 if (process_op(ps) != 0)
1417 push_token(ps);
1418 }
1419 if (code != 0)
1420 pdf_scan_msgf(ps, "Error reading trailer\n");
1421 return code;
1422 }
1423
1424
1425 static int pdf_scan_read_object_start(PDFSCAN *ps, int objnum)
1426 {
1427 int code = 0;
1428 int value = 0;
1429 if (objnum == 0) {
1430 pdf_scan_msgf(ps, "Object 0 is always unused\n");
1431 return -1;
1432 }
1433 if (objnum >= ps->xref_len) {
1434 pdf_scan_msgf(ps, "Object reference %d doesn't exist. There are only %d objects\n", objnum, ps->xref_len);
1435 return -1;
1436 }
1437 if (!ps->xref[objnum].used) {
1438 pdf_scan_msgf(ps, "Object %d is unused\n", objnum);
1439 return -1;
1440 }
1441 pdf_scan_seek(ps, ps->xref[objnum].offset, PDFSEEK_SET);
1442
1443 code = pdf_scan_next_token(ps); /* object number */
1444 if (code == 0)
1445 code = type_check(ps, integertype);
1446 if (code == 0) {
1447 value = atoi(ps->buf+ps->begin); /* object number */
1448 code = pdf_scan_next_token(ps); /* generation */
1449 }
1450 if (code == 0)
1451 code = type_check(ps, integertype);
1452 if (code == 0)
1453 code = pdf_scan_next_token(ps); /* obj */
1454 if (code == 0)
1455 code = op_check(ps, "obj");
1456
1457 if (value != objnum) {
1458 pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
1459 return -1;
1460 }
1461 return code;
1462 }
1463
1464 /*****************************************************************/
1465
1466 /* Read an object, and leave it on the stack */
1467 static int
1468 pdf_scan_read_object(PDFSCAN *ps, int objnum)
1469 {
1470 int code;
1471 ref objref = obj_find(ps, objnum);
1472
1473 if (objref.type != invalidtype) {
1474 /* found in cache */
1475 push_stack(ps, objref);
1476 return 0;
1477 }
1478
1479 code = pdf_scan_read_object_start(ps, objnum);
1480 if (code) {
1481 pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
1482 return -1;
1483 }
1484
1485 code = pdf_scan_next_token(ps);
1486 if ((code == 0) && (ps->token_type != marktype))
1487 code = -1;
1488 push_token(ps);
1489 while (code == 0) {
1490 code = pdf_scan_next_token(ps);
1491 if (code != 0)
1492 break;
1493 if (is_optoken(ps, "endobj")) {
1494 obj_add(ps, objnum, top_stack(ps));
1495 break;
1496 }
1497 if (process_op(ps) != 0)
1498 push_token(ps);
1499 }
1500 return code;
1501 }
1502
1503 /*****************************************************************/
1504
1505 /* find the object number for a page */
1506 /* Return <= 0 if failure, or object number */
1507 /* First page is 0 */
1508 static int pdf_scan_find_page(PDFSCAN *ps, int pagenum)
1509 {
1510 int code;
1511 ref kids;
1512 ref r;
1513 int pageobj = 0;
1514 int count_base = 0;
1515 int count;
1516 ref *pref;
1517 int i;
1518 int inext;
1519
1520 if (pagenum >= ps->page_count) {
1521 pdf_scan_msgf(ps, "Not that many pages\n");
1522 return -1;
1523 }
1524 code = pdf_scan_read_object(ps, ps->pages);
1525 if (code) {
1526 pdf_scan_msgf(ps, "Didn't find Pages object\n");
1527 return -1;
1528 }
1529 /* iterate through Kids, looking for the one that includes this page */
1530 kids = dict_get(ps, "Kids");
1531 if (kids.type != arraytype) {
1532 pdf_scan_msgf(ps, "/Pages object %d must contain /Kids array\n",
1533 ps->pages);
1534 return -1;
1535 }
1536 pop_stack(ps); /* First Pages */
1537 for (i = 0; (i < kids.rsize) && (code == 0); i=inext) {
1538 inext = i+1;
1539 pref = &kids.value.arrayval[i];
1540 if (pref->type == objtype)
1541 code = pdf_scan_read_object(ps, pref->value.objval);
1542 if (code == 0) {
1543 r = dict_get(ps, "Type");
1544 if (nameref_equals(&r, "Page")) {
1545 if (count_base + i == pagenum) {
1546 /* this is it */
1547 pageobj = pref->value.objval;
1548 pop_stack(ps); /* the wanted page */
1549 break;
1550 }
1551 }
1552 else if (nameref_equals(&r, "Pages")) {
1553 r = dict_get(ps, "Count");
1554 if (r.type == integertype) {
1555 count = r.value.intval;
1556 if (pagenum < count_base + count) {
1557 /* It's under this child */
1558 inext = 0;
1559 pop_stack(ps); /* The old /Pages */
1560 code = pdf_scan_read_object(ps, pref->value.objval);
1561 if (code == 0) {
1562 kids = dict_get(ps, "Kids");
1563 if (kids.type != arraytype) {
1564 pdf_scan_msgf(ps,
1565 "/Pages object %d must contain /Kids array\n",
1566 pref->value.objval);
1567 code = -1;
1568 }
1569 }
1570 }
1571 else {
1572 count_base += count;
1573 }
1574 }
1575 else {
1576 pdf_scan_msgf(ps, "/Pages /Count must be integer\n");
1577 code = -1;
1578 }
1579 }
1580 else {
1581 pdf_scan_msgf(ps,
1582 "pdf_scan_find_page: object %d isn't Pages or Page\n",
1583 pref->value.objval);
1584 code = -1;
1585 }
1586 pop_stack(ps);
1587 }
1588 }
1589
1590 if (pageobj <= 0) {
1591 pdf_scan_msgf(ps, "Failed to find page %d\n", pagenum+1);
1592 code = -1;
1593 }
1594
1595 if (code)
1596 return -1;
1597
1598 /* Don't clean up, since we will use the cached objects
1599 * when extracting the page media.
1600 */
1601
1602 return pageobj;
1603 }
1604
1605
1606 static int
1607 pdf_scan_read_page_count(PDFSCAN *ps)
1608 {
1609 int code;
1610 ref p;
1611 code = pdf_scan_read_object(ps, ps->pages);
1612 if (code) {
1613 pdf_scan_msgf(ps, "Didn't find Pages object\n");
1614 return -1;
1615 }
1616
1617 p = dict_get(ps, "Type");
1618 if (!nameref_equals(&p, "Pages")) {
1619 pdf_scan_msgf(ps, "Pages object didn't have /Type /Pages\n");
1620 return -1;
1621 }
1622 p = dict_get(ps, "Count");
1623 if (p.type != integertype) {
1624 pdf_scan_msgf(ps, "Pages object didn't integer /Count\n");
1625 return -1;
1626 }
1627 ps->page_count = p.value.intval;
1628
1629 return code;
1630 }
1631
1632 static int convert_float(ref r, float *f)
1633 {
1634 if (r.type == realtype)
1635 *f = r.value.realval;
1636 else if (r.type == integertype)
1637 *f = (float)r.value.intval;
1638 else
1639 return -1;
1640 return 0;
1641 }
1642
1643 static int
1644 pdf_scan_read_bbox(PDFBBOX *box, ref array)
1645 {
1646 int code = 0;
1647 if (array.type != arraytype)
1648 code = -1;
1649 if (array.rsize != 4)
1650 code = -1;
1651 if (code == 0)
1652 code = convert_float(array.value.arrayval[0], &box->llx);
1653 if (code == 0)
1654 code = convert_float(array.value.arrayval[1], &box->lly);
1655 if (code == 0)
1656 code = convert_float(array.value.arrayval[2], &box->urx);
1657 if (code == 0)
1658 code = convert_float(array.value.arrayval[3], &box->ury);
1659 return code;
1660 }
1661
1662 /* Read catalog and leave on stack */
1663 static int
1664 pdf_scan_read_catalog(PDFSCAN *ps)
1665 {
1666 int code;
1667 ref p;
1668 /* Read root object, making sure it is /Type /Catalog,
1669 * and that /Pages is an indirect reference
1670 */
1671 code = pdf_scan_read_object(ps, ps->root);
1672 if (code) {
1673 pdf_scan_msgf(ps, "Didn't find Root object\n");
1674 return -1;
1675 }
1676
1677 p = dict_get(ps, "Type");
1678 if (!nameref_equals(&p, "Catalog")) {
1679 pdf_scan_msgf(ps, "Root object didn't have /Type /Catalog\n");
1680 return -1;
1681 }
1682 p = dict_get(ps, "Pages");
1683 if (p.type != objtype) {
1684 pdf_scan_msgf(ps, "Root object didn't indirect reference to /Pages\n");
1685 return -1;
1686 }
1687 ps->pages = p.value.intval;
1688 return 0;
1689 }
1690
1691 /*****************************************************************/
1692 /* public functions */
1693
1694
1695 void
1696 pdf_scan_close(PDFSCAN *ps)
1697 {
1698 pdf_scan_cleanup(ps);
1699 pdf_scan_finish(ps);
1700 free(ps);
1701 }
1702
1703
1704 PDFSCAN *
1705 pdf_scan_open(const TCHAR *filename, void *handle,
1706 int (*fn)(void *handle, const char *ptr, int len))
1707 {
1708 int code;
1709 int rotate;
1710 PDFBBOX mediabox, cropbox;
1711 PDFSCAN *ps = (PDFSCAN *)malloc(sizeof(PDFSCAN));
1712 if (ps == NULL)
1713 return NULL;
1714 memset(ps, 0, sizeof(PDFSCAN));
1715 ps->handle = handle;
1716 ps->print_fn = fn;
1717 code = pdf_scan_init(ps, filename);
1718 if (code == -1)
1719 pdf_scan_msgf(ps, "Couldn't open PDF file\n");
1720 else if (code != 0)
1721 pdf_scan_msgf(ps, "Error initialising PDF scanner\n");
1722
1723 if (code == 0)
1724 code = pdf_scan_find_xref(ps);
1725 if (code == 0)
1726 code = pdf_scan_read_xref(ps, ps->xref_offset);
1727 if (code == 0)
1728 code = pdf_scan_read_catalog(ps);
1729 if (code == 0)
1730 code = pdf_scan_read_page_count(ps);
1731 if (code == 0)
1732 code = pdf_scan_page_media(ps, 0, &rotate, &mediabox, &cropbox);
1733
1734 pdf_scan_cleanup(ps);
1735 if (code != 0) {
1736 pdf_scan_close(ps);
1737 ps = NULL;
1738 }
1739 return ps;
1740 }
1741
1742 int
1743 pdf_scan_page_count(PDFSCAN *ps)
1744 {
1745 if (ps == NULL)
1746 return 0;
1747 return ps->page_count;
1748 }
1749
1750 int
1751 pdf_scan_page_media(PDFSCAN *ps, int pagenum, int *rotate,
1752 PDFBBOX *mediabox, PDFBBOX *cropbox)
1753 {
1754 BOOL found_rotate = FALSE;
1755 BOOL found_mediabox = FALSE;
1756 BOOL found_cropbox = FALSE;
1757 BOOL has_parent = TRUE;
1758 ref p, objref;
1759 int objnum;
1760
1761 if (ps == NULL)
1762 return -1;
1763
1764 if (pagenum == ps->pagenum) {
1765 /* Used cached values */
1766 *rotate = ps->rotate;
1767 *mediabox = ps->mediabox;
1768 *cropbox = ps->cropbox;
1769 return 0;
1770 }
1771
1772 if (ps->file == NULL) {
1773 if (pdf_scan_open_file(ps) != 0)
1774 return -1;
1775 }
1776 objnum = pdf_scan_find_page(ps, pagenum);
1777 if (objnum <= 0) {
1778 pdf_scan_cleanup(ps);
1779 return -1;
1780 }
1781 if (pdf_scan_read_object(ps, objnum) < 0) {
1782 pdf_scan_cleanup(ps);
1783 return -1;
1784 }
1785
1786 while (has_parent) {
1787 if (!found_rotate) {
1788 p = dict_get(ps, "Rotate");
1789 if (p.type == integertype) {
1790 *rotate = p.value.intval;
1791 found_rotate = TRUE;
1792 }
1793 }
1794 if (!found_mediabox) {
1795 p = dict_get(ps, "MediaBox");
1796 if (pdf_scan_read_bbox(mediabox, p) == 0)
1797 found_mediabox = TRUE;
1798 }
1799 if (!found_cropbox) {
1800 p = dict_get(ps, "CropBox");
1801 if (pdf_scan_read_bbox(cropbox, p) == 0)
1802 found_cropbox = TRUE;
1803 }
1804 if (found_rotate && found_mediabox && found_cropbox)
1805 break;
1806
1807 p = dict_get(ps, "Parent");
1808 if (p.type == objtype) {
1809 objref = pop_stack(ps);
1810 if (pdf_scan_read_object(ps, p.value.objval) < 0) {
1811 push_stack(ps, objref);
1812 has_parent = FALSE;
1813 }
1814 }
1815 else
1816 has_parent = FALSE;
1817 }
1818 pop_stack(ps);
1819 if (!found_cropbox) {
1820 *cropbox = *mediabox;
1821 found_cropbox = TRUE;
1822 }
1823 if (!found_rotate) {
1824 *rotate = 0;
1825 found_rotate = TRUE;
1826 }
1827
1828 pdf_scan_cleanup(ps);
1829
1830 if (found_rotate && found_mediabox && found_cropbox) {
1831 /* cache these values */
1832 ps->pagenum = pagenum;
1833 ps->rotate = *rotate;
1834 ps->mediabox = *mediabox;
1835 ps->cropbox = *cropbox;
1836 return 0;
1837 }
1838
1839 return -1;
1840 }
1841
1842 /*****************************************************************/
1843
1844 #ifdef DEMO_PDFSCAN
1845
/* Message callback for the demo program: write len bytes from ptr
 * to stdout.  handle is not used.  Always returns len.
 */
int test_print_fn(void *handle, const char *ptr, int len)
{
    size_t nbytes = (size_t)len;

    (void)handle;
    (void)fwrite(ptr, sizeof(char), nbytes, stdout);
    return len;
}
1851
1852 int main(int argc, char *argv[])
1853 {
1854 PDFSCAN *ps;
1855 int i, count;
1856 int code;
1857 PDFBBOX mediabox, cropbox;
1858 int rotate;
1859
1860 if (argc < 2) {
1861 fprintf(stdout, "Usage: cpdfscan filename\n");
1862 return 1;
1863 }
1864
1865 ps = pdf_scan_open(argv[1], NULL, test_print_fn);
1866 if (ps) {
1867 count = pdf_scan_page_count(ps);
1868 pdf_scan_msgf(ps, "Page count is %d\n", count);
1869 for (i=0; i<count; i++) {
1870 code = pdf_scan_page_media(ps, i, &rotate, &mediabox, &cropbox);
1871 if (code == 0) {
1872 fprintf(stdout, "Page %d /Rotate %d ", i+1, rotate);
1873 fprintf(stdout, "/MediaBox [%g %g %g %g] /CropBox [%g %g %g %g]\n",
1874 mediabox.llx, mediabox.lly, mediabox.urx, mediabox.ury,
1875 cropbox.llx, cropbox.lly, cropbox.urx, cropbox.ury);
1876 }
1877 else
1878 fprintf(stdout, "Page %d media unknown\n", i+1);
1879 }
1880 pdf_scan_close(ps);
1881 }
1882 return 0;
1883 }
1884
1885 #endif