"Fossies" - the Fresh Open Source Software Archive

Member "epstool-3.08/src/cpdfscan.c" (10 Jun 2005, 43086 Bytes) of package /linux/misc/old/ghost/ghostgum/epstool-3.08-os2.zip:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /* Copyright (C) 2002-2005 Ghostgum Software Pty Ltd.  All rights reserved.
    2 
    3   This software is provided AS-IS with no warranty, either express or
    4   implied.
    5 
    6   This software is distributed under licence and may not be copied,
    7   modified or distributed except as expressly authorised under the terms
    8   of the licence contained in the file LICENCE in this distribution.
    9 
   10   For more information about licensing, please refer to
   11   http://www.ghostgum.com.au/ or contact Ghostgum Software Pty Ltd, 
   12   218 Gallaghers Rd, Glen Waverley VIC 3150, AUSTRALIA, 
   13   Fax +61 3 9886 6616.
   14 */
   15 
   16 /* $Id: cpdfscan.c,v 1.7 2005/06/10 09:39:24 ghostgum Exp $ */
   17 /* PDF scanner */
   18 
   19 /* This is a rudimentary PDF scanner, intended to get
   20  * the page count, and for each page the Rotate, MediaBox 
   21  * and CropBox.
   22  */
   23 
   24 #ifdef DEMO_PDFSCAN
   25 # include <windows.h>
   26 # include <stdio.h>
   27 # include <stdarg.h>
   28 # include <string.h>
   29 # include <ctype.h>
   30 # ifdef _MSC_VER
   31 #  define vsnprintf _vsnprintf
   32 # endif
   33 # define csfopen fopen
   34 # define cslen strlen
   35 #else
   36 # include "common.h"
   37 # include <ctype.h>
   38 #endif
   39 
   40 #include "cpdfscan.h"
   41 
   42 
   43 /* Limitations.
   44  * 
   45  * We currently load the entire xref table.  To minimise memory
   46  * we could instead keep a list of xref blocks, and do random
   47  * access within those.
   48  *
   49  * Memory management is very simple.  We just keep a linked
   50  * list of allocated blocks for composite objects.
   51  * We empty the stack, and free all PDF objects and composite
   52  * objects before returning to the caller.
   53  * We don't bother doing garbage collection.
   54  */
   55 
   56 
   57 /* We keep a linked list of memory allocated for composite objects 
   58  * such as name, string, array or dict.
   59  */
   60 typedef struct PDFMEM_s PDFMEM;
   61 struct PDFMEM_s {
   62     void *ptr;
   63     int len;
   64     PDFMEM *next;
   65 };
   66  
   67 /* The token scanner and object references understand the following types */
   68 typedef enum rtype_e {
   69     invalidtype=0,
   70     marktype=1,
   71     nulltype=2,
   72     booltype=3,     /* uses boolval */
   73     integertype=4,  /* uses intval */
   74     realtype=5,     /* uses realval */
   75     nametype=6,     /* uses nameval */
   76     stringtype=7,   /* uses strval */
   77     arraytype=8,    /* uses arrayval */
   78     dicttype=9,     /* uses dictval */
   79     optype=10,      /* uses opval */
   80     streamtype=11,  /* uses streamval */
   81     objtype=12,     /* uses objval */
   82     commenttype=13
   83 } rtype;
   84 
   85 const char *rtype_string[] = {
   86     "invalidtype", "marktype", "nulltype", "booltype", "integertype",
   87     "realtype", "nametype", "stringtype", "arraytype", "dicttype",
   88     "optype", "streamtype", "objtype", "commenttype"
   89 };
   90 
   91 /* A reference contains a simple object, or a pointer to 
   92  * a composite object.
   93  */
   94 typedef struct ref_s ref;
   95 struct ref_s {
   96     rtype type;
   97     int rsize;
   98     union value_u {
   99     /* simple */
  100         void *voidval;
  101     BOOL boolval;
  102     int intval;
  103     float realval;
  104     /* composite */
  105     char *nameval;
  106     char *strval;
  107     ref *arrayval;
  108     ref *dictval;
  109     char *opval;
  110     /* simple */
  111     unsigned long streamval;
  112     int objval;
  113     } value;
  114 };
  115 
  116 /* Cross reference table entry */
  117 typedef struct PDFXREF_s {
  118     unsigned long offset;
  119     int generation;
  120     BOOL used;
  121 } PDFXREF;
  122 
  123 struct PDFSCAN_s {
  124     void *handle;
  125     int (*print_fn)(void *handle, const char *ptr, int len);
  126     TCHAR filename[1024];
  127     FILE *file;
  128     char *buf;
  129     int buflen;     /* length of allocated buf */
  130     int len;        /* #bytes currently in buf */
  131     int offset;     /* file offset to start of buf */
  132     int begin;      /* offset in buf to start of token */
  133     int end;        /* offset in buf to end of token */
  134     rtype token_type;   /* token type */
  135     BOOL instream;  /* In a stream, looking for endstream */
  136     unsigned long xref_offset;  /* offset to xref table */
  137     PDFXREF *xref;
  138     int xref_len;
  139 
  140     /* Object numbers obtained during pdf_scan_open() */
  141     int root;       /* root object reference */
  142     int info;       /* document info dicionary reference */
  143     int pages;      /* Pages dictionary reference */
  144     int page_count; /* number of pages */
  145 
  146     /* Cached page media */
  147     int pagenum;
  148     int rotate;
  149     PDFBBOX mediabox;
  150     PDFBBOX cropbox;
  151 
  152     /* memory allocation */
  153     PDFMEM *memory_head;
  154     PDFMEM *memory_tail;
  155 
  156     /* operand stack */
  157     ref *ostack;
  158     int ostack_idx; /* index to top of ostack */
  159     int ostack_len; /* Initially 512 */
  160     int ostack_maxlen;  /* maximum depth of ostack */
  161 
  162     /* objects in memory */
  163     /* This contains pairs of integer & reference */
  164     ref *objs;
  165     int objs_count; /* count of loaded objects */
  166     int objs_len;   /* length of objs */
  167     int objs_maxlen;    /* maximum number entries in objs */
  168 };
  169 
  170 typedef enum PDFSEEK_e {
  171     PDFSEEK_CUR,
  172     PDFSEEK_END,
  173     PDFSEEK_SET
  174 } PDFSEEK;
  175 
  176 
  177 /* Prototypes */
  178 static int pdf_scan_next_token(PDFSCAN *ps);
  179 static int pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev);
  180 static int pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset);
  181 
  182 static void clear_stack(PDFSCAN *ps);
  183 static void clear_objs(PDFSCAN *ps);
  184 static void pdf_scan_freeall(PDFSCAN *ps);
  185 static void pdf_scan_cleanup(PDFSCAN *ps);
  186 static int pdf_scan_open_file(PDFSCAN *ps);
  187 
  188 
  189 /*****************************************************************/
  190 /* text message output */
  191 
  192 static int
  193 pdf_scan_write(PDFSCAN *ps, const char *str, int len)
  194 {
  195     if (ps != NULL)
  196         fwrite(str, 1, len, stdout);
  197     else
  198     (*ps->print_fn)(ps->handle, str, len);
  199     return len;
  200 }
  201 
  202 static int
  203 pdf_scan_msgf(PDFSCAN *ps, const char *fmt, ...)
  204 {
  205 va_list args;
  206 int count;
  207 char buf[2048];
  208     va_start(args,fmt);
  209     count = vsnprintf(buf, sizeof(buf), fmt, args);
  210     pdf_scan_write(ps, buf, count);
  211     va_end(args);
  212     return count;
  213 }
  214 
  215 /*****************************************************************/
  216 /* memory allocation */
  217 
  218 static void
  219 pdf_scan_cleanup(PDFSCAN *ps)
  220 {
  221     if (ps->file)
  222     fclose(ps->file);
  223     ps->file = NULL;
  224     clear_stack(ps);
  225     clear_objs(ps);
  226     pdf_scan_freeall(ps);
  227 }
  228 
  229 static void *pdf_scan_alloc(PDFSCAN *ps, const void *ptr, int len)
  230 {
  231     void *data;
  232     PDFMEM *mem = (PDFMEM *)malloc(sizeof(PDFMEM));
  233     if (mem == NULL)
  234     return NULL;
  235 
  236     data = malloc(len);
  237     if (data == NULL) {
  238     free(mem);
  239     return NULL;
  240     }
  241 
  242     mem->ptr = data;
  243     mem->next = NULL;
  244     mem->len = len;
  245     memcpy(data, ptr, len);
  246 
  247     if (ps->memory_tail) {
  248     ps->memory_tail->next = mem;
  249     ps->memory_tail = mem;
  250     }
  251     else
  252     ps->memory_head = ps->memory_tail = mem;
  253     return data;
  254 }
  255 
  256 /* free all name/string/array/dict memory */
  257 static void
  258 pdf_scan_freeall(PDFSCAN *ps)
  259 {
  260     PDFMEM *memnext;
  261     PDFMEM *mem = ps->memory_head;
  262     while (mem) {
  263     memnext = mem->next;
  264     free(mem->ptr);
  265     free(mem);
  266     mem = memnext;
  267     }
  268     ps->memory_head = ps->memory_tail = NULL;
  269 }
  270 
  271 /*****************************************************************/
  272 /* Token checks */
  273 
  274 static BOOL is_optoken(PDFSCAN *ps, const char *str)
  275 {
  276     return (ps->token_type == optype) && 
  277     (ps->end-ps->begin == (int)strlen(str)) && 
  278     (memcmp(ps->buf+ps->begin, str, ps->end-ps->begin) == 0);
  279 }
  280 
  281 static int
  282 type_check(PDFSCAN *ps, rtype type)
  283 {
  284     if (ps->token_type == type)
  285     return 0;
  286 
  287     pdf_scan_msgf(ps, "Error at offset %ld.  Expecting %s and found %s\n",
  288     ps->offset + ps->begin, 
  289     rtype_string[(int)type],
  290     rtype_string[(int)ps->token_type]);
  291     pdf_scan_msgf(ps, "Token is \042");
  292     pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  293     pdf_scan_msgf(ps, "\042\n");
  294     return -1;
  295 }
  296 
  297 static int
  298 op_check(PDFSCAN *ps, const char *str)
  299 {
  300     int code = type_check(ps, optype);
  301     if (code)
  302     return code;
  303 
  304     if (!is_optoken(ps, str)) {
  305     pdf_scan_msgf(ps, 
  306         "Error at offset %ld.  Expecting \042%s\042 and found \042",
  307         ps->offset + ps->begin, str); 
  308     pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  309     pdf_scan_msgf(ps, "\042\n");
  310     code = -1;
  311     }
  312     return code;
  313 }
  314 
  315 /*****************************************************************/
  316 /* stack */
  317 
  318 const ref invalidref = {invalidtype, 0, {NULL}};
  319 const ref markref = {marktype, 0, {NULL}};
  320 
  321 /* Push item, return depth of stack */
  322 /* >0 is success, <=0 is failure */
  323 static int push_stack(PDFSCAN *ps, ref r)
  324 {
  325     int idx;
  326     if (ps->ostack_idx + 1 >= ps->ostack_len) {
  327     /* increase stack size */
  328     ref *newstack;
  329     int newlen = ps->ostack_len + 256;
  330     if (newlen > ps->ostack_maxlen) {
  331         pdf_scan_msgf(ps, "push_stack: stack overflow\n");
  332         return 0;
  333     }
  334     newstack = (ref *)malloc(newlen * sizeof(ref));
  335     if (newstack == NULL) {
  336         pdf_scan_msgf(ps, "push_stack: Out of memory\n");
  337         return 0;
  338     }
  339     memcpy(newstack, ps->ostack, ps->ostack_len * sizeof(ref));
  340     free(ps->ostack);
  341     ps->ostack = newstack;
  342     ps->ostack_len = newlen;
  343     }
  344     idx = ++(ps->ostack_idx);
  345     ps->ostack[idx] = r;
  346     return idx;
  347 }
  348 
  349 static ref pop_stack(PDFSCAN *ps)
  350 {
  351     if (ps->ostack_idx <= 0) {
  352     pdf_scan_msgf(ps, "pop_stack: stack underflow\n");
  353     return invalidref;
  354     }
  355     return ps->ostack[ps->ostack_idx--];
  356 }
  357 
  358 static void clear_stack(PDFSCAN *ps)
  359 {
  360     ps->ostack_idx = 0;
  361 }
  362 
  363 static ref index_stack(PDFSCAN *ps, int n)
  364 {
  365     if (n < 0) {
  366     pdf_scan_msgf(ps, "index_stack: index must not be negative\n");
  367     return invalidref;
  368     }
  369     if (ps->ostack_idx <= n) {
  370     pdf_scan_msgf(ps, "index_stack: stack isn't that deep\n");
  371     return invalidref;
  372     }
  373     return ps->ostack[ps->ostack_idx-n];
  374 }
  375 
  376 static ref top_stack(PDFSCAN *ps)
  377 {
  378     if (ps->ostack_idx <= 0) {
  379     pdf_scan_msgf(ps, "top_stack: stack is empty\n");
  380     return invalidref;
  381     }
  382     return ps->ostack[ps->ostack_idx];
  383 }
  384 
  385 /*****************************************************************/
  386 /* references */
  387 
  388 
  389 static ref make_int(int value)
  390 {
  391     ref r;
  392     r.type = integertype;
  393     r.rsize = 0;
  394     r.value.intval = value;
  395     return r;
  396 }
  397 
  398 static ref make_string(PDFSCAN *ps, const char *str, int len)
  399 {
  400     ref r;
  401     r.type = stringtype;
  402     r.rsize = len;
  403     r.value.strval = pdf_scan_alloc(ps, str, len);
  404     if (r.value.strval == NULL)
  405     return invalidref;
  406     return r;
  407 }
  408 
  409 static ref make_name(PDFSCAN *ps, const char *str, int len)
  410 {
  411     ref r;
  412     r.type = nametype;
  413     r.rsize = len;
  414     r.value.nameval = pdf_scan_alloc(ps, str, len);
  415     if (r.value.nameval == NULL)
  416     return invalidref;
  417     return r;
  418 }
  419 
  420 static BOOL nameref_equals(ref *r, const char *name)
  421 {
  422     int len = (int)strlen(name);
  423     if (r->type != nametype)
  424     return FALSE;
  425     if (r->rsize != len)
  426     return FALSE;
  427     return (memcmp(r->value.nameval, name, len) == 0);
  428 }
  429 
  430 /* Get a reference from a dictionary */
  431 /* Return the result, but don't push it */
  432 static ref dict_get(PDFSCAN *ps, const char *name)
  433 {
  434     int namelen = (int)strlen(name);
  435     ref dict = top_stack(ps);
  436     ref *r;
  437     int dictlen;
  438     int i;
  439     if (dict.type == invalidtype)
  440     return invalidref;
  441     dictlen = dict.rsize * 2;
  442     for (i = 0; i<dictlen; i+=2) {
  443     r = &dict.value.dictval[i];
  444     if ((r->rsize == namelen) && (r->type == nametype) &&
  445         (memcmp(r->value.nameval, name, namelen) ==0))
  446         return dict.value.dictval[i+1];
  447     }
  448     return invalidref;
  449 }
  450 
  451 /* convert the items on the stack to an array on the stack */
  452 static ref array_to_mark(PDFSCAN *ps)
  453 {
  454     ref r;
  455     ref *array;
  456     int n = ps->ostack_idx;
  457     int len;
  458     while ((n>0) && (ps->ostack[n].type != marktype))
  459     n--;
  460     if (n == 0) {
  461     pdf_scan_msgf(ps, "array_to_mark: no mark on stack\n");
  462     return invalidref;
  463     }
  464     len = ps->ostack_idx - n;
  465     r.type = arraytype;
  466     r.rsize = len;
  467     r.value.arrayval = NULL;
  468     if (len) {
  469         array = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
  470     if (array)
  471         r.value.arrayval = array;
  472     else
  473         return invalidref;
  474     }
  475     ps->ostack_idx -= len + 1;
  476     push_stack(ps, r);
  477     return r;
  478 }
  479 
  480 /* convert the items on the stack to a dictionary on the stack */
  481 static ref dict_to_mark(PDFSCAN *ps)
  482 {
  483     ref r;
  484     ref *dict;
  485     int n = ps->ostack_idx;
  486     int len;
  487     while ((n>0) && (ps->ostack[n].type != marktype))
  488     n--;
  489     if (n == 0) {
  490     pdf_scan_msgf(ps, "dict_to_mark: no mark on stack\n");
  491     return invalidref;
  492     }
  493     len = ps->ostack_idx - n;
  494     if (len & 1) {
  495     pdf_scan_msgf(ps, "dict_to_mark: must have name/value pairs\n");
  496     return invalidref;
  497     }
  498     r.type = dicttype;
  499     r.rsize = len/2;
  500     r.value.arrayval = NULL;
  501     if (len) {
  502         dict = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
  503     if (dict)
  504         r.value.arrayval = dict;
  505     else
  506         return invalidref;
  507     }
  508     ps->ostack_idx -= len + 1;
  509     push_stack(ps, r);
  510     return r;
  511 }
  512 
  513 /*****************************************************************/
  514 
  515 /* Push reference from a token */
  516 static ref push_token(PDFSCAN *ps)
  517 {
  518     ref r;
  519     int len = ps->end - ps->begin;
  520     const char *p = ps->buf + ps->begin;
  521     r.type = ps->token_type;
  522     r.rsize = 0;
  523     r.value.voidval = NULL;
  524     switch(r.type) {
  525       case invalidtype:
  526     break;
  527       case marktype:
  528     break;
  529       case nulltype:
  530     break;
  531       case booltype:
  532     if ((len == 4) && (memcmp(p, "true", 4)==0))
  533         r.value.boolval = TRUE;
  534     else if ((len == 5) && (memcmp(p, "true", 5)==0))
  535         r.value.boolval = FALSE;
  536     else
  537         r = invalidref;
  538     break;
  539       case integertype:
  540     {   char buf[64];
  541         if (len > (int)sizeof(buf)-1)
  542         r = invalidref;
  543         else {
  544         memcpy(buf, p, len);
  545         buf[len] = '\0';
  546         r.value.intval = atoi(buf);
  547         }
  548     }
  549     break;
  550       case realtype:
  551     {   char buf[64];
  552         if (len > (int)sizeof(buf)-1)
  553         r = invalidref;
  554         else {
  555         memcpy(buf, p, len);
  556         buf[len] = '\0';
  557         r.value.realval = (float)atof(buf);
  558         }
  559     }
  560     break;
  561       case nametype:
  562     r = make_name(ps, p+1, len-1);
  563     break;
  564       case stringtype:
  565     r = make_string(ps, p, len);
  566     break;
  567       case streamtype:
  568       case commenttype:
  569       case objtype:
  570       case optype:
  571       case arraytype:
  572       case dicttype:
  573     /* Can't push these from a token */
  574     /* These are made by operators like stream, R, ], >> */
  575     return invalidref;
  576       default:
  577     r.type = invalidtype;
  578     break;
  579     }
  580     push_stack(ps, r);
  581     return r;
  582 }
  583 
  584 /* Process known operators */
  585 static int process_op(PDFSCAN *ps)
  586 {
  587    ref r;
  588    if (ps->token_type != optype)
  589     return 1;   /* not an op */
  590    if (is_optoken(ps, "R")) {
  591     /* convert "n 0 R" to an indirect reference */
  592     ref r1 = index_stack(ps, 1);
  593     r = top_stack(ps);
  594     if ((r.type == integertype) && (r1.type == integertype)) {
  595         r.type = objtype;
  596         r.rsize = r.value.intval;
  597         r.value.intval = r1.value.intval;
  598         pop_stack(ps);
  599         pop_stack(ps);
  600         push_stack(ps, r);
  601     }
  602    }
  603    else if (is_optoken(ps, "]")) {
  604     array_to_mark(ps);
  605    }
  606    else if (is_optoken(ps, ">>")) {
  607     dict_to_mark(ps);
  608    }
  609    else if (is_optoken(ps, "null")) {
  610     r.type = nulltype;
  611     r.rsize = 0;
  612     r.value.voidval = NULL;
  613     push_stack(ps, r);
  614    }
  615    else if (is_optoken(ps, "obj")) {
  616     pdf_scan_msgf(ps, "ignoring obj token\n");
  617     /* ignore */
  618    }
  619    else if (is_optoken(ps, "endobj")) {
  620     pdf_scan_msgf(ps, "ignoring endobj token\n");
  621     /* ignore */
  622    }
  623    else if (is_optoken(ps, "stream")) {
  624     /* stream object contains offset to start of stream */
  625     r.type = streamtype;
  626     r.rsize = 0;
  627     r.value.streamval = ps->offset + ps->end;
  628     push_stack(ps, r);
  629     /* Now skip over stream */
  630         pdf_scan_next_token(ps);
  631    }
  632    else {
  633     pdf_scan_msgf(ps, "process_op: unrecognised operator \042");
  634     pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  635     pdf_scan_msgf(ps, "\042\n");
  636     return -1;
  637    }
  638    return 0;
  639 }
  640 
  641 /*****************************************************************/
  642 /* Debugging and error messages */
  643 
  644 #ifdef NOTUSED
  645 
  646 /* Print a reference, returning number of characters written */
  647 static int
  648 print_ref(PDFSCAN *ps, ref *r)
  649 {
  650     int n = 0;
  651     switch(r->type) {
  652       case invalidtype:
  653     n = pdf_scan_msgf(ps, "--invalid--");
  654     break;
  655       case marktype:
  656     n = pdf_scan_msgf(ps, "--mark--");
  657     break;
  658       case nulltype:
  659     n = pdf_scan_msgf(ps, "--null--");
  660     break;
  661       case booltype:
  662     n = pdf_scan_msgf(ps, "%s", r->value.boolval ? "true" : "false");
  663     break;
  664       case integertype:
  665     n = pdf_scan_msgf(ps, "%d", r->value.intval);
  666     break;
  667       case realtype:
  668     n = pdf_scan_msgf(ps, "%g", r->value.realval);
  669     break;
  670       case nametype:
  671     n = pdf_scan_write(ps, "/", 1);
  672     pdf_scan_write(ps, r->value.nameval, r->rsize);
  673     break;
  674       case stringtype:
  675     n = pdf_scan_write(ps, "(", 1);
  676     n += pdf_scan_write(ps, r->value.strval, r->rsize);
  677     n += pdf_scan_write(ps, ")", 1);
  678     break;
  679       case streamtype:
  680     n = pdf_scan_msgf(ps, "--stream:%d--", r->value.streamval);
  681     break;
  682       case commenttype:
  683     n = pdf_scan_msgf(ps, "--comment--");
  684     break;
  685       case objtype:
  686     n = pdf_scan_msgf(ps, "--obj:%d--", r->value.objval);
  687     break;
  688       case optype:
  689     n = pdf_scan_msgf(ps, "--op:");
  690     n += pdf_scan_write(ps, r->value.opval, r->rsize);
  691     n += pdf_scan_write(ps, "--", 2);
  692     break;
  693       case arraytype:
  694     n = pdf_scan_msgf(ps, "--array:%d--", r->rsize);
  695     break;
  696       case dicttype:
  697     n = pdf_scan_msgf(ps, "--dict:%d--", r->rsize);
  698     break;
  699       default:
  700     n = pdf_scan_msgf(ps, "--unknown--");
  701     break;
  702     }
  703     return n;
  704 }
  705 
  706 /* print a reference, expanding array and dict */ 
  707 static int
  708 print_ref_expand(PDFSCAN *ps, ref *r)
  709 {
  710     int i;
  711     int n = 0;;
  712     if (r->type == arraytype) {
  713     n += pdf_scan_msgf(ps, "[ ");
  714     for (i=0; i<r->rsize; i++) {
  715         n += print_ref(ps, &r->value.arrayval[i]);
  716         n += pdf_scan_msgf(ps, " ");
  717     }
  718     n += pdf_scan_msgf(ps, "]");
  719     }
  720     else if (r->type == dicttype) {
  721     n += pdf_scan_msgf(ps, "<< ");
  722     for (i=0; i<r->rsize; i++) {
  723         n += print_ref(ps, &r->value.dictval[i+i]);
  724         n += pdf_scan_msgf(ps, " ");
  725         n += print_ref(ps, &r->value.dictval[i+i+1]);
  726         n += pdf_scan_msgf(ps, " ");
  727     }
  728     n += pdf_scan_msgf(ps, ">>");
  729     }
  730     else
  731     n += print_ref(ps, r);
  732     return n;
  733 }
  734 
  735 static void
  736 print_stack(PDFSCAN *ps)
  737 {
  738     int i, n=ps->ostack_idx;
  739     int col = 0;
  740     pdf_scan_msgf(ps, "Stack: ");
  741     for (i=1; i<=n; i++) {
  742     col += print_ref(ps, &ps->ostack[i]);
  743     if (col > 70) {
  744             pdf_scan_msgf(ps, "\n");
  745         col = 0;
  746     }
  747     else
  748             col += pdf_scan_msgf(ps, " ");
  749     }
  750     pdf_scan_msgf(ps, "\n");
  751 }
  752 
  753 static void
  754 print_stack_expand(PDFSCAN *ps)
  755 {
  756     int i, n=ps->ostack_idx;
  757     pdf_scan_msgf(ps, "Stack:\n");
  758     for (i=1; i<=n; i++) {
  759         pdf_scan_msgf(ps, "%2d: ", i);
  760     print_ref_expand(ps, &ps->ostack[i]);
  761         pdf_scan_msgf(ps, "\n");
  762     }
  763 }
  764 
  765 static void pdf_scan_print_allocated(PDFSCAN *ps)
  766 {
  767     int count = 0;
  768     int len = 0;
  769     PDFMEM *mem = ps->memory_head;
  770     while (mem) {
  771     len += sizeof(PDFMEM);
  772     len += mem->len;
  773     count++;
  774     mem = mem->next;
  775     }
  776     pdf_scan_msgf(ps, "Allocated memory %d bytes in %d objects\n", 
  777     len, count);
  778 }
  779 
  780 #endif
  781 
  782 /*****************************************************************/
  783 /* object reading and cache */
  784 
  785 static int obj_add(PDFSCAN *ps, int objnum, ref objref)
  786 {
  787     if (ps->objs_count + 2 >= ps->objs_len) {
  788     /* allocate more space */
  789     ref *newobjs;
  790     int newlen = ps->objs_len + 256;
  791     if (newlen > ps->objs_maxlen) {
  792         pdf_scan_msgf(ps, "obj_add: too many objects to cache\n");
  793         return 0;
  794     }
  795     newobjs = (ref *)malloc(newlen * sizeof(ref));
  796     if (newobjs == NULL) {
  797         pdf_scan_msgf(ps, "obj_add: Out of memory\n");
  798         return 0;
  799     }
  800     memcpy(newobjs, ps->objs, ps->objs_len * sizeof(ref));
  801     free(ps->objs);
  802     ps->objs = newobjs;
  803     ps->objs_len = newlen;
  804     }
  805     ps->objs[ps->objs_count++] = make_int(objnum);
  806     ps->objs[ps->objs_count++] = objref;
  807     return ps->objs_count;
  808 }
  809 
  810 static ref obj_find(PDFSCAN *ps, int objnum)
  811 {
  812     int i;
  813     for (i=0; i<ps->objs_count; i+=2) {
  814     if (objnum == ps->objs[i].value.intval)
  815         return ps->objs[i+1];
  816     }
  817     return invalidref;
  818 }
  819 
  820 static void clear_objs(PDFSCAN *ps)
  821 {
  822     ps->objs_count = 0;
  823 }
  824 
  825 /*****************************************************************/
  826 /* token parsing */
  827 
  828 static int is_white(char ch)
  829 {
  830     return (ch == '\0') || (ch == '\t') || (ch == '\n') ||
  831     (ch == '\f') || (ch == '\r') || (ch == ' ');
  832 }
  833 
  834 static int is_delimiter(char ch)
  835 {
  836     return (ch == '(') || (ch == ')') || 
  837     (ch == '<') || (ch == '>') ||
  838     (ch == '[') || (ch == ']') ||
  839     (ch == '{') || (ch == '}') ||
  840     (ch == '/') || (ch == '%');
  841 }
  842 
  843 
  844 /* Scan next token from buffer, returning token type and offset to begin 
  845  * and end of token.
  846  * Return 0 if OK, 1 if no token or not enough data, -1 on error
  847  */
  848 static int pdf_scan_token(const char *buf, int buflen, 
  849     rtype *ttype, int *tbegin, int *tend)
  850 {
  851     int code = -1;
  852     int i = 0;
  853     rtype type;
  854     int begin, end;
  855     *ttype = type = invalidtype;
  856     *tbegin = begin = 0;
  857     *tend = end = 0;
  858     while ((i < buflen) && is_white(buf[i]))
  859     i++;
  860     if (i == buflen)
  861     return 1;
  862 
  863     begin = i;
  864     if (buf[i] == '%') {
  865     while (i < buflen) {
  866         if ((buf[i] == '\n') || (buf[i] == '\r')) {
  867         type = commenttype;
  868         end = i;
  869         code = 0;
  870         break;
  871         }
  872         i++;
  873     }
  874         if (i >= buflen)
  875         code = 1;
  876 
  877     }
  878     else if (buf[i] == '(') {
  879     /* string */
  880     int pcount = 0;
  881     type = stringtype;
  882     i++;
  883     while (i < buflen) {
  884         if (buf[i] == '\\')
  885         i++;
  886         else if (buf[i] == '(')
  887         pcount++;
  888         else if (buf[i] == ')') {
  889         if (pcount <= 0) {
  890             end = i+1;
  891             code = 0;
  892             break;
  893         }
  894         else
  895             pcount--;
  896         }
  897         i++;
  898     }
  899     if (i >= buflen)
  900         code = 1;
  901     }
  902     else if (buf[i] == '<') {
  903     i++;
  904     if (i >= buflen) {
  905         code = 1;
  906     }
  907     else if (buf[i] == '<') {
  908         /* marktype */
  909         end = i+1;
  910         type = marktype;
  911         code = 0;
  912     }
  913     else {
  914         /* hexadecimal string */
  915         type = stringtype;
  916         while (i < buflen) {
  917         if (buf[i] == '>') {
  918             end = i+1;
  919             code = 0;
  920             break;
  921         }
  922         i++;
  923         }
  924         if (i >= buflen)
  925         code = 1;
  926     }
  927     }
  928     else if (buf[i] == '[') {
  929     code = 0;
  930     end = i+1;
  931     type = marktype;
  932     }
  933     else if (buf[i] == '/') {
  934     /* name */
  935     type = nametype;
  936     i++;
  937     while (i < buflen) {
  938         if (is_white(buf[i]) || is_delimiter(buf[i])) {
  939         end = i;
  940         code = 0;
  941         break;
  942         }
  943         i++;
  944     }
  945     if (i >= buflen)
  946         code = 1;
  947     }
  948     else if (is_delimiter(buf[i])) {
  949     /* skip over delimiter */
  950     if (buf[i] == '>') {
  951         i++;
  952         if (i < buflen) {
  953         if (buf[i] == '>') {
  954             type = optype;
  955             end = i+1;
  956             code = 0;
  957         }
  958         else
  959             code = -1;
  960         }
  961     }
  962     else {
  963         type = optype;
  964         end = i+1;
  965         code = 0;
  966     }
  967     if (i >= buflen)
  968         code = 1;
  969     }
  970     else {
  971     /* First assume that it is an op */
  972     type = optype;
  973     while (i < buflen) {
  974         if (is_white(buf[i]) || is_delimiter(buf[i])) {
  975         end = i;
  976         code = 0;
  977         break;
  978         }
  979         i++;
  980     }
  981     if (i >= buflen)
  982         code = 1;
  983 
  984     /* try to convert it into a bool */
  985     if ((code == 0) && (type == optype)) {
  986         if ((end - begin == 4) && 
  987         (memcmp(buf+begin, "true", 4) == 0)) {
  988         type = booltype;
  989         }
  990         else if ((end - begin == 5) && 
  991         (memcmp(buf+begin, "false", 5) == 0)) {
  992         type = booltype;
  993         }
  994     }
  995 
  996     /* try to convert it into an integer */
  997     if ((code == 0) && (type == optype)) {
  998         int j;
  999         char ch;
 1000         BOOL isreal = FALSE;
 1001         BOOL isnum = TRUE;
 1002         for (j=begin; j<end; j++) {
 1003         ch = buf[j];
 1004         if (ch == '.')
 1005             isreal = TRUE;
 1006         if (!((ch == '-') || (ch == '+') || (ch == '.') || 
 1007             isdigit((int)ch)))
 1008             isnum = FALSE;
 1009         }
 1010         if (isnum) {
 1011         if (isreal)
 1012             type = realtype;
 1013         else
 1014             type = integertype;
 1015         }
 1016     }
 1017     }
 1018 
 1019     *ttype = type;
 1020     *tbegin = begin;
 1021     *tend = end;
 1022     return code;
 1023 }
 1024 
 1025 /*****************************************************************/
 1026 
 1027 static void pdf_scan_finish(PDFSCAN *ps)
 1028 {
 1029     if (ps->file) {
 1030     fclose(ps->file);
 1031     ps->file = NULL;
 1032     }
 1033     if (ps->buf) {
 1034     free(ps->buf);
 1035     ps->buf = NULL;
 1036     }
 1037     ps->buflen = 0;
 1038     if (ps->xref) {
 1039     free(ps->xref);
 1040     ps->xref = NULL;
 1041     }
 1042     ps->xref_len = 0;
 1043     if (ps->ostack) {
 1044     free(ps->ostack);
 1045     ps->ostack = NULL;
 1046     }
 1047     ps->ostack_len = 0;
 1048     ps->ostack_idx = 0;
 1049 
 1050     if (ps->objs) {
 1051     free(ps->objs);
 1052     ps->objs = NULL;
 1053     }
 1054     ps->objs_len = 0;
 1055     ps->objs_count = 0;
 1056     memset(ps, 0, sizeof(PDFSCAN));
 1057 }
 1058 
 1059 static int pdf_scan_open_file(PDFSCAN *ps)
 1060 {
 1061     ps->file = csfopen(ps->filename, TEXT("rb"));
 1062     if (ps->file == NULL)
 1063     return -1;
 1064     return 0;
 1065 }
 1066 
 1067 static int pdf_scan_init(PDFSCAN *ps, const TCHAR *name)
 1068 {
 1069     int len = (int)(cslen(name)+1) * sizeof(TCHAR);
 1070     if (len > (int)sizeof(ps->filename))
 1071     return -1;
 1072     memcpy(ps->filename, name, len);
 1073     if (pdf_scan_open_file(ps) != 0) 
 1074     return -1;
 1075     ps->buflen = 256;
 1076     ps->buf = (char *)malloc(ps->buflen);
 1077     if (ps->buf == NULL) {
 1078     pdf_scan_finish(ps);
 1079     return -2;
 1080     }
 1081     ps->ostack_maxlen = 4096;
 1082     ps->ostack_len = 256;
 1083     ps->ostack_idx = 0; /* empty */
 1084     ps->ostack = (ref *)malloc(ps->ostack_len * sizeof(ref));
 1085     if (ps->ostack == NULL) {
 1086     pdf_scan_finish(ps);
 1087     return -2;
 1088     }
 1089     /* make first item on stack invalid */
 1090     ps->ostack[0].type = invalidtype;
 1091     ps->ostack[0].rsize = 0;
 1092     ps->ostack[0].value.voidval = NULL;
 1093 
 1094     /* object cache */
 1095     ps->objs_maxlen = 1024;
 1096     ps->objs_len = 256;
 1097     ps->objs_count = 0; /* empty */
 1098     ps->objs = (ref *)malloc(ps->objs_len * sizeof(ref));
 1099     if (ps->objs == NULL) {
 1100     pdf_scan_finish(ps);
 1101     return -2;
 1102     }
 1103 
 1104     ps->pagenum = -1;   /* no cached media info yet */
 1105 
 1106     return 0;
 1107 }
 1108 
 1109 static int pdf_scan_seek(PDFSCAN *ps, long offset, PDFSEEK whence)
 1110 {
 1111     int code = -1;
 1112     switch (whence) {
 1113     case PDFSEEK_CUR:
 1114         offset = ps->offset + ps->end + offset;
 1115     case PDFSEEK_SET:
 1116         ps->begin = ps->end = ps->len = 0;
 1117         code = fseek(ps->file, offset, SEEK_SET);
 1118         ps->offset = offset;
 1119         break;
 1120     case PDFSEEK_END:
 1121         code = fseek(ps->file, 0, SEEK_END);
 1122         ps->begin = ps->end = ps->len = 0;
 1123         ps->offset = ftell(ps->file);
 1124         break;
 1125     }
 1126     return code;
 1127 }
 1128 
 1129 /* Read next token from PDF file */
 1130 /* Return 0 if OK, or -1 if EOF, -2 if error */
 1131 /* Set *token_type to token type */
 1132 static int pdf_scan_next_token(PDFSCAN *ps)
 1133 {
 1134     int code = 0;
 1135     int count;
 1136     rtype type=invalidtype;
 1137     int begin=0, end=0;
 1138 
 1139     do {
 1140     if ((code == 1) && ps->end) {
 1141         /* move characters to front of buffer */
 1142         if (ps->len - ps->end)
 1143         memmove(ps->buf, ps->buf+ps->end, ps->len - ps->end);
 1144         ps->offset += ps->end;
 1145         ps->len = ps->len - ps->end;
 1146         ps->begin = 0;
 1147         ps->end = 0;
 1148     }
 1149 
 1150     if ((code == 1) && (ps->len >= ps->buflen)) {
 1151         /* increase buffer size */
 1152         char *newbuf;
 1153         int newbuflen = 2 * ps->buflen;
 1154         newbuf = (char *)malloc(newbuflen);
 1155         if (newbuf) {
 1156         memcpy(newbuf, ps->buf, ps->buflen);
 1157         free(ps->buf);
 1158         ps->buf = newbuf;
 1159         ps->buflen = newbuflen;
 1160         }
 1161         else {
 1162         pdf_scan_msgf(ps, "Out of memory in pdf_scan_next_token\n");
 1163         pdf_scan_msgf(ps, "Tried to realloc %d to %d\n",
 1164             ps->buflen, newbuflen);
 1165         code = -2;
 1166         break;
 1167         }
 1168     }
 1169 
 1170     if ((code == 1) || (ps->len == 0)) {
 1171         count = (int)fread(ps->buf+ps->len, 1, ps->buflen-ps->len, 
 1172         ps->file);
 1173         if (count == 0) {
 1174         pdf_scan_msgf(ps, "EOF in pdf_scan_next_token\n");
 1175         code = -1;
 1176         break;
 1177         }
 1178         ps->len += count;
 1179     }
 1180 
 1181     while (ps->instream) {
 1182         /* We are in a stream.  Keep reading until we find
 1183          * the endstream.  This isn't robust. It can be fooled 
 1184          * by "endstream" occuring within a stream.
 1185          */
 1186         while ((ps->end < ps->len) && (ps->buf[ps->end] != 'e'))
 1187         ps->end++;
 1188         /* look for endstream */
 1189         if (ps->end + 9 >= ps->len) {
 1190         code = 1;   /* need more */
 1191         break;
 1192         }
 1193         if (memcmp(ps->buf+ps->end, "endstream", 9) == 0)
 1194         ps->instream = FALSE;
 1195         else
 1196         ps->end++;
 1197     }
 1198     if (!ps->instream)
 1199         code = pdf_scan_token(ps->buf+ps->end, ps->len - ps->end, 
 1200         &type, &begin, &end);
 1201     } while (code == 1);
 1202 
 1203 
 1204     if (code == 0) {
 1205     /* got a token */
 1206     ps->begin = ps->end + begin;
 1207     ps->end = ps->end + end;
 1208     ps->token_type = type;
 1209 
 1210     if ((type == optype) && (ps->end-ps->begin == 6) &&
 1211         (memcmp(ps->buf+ps->begin, "stream", 6) == 0))
 1212         ps->instream = TRUE;
 1213     }
 1214 
 1215     return code;
 1216 }
 1217 
 1218 /*****************************************************************/
 1219 /* Reading %%EOF, xref, trailer */
 1220 
 1221 static int
 1222 previous_line(const char *str, int len)
 1223 {
 1224     int i = len-1;
 1225     /* first skip over EOL */
 1226     while ((i > 0) && ((str[i]=='\r') || (str[i]=='\n')))
 1227     i--;
 1228     while ((i > 0) && !((str[i]=='\r') || (str[i]=='\n')))
 1229     i--;
 1230     if (!((str[i]=='\r') || (str[i]=='\n')))
 1231     return -1; /* didn't find a line */
 1232     return i+1;
 1233 }
 1234 
 1235 static int
 1236 pdf_scan_find_xref(PDFSCAN *ps)
 1237 {
 1238     char buf[4096];
 1239     int i, j;
 1240     int code = -1;
 1241     int count;
 1242     pdf_scan_seek(ps, 0, PDFSEEK_END);
 1243     count = min((int)sizeof(buf), ps->offset);
 1244     pdf_scan_seek(ps, -count, PDFSEEK_CUR);
 1245     count = (int)fread(buf, 1, sizeof(buf), ps->file);
 1246     pdf_scan_seek(ps, 0, PDFSEEK_SET);
 1247     if (count == 0)
 1248     return -1;
 1249     i = count - 5;
 1250     while (i > 0) {
 1251     /* Find %%EOF */
 1252     if (memcmp(buf+i, "%%EOF", 5) == 0) {
 1253         code = 0;
 1254         break;
 1255     }
 1256     i--;
 1257     }
 1258     if (i == 0) {
 1259     pdf_scan_msgf(ps, "Failed to find %%EOF\n");
 1260     code = -1;
 1261     }
 1262     if (code == 0) {
 1263     /* Look for xref table offset */
 1264     j = previous_line(buf, i);
 1265     if (j >= 0)
 1266         ps->xref_offset = atol(buf+j);
 1267     else 
 1268         code = -1;
 1269     i = j;
 1270     if (ps->xref_offset == 0)
 1271         code = -1;
 1272     if (code != 0)
 1273         pdf_scan_msgf(ps, "Failed to find cross reference table\n");
 1274     }
 1275 
 1276     if (code == 0) {
 1277     /* Look for "startxref" */
 1278     j = previous_line(buf, i);
 1279     if (j >= 0) {
 1280         if (memcmp(buf+j, "startxref", 9) != 0)
 1281         code = -1;
 1282     }
 1283     else {
 1284         code = -1;
 1285     }
 1286     if (code != 0)
 1287         pdf_scan_msgf(ps, "Failed to find startxref\n");
 1288     }
 1289     return code;
 1290 }
 1291 
 1292 /* Read a cross reference table */
 1293 /* This is called for each cross reference table */
 1294 static int
 1295 pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset)
 1296 {
 1297     int code;
 1298     int i;
 1299     int first = 0;
 1300     int count = 0;
 1301     unsigned long prev = 0;
 1302     unsigned long offset = 0;
 1303     int generation = 0;
 1304     BOOL used = FALSE;
 1305     pdf_scan_seek(ps, xref_offset, PDFSEEK_SET);
 1306     code = pdf_scan_next_token(ps);
 1307     if (code == 0)
 1308     code = op_check(ps, "xref");
 1309     while (code == 0) {
 1310         code = pdf_scan_next_token(ps);
 1311         if ((code == 0) && is_optoken(ps, "trailer"))
 1312         break;  /* finished this xref table */
 1313     if (code == 0) {
 1314         first = atoi(ps->buf + ps->begin);
 1315             code = pdf_scan_next_token(ps);
 1316     }
 1317     if (code == 0) {
 1318         count = atoi(ps->buf + ps->begin);
 1319     }
 1320     if (code == 0) {
 1321         /* make sure there is enough space in the table */
 1322         if (first + count > ps->xref_len) {
 1323         int len = (first + count) * sizeof(PDFXREF);
 1324         PDFXREF *newxref = (PDFXREF *)malloc(len);
 1325         if (newxref) {
 1326             memset(newxref, 0, len);
 1327             memcpy(newxref, ps->xref, ps->xref_len * sizeof(PDFXREF));
 1328             free(ps->xref);
 1329             ps->xref = newxref;
 1330             ps->xref_len = first + count;
 1331         }
 1332         else {
 1333             pdf_scan_msgf(ps, "pdf_scan_read_xref: out of memory\n");
 1334             code = -2;
 1335             break;
 1336         }
 1337         }
 1338     }
 1339     for (i=first; i<first+count; i++) {
 1340             code = pdf_scan_next_token(ps);
 1341         if (code == 0) {
 1342         offset = atol(ps->buf+ps->begin);
 1343                 code = pdf_scan_next_token(ps);
 1344         }
 1345         if (code == 0) {
 1346         generation = atoi(ps->buf+ps->begin);
 1347                 code = pdf_scan_next_token(ps);
 1348         }
 1349         if (code == 0) {
 1350         if (is_optoken(ps, "n"))
 1351             used = TRUE;
 1352         else if (is_optoken(ps, "f"))
 1353             used = FALSE;
 1354         else
 1355             code = -1;
 1356         }
 1357         /* We don't deal correctly with generation.
 1358          * We assume that the first xref table that marks an
 1359          * object as used is the definitive reference.
 1360          */
 1361         if (code == 0) {
 1362         if (!(ps->xref[i].used)) {
 1363             ps->xref[i].offset = offset;
 1364             ps->xref[i].generation = generation;
 1365             ps->xref[i].used = used;
 1366         }
 1367         }
 1368     }
 1369     }
 1370 
 1371     if (code == 0) {
 1372     code = pdf_scan_read_trailer(ps, &prev);
 1373     if ((code == 0) && prev && prev != ps->xref_offset) {
 1374         /* read older xref and trailer */
 1375         code = pdf_scan_read_xref(ps, prev);
 1376     }
 1377     }
 1378 
 1379     return code;
 1380 }
 1381 
 1382 /* Read a trailer */
 1383 static int
 1384 pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev)
 1385 {
 1386     int code = 0;
 1387     ref p;
 1388     code = pdf_scan_next_token(ps);
 1389     if ((code == 0) && (ps->token_type != marktype))
 1390     code = -1;
 1391     push_token(ps);
 1392     while (code == 0) {
 1393         code = pdf_scan_next_token(ps);
 1394     if (code != 0)
 1395         break;
 1396     if (is_optoken(ps, "startxref")) {
 1397         if (ps->root == 0) {
 1398             p = dict_get(ps, "Root");
 1399             if (p.type == objtype)
 1400             ps->root = p.value.objval;
 1401         else {
 1402             pdf_scan_msgf(ps, 
 1403             "trailer /Root requires indirect reference\n");
 1404             code = -1;
 1405         }
 1406         }
 1407         p = dict_get(ps, "Prev");
 1408         if (p.type == integertype)
 1409         *prev = p.value.intval;
 1410         else if (p.type != invalidtype) {
 1411         code = -1;
 1412         pdf_scan_msgf(ps, "trailer /Prev requires integer\n");
 1413         }
 1414         break;
 1415     }
 1416     if (process_op(ps) != 0)
 1417         push_token(ps);
 1418     }
 1419     if (code != 0)
 1420     pdf_scan_msgf(ps, "Error reading trailer\n");
 1421     return code;
 1422 }
 1423 
 1424 
 1425 static int pdf_scan_read_object_start(PDFSCAN *ps, int objnum)
 1426 {
 1427     int code = 0;
 1428     int value = 0;
 1429     if (objnum == 0) {
 1430     pdf_scan_msgf(ps, "Object 0 is always unused\n");
 1431     return -1;
 1432     }
 1433     if (objnum >= ps->xref_len) {
 1434     pdf_scan_msgf(ps, "Object reference %d doesn't exist.  There are only %d objects\n", objnum, ps->xref_len);
 1435     return -1;
 1436     }
 1437     if (!ps->xref[objnum].used) {
 1438     pdf_scan_msgf(ps, "Object %d is unused\n", objnum);
 1439     return -1;
 1440     }
 1441     pdf_scan_seek(ps, ps->xref[objnum].offset, PDFSEEK_SET);
 1442 
 1443     code = pdf_scan_next_token(ps);     /* object number */
 1444     if (code == 0)
 1445     code = type_check(ps, integertype);
 1446     if (code == 0) {
 1447     value = atoi(ps->buf+ps->begin);    /* object number */
 1448     code = pdf_scan_next_token(ps);     /* generation */
 1449     }
 1450     if (code == 0)
 1451     code = type_check(ps, integertype);
 1452     if (code == 0)
 1453     code = pdf_scan_next_token(ps);     /* obj */
 1454     if (code == 0)
 1455     code = op_check(ps, "obj");
 1456 
 1457     if (value != objnum) {
 1458     pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
 1459     return -1;
 1460     }
 1461     return code;
 1462 }
 1463 
 1464 /*****************************************************************/
 1465 
 1466 /* Read an object, and leave it on the stack */
 1467 static int
 1468 pdf_scan_read_object(PDFSCAN *ps, int objnum)
 1469 {
 1470     int code;
 1471     ref objref = obj_find(ps, objnum);
 1472 
 1473     if (objref.type != invalidtype) {
 1474     /* found in cache */
 1475     push_stack(ps, objref);
 1476     return 0;
 1477     }
 1478 
 1479     code = pdf_scan_read_object_start(ps, objnum);
 1480     if (code) {
 1481     pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
 1482     return -1;
 1483     }
 1484 
 1485     code = pdf_scan_next_token(ps);
 1486     if ((code == 0) && (ps->token_type != marktype))
 1487     code = -1;
 1488     push_token(ps);
 1489     while (code == 0) {
 1490         code = pdf_scan_next_token(ps);
 1491     if (code != 0)
 1492         break;
 1493     if (is_optoken(ps, "endobj")) {
 1494         obj_add(ps, objnum, top_stack(ps));
 1495         break;
 1496     }
 1497     if (process_op(ps) != 0)
 1498         push_token(ps);
 1499     }
 1500     return code;
 1501 }
 1502 
 1503 /*****************************************************************/
 1504 
 1505 /* find the object number for a page */
 1506 /* Return <= 0 if failure, or object number */
 1507 /* First page is 0 */
 1508 static int pdf_scan_find_page(PDFSCAN *ps, int pagenum)
 1509 {
 1510     int code;
 1511     ref kids;
 1512     ref r;
 1513     int pageobj = 0;
 1514     int count_base = 0;
 1515     int count;
 1516     ref *pref;
 1517     int i;
 1518     int inext;
 1519 
 1520     if (pagenum >= ps->page_count) {
 1521     pdf_scan_msgf(ps, "Not that many pages\n");
 1522     return -1;
 1523     }
 1524     code = pdf_scan_read_object(ps, ps->pages);
 1525     if (code) {
 1526     pdf_scan_msgf(ps, "Didn't find Pages object\n");
 1527     return -1;
 1528     }
 1529     /* iterate through Kids, looking for the one that includes this page */ 
 1530     kids = dict_get(ps, "Kids");
 1531     if (kids.type != arraytype) {
 1532     pdf_scan_msgf(ps, "/Pages object %d must contain /Kids array\n",
 1533         ps->pages);
 1534     return -1;
 1535     }
 1536     pop_stack(ps);  /* First Pages */
 1537     for (i = 0; (i < kids.rsize) && (code == 0); i=inext) {
 1538     inext = i+1;
 1539     pref = &kids.value.arrayval[i];
 1540     if (pref->type == objtype)
 1541         code = pdf_scan_read_object(ps, pref->value.objval);
 1542     if (code == 0) {
 1543         r = dict_get(ps, "Type"); 
 1544         if (nameref_equals(&r, "Page")) {
 1545         if (count_base + i == pagenum) {
 1546             /* this is it */
 1547             pageobj = pref->value.objval;
 1548             pop_stack(ps);  /* the wanted page */
 1549             break;
 1550         }
 1551         }
 1552         else if (nameref_equals(&r, "Pages")) {
 1553             r = dict_get(ps, "Count"); 
 1554         if (r.type == integertype) {
 1555             count = r.value.intval;
 1556             if (pagenum < count_base + count) {
 1557             /* It's under this child */
 1558             inext = 0;
 1559                 pop_stack(ps);  /* The old /Pages */
 1560             code = pdf_scan_read_object(ps, pref->value.objval);
 1561             if (code == 0) {
 1562                 kids = dict_get(ps, "Kids");
 1563                 if (kids.type != arraytype) {
 1564                 pdf_scan_msgf(ps, 
 1565                 "/Pages object %d must contain /Kids array\n",
 1566                     pref->value.objval);
 1567                 code = -1;
 1568                 }
 1569             }
 1570             }
 1571             else {
 1572             count_base += count;
 1573             }
 1574         }
 1575         else {
 1576             pdf_scan_msgf(ps, "/Pages /Count must be integer\n");
 1577             code = -1;
 1578         }
 1579         }
 1580         else {
 1581         pdf_scan_msgf(ps, 
 1582             "pdf_scan_find_page: object %d isn't Pages or Page\n", 
 1583             pref->value.objval);
 1584         code = -1;
 1585         }
 1586         pop_stack(ps);
 1587     }
 1588     }
 1589 
 1590     if (pageobj <= 0) {
 1591     pdf_scan_msgf(ps, "Failed to find page %d\n", pagenum+1);
 1592     code = -1;
 1593     }
 1594 
 1595     if (code)
 1596     return -1;
 1597 
 1598     /* Don't clean up, since we will use the cached objects
 1599      * when extracting the page media.
 1600      */
 1601 
 1602     return pageobj;
 1603 }
 1604 
 1605 
 1606 static int
 1607 pdf_scan_read_page_count(PDFSCAN *ps)
 1608 {
 1609     int code;
 1610     ref p;
 1611     code = pdf_scan_read_object(ps, ps->pages);
 1612     if (code) {
 1613     pdf_scan_msgf(ps, "Didn't find Pages object\n");
 1614     return -1;
 1615     }
 1616 
 1617     p = dict_get(ps, "Type");
 1618     if (!nameref_equals(&p, "Pages")) {
 1619     pdf_scan_msgf(ps, "Pages object didn't have /Type /Pages\n");
 1620     return -1;
 1621     }
 1622     p = dict_get(ps, "Count");
 1623     if (p.type != integertype) {
 1624     pdf_scan_msgf(ps, "Pages object didn't integer /Count\n");
 1625     return -1;
 1626     }
 1627     ps->page_count = p.value.intval;
 1628 
 1629     return code;
 1630 }
 1631 
 1632 static int convert_float(ref r, float *f)
 1633 {
 1634     if (r.type == realtype)
 1635     *f = r.value.realval;
 1636     else if (r.type == integertype)
 1637     *f = (float)r.value.intval;
 1638     else
 1639        return -1;
 1640     return 0;
 1641 }
 1642 
 1643 static int
 1644 pdf_scan_read_bbox(PDFBBOX *box, ref array)
 1645 {
 1646     int code = 0;
 1647     if (array.type != arraytype)
 1648     code = -1;
 1649     if (array.rsize != 4)
 1650     code = -1;
 1651     if (code == 0)
 1652         code = convert_float(array.value.arrayval[0], &box->llx);
 1653     if (code == 0)
 1654     code = convert_float(array.value.arrayval[1], &box->lly);
 1655     if (code == 0)
 1656     code = convert_float(array.value.arrayval[2], &box->urx);
 1657     if (code == 0)
 1658     code = convert_float(array.value.arrayval[3], &box->ury);
 1659     return code;
 1660 }
 1661 
 1662 /* Read catalog and leave on stack */
 1663 static int
 1664 pdf_scan_read_catalog(PDFSCAN *ps)
 1665 {
 1666     int code;
 1667     ref p;
 1668     /* Read root object, making sure it is /Type /Catalog,
 1669      * and that /Pages is an indirect reference
 1670      */
 1671     code = pdf_scan_read_object(ps, ps->root);
 1672     if (code) {
 1673     pdf_scan_msgf(ps, "Didn't find Root object\n");
 1674     return -1;
 1675     }
 1676 
 1677     p = dict_get(ps, "Type");
 1678     if (!nameref_equals(&p, "Catalog")) {
 1679     pdf_scan_msgf(ps, "Root object didn't have /Type /Catalog\n");
 1680     return -1;
 1681     }
 1682     p = dict_get(ps, "Pages");
 1683     if (p.type != objtype) {
 1684     pdf_scan_msgf(ps, "Root object didn't indirect reference to /Pages\n");
 1685     return -1;
 1686     }
 1687     ps->pages = p.value.intval;
 1688     return 0;
 1689 }
 1690 
 1691 /*****************************************************************/
 1692 /* public functions */
 1693 
 1694 
 1695 void
 1696 pdf_scan_close(PDFSCAN *ps)
 1697 {
 1698     pdf_scan_cleanup(ps);
 1699     pdf_scan_finish(ps);
 1700     free(ps);
 1701 }
 1702 
 1703 
 1704 PDFSCAN *
 1705 pdf_scan_open(const TCHAR *filename, void *handle,
 1706     int (*fn)(void *handle, const char *ptr, int len))
 1707 {
 1708     int code;
 1709     int rotate;
 1710     PDFBBOX mediabox, cropbox;
 1711     PDFSCAN *ps = (PDFSCAN *)malloc(sizeof(PDFSCAN));
 1712     if (ps == NULL)
 1713     return NULL;
 1714     memset(ps, 0, sizeof(PDFSCAN));
 1715     ps->handle = handle;
 1716     ps->print_fn = fn;
 1717     code = pdf_scan_init(ps, filename);
 1718     if (code == -1)
 1719     pdf_scan_msgf(ps, "Couldn't open PDF file\n");
 1720     else if (code != 0)
 1721     pdf_scan_msgf(ps, "Error initialising PDF scanner\n");
 1722 
 1723     if (code == 0)
 1724         code = pdf_scan_find_xref(ps);
 1725     if (code == 0)
 1726     code = pdf_scan_read_xref(ps, ps->xref_offset);
 1727     if (code == 0)
 1728     code = pdf_scan_read_catalog(ps);
 1729     if (code == 0)
 1730     code = pdf_scan_read_page_count(ps);
 1731     if (code == 0)
 1732     code = pdf_scan_page_media(ps, 0, &rotate, &mediabox, &cropbox);
 1733 
 1734     pdf_scan_cleanup(ps);
 1735     if (code != 0) {
 1736     pdf_scan_close(ps);
 1737     ps = NULL;
 1738     }
 1739     return ps;
 1740 }
 1741 
 1742 int
 1743 pdf_scan_page_count(PDFSCAN *ps) 
 1744 {
 1745     if (ps == NULL)
 1746     return 0;
 1747     return ps->page_count;
 1748 }
 1749 
 1750 int
 1751 pdf_scan_page_media(PDFSCAN *ps, int pagenum, int *rotate,
 1752     PDFBBOX *mediabox, PDFBBOX *cropbox)
 1753 {
 1754     BOOL found_rotate = FALSE;
 1755     BOOL found_mediabox = FALSE;
 1756     BOOL found_cropbox = FALSE;
 1757     BOOL has_parent = TRUE;
 1758     ref p, objref;
 1759     int objnum;
 1760 
 1761     if (ps == NULL)
 1762     return -1;
 1763 
 1764     if (pagenum == ps->pagenum) {
 1765     /* Used cached values */
 1766     *rotate = ps->rotate;
 1767     *mediabox = ps->mediabox;
 1768     *cropbox = ps->cropbox;
 1769     return 0;
 1770     }
 1771 
 1772     if (ps->file == NULL) {
 1773     if (pdf_scan_open_file(ps) != 0) 
 1774         return -1;
 1775     }
 1776     objnum = pdf_scan_find_page(ps, pagenum);
 1777     if (objnum <= 0) {
 1778     pdf_scan_cleanup(ps);
 1779     return -1;
 1780     }
 1781     if (pdf_scan_read_object(ps, objnum) < 0) {
 1782     pdf_scan_cleanup(ps);
 1783     return -1;
 1784     }
 1785 
 1786     while (has_parent) {
 1787     if (!found_rotate) {
 1788         p = dict_get(ps, "Rotate");
 1789         if (p.type == integertype) {
 1790         *rotate = p.value.intval;
 1791         found_rotate = TRUE;
 1792         }
 1793     }
 1794     if (!found_mediabox) {
 1795         p = dict_get(ps, "MediaBox");
 1796         if (pdf_scan_read_bbox(mediabox, p) == 0)
 1797         found_mediabox = TRUE;
 1798     }
 1799     if (!found_cropbox) {
 1800         p = dict_get(ps, "CropBox");
 1801         if (pdf_scan_read_bbox(cropbox, p) == 0)
 1802         found_cropbox = TRUE;
 1803     }
 1804         if (found_rotate && found_mediabox && found_cropbox)
 1805         break;
 1806 
 1807     p = dict_get(ps, "Parent");
 1808     if (p.type == objtype) {
 1809         objref = pop_stack(ps);
 1810         if (pdf_scan_read_object(ps, p.value.objval) < 0) {
 1811         push_stack(ps, objref);
 1812         has_parent = FALSE;
 1813         }
 1814     }
 1815     else
 1816         has_parent = FALSE;
 1817     }
 1818     pop_stack(ps);
 1819     if (!found_cropbox) {
 1820     *cropbox = *mediabox;
 1821     found_cropbox = TRUE;
 1822     }
 1823     if (!found_rotate) {
 1824     *rotate = 0;
 1825     found_rotate = TRUE;
 1826     }
 1827 
 1828     pdf_scan_cleanup(ps);
 1829     
 1830     if (found_rotate && found_mediabox && found_cropbox) {
 1831     /* cache these values */
 1832     ps->pagenum = pagenum;
 1833     ps->rotate = *rotate;
 1834     ps->mediabox = *mediabox;
 1835     ps->cropbox = *cropbox;
 1836         return 0;
 1837     }
 1838     
 1839     return -1;
 1840 }
 1841 
 1842 /*****************************************************************/
 1843 
 1844 #ifdef DEMO_PDFSCAN
 1845 
 1846 int test_print_fn(void *handle, const char *ptr, int len)
 1847 {
 1848     fwrite(ptr, 1, len, stdout);
 1849     return len;
 1850 }
 1851 
 1852 int main(int argc, char *argv[])
 1853 {
 1854     PDFSCAN *ps;
 1855     int i, count;
 1856     int code;
 1857     PDFBBOX mediabox, cropbox;
 1858     int rotate;
 1859 
 1860     if (argc < 2) {
 1861     fprintf(stdout, "Usage: cpdfscan filename\n");
 1862     return 1;
 1863     }
 1864 
 1865     ps = pdf_scan_open(argv[1], NULL, test_print_fn);
 1866     if (ps) {
 1867     count = pdf_scan_page_count(ps);
 1868     pdf_scan_msgf(ps, "Page count is %d\n", count);
 1869     for (i=0; i<count; i++) {
 1870         code = pdf_scan_page_media(ps, i, &rotate, &mediabox, &cropbox);
 1871         if (code == 0) {
 1872             fprintf(stdout, "Page %d /Rotate %d ", i+1, rotate);
 1873             fprintf(stdout, "/MediaBox [%g %g %g %g] /CropBox [%g %g %g %g]\n", 
 1874             mediabox.llx, mediabox.lly, mediabox.urx, mediabox.ury,
 1875             cropbox.llx, cropbox.lly, cropbox.urx, cropbox.ury);
 1876         }
 1877         else
 1878             fprintf(stdout, "Page %d media unknown\n", i+1);
 1879     }
 1880     pdf_scan_close(ps);
 1881     }
 1882     return 0;
 1883 }
 1884 
 1885 #endif