w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

Lexer.cc
Go to the documentation of this file.
1 //========================================================================
2 //
3 // Lexer.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2006-2010, 2012-2014, 2017-2019 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18 // Copyright (C) 2010 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2012, 2013 Adrian Johnson <ajohnson@redneon.com>
20 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
21 //
22 // To see a description of the changes please see the Changelog file that
23 // came with your tarball or type make ChangeLog if you are building from git
24 //
25 //========================================================================
26 
27 #include <config.h>
28 
29 #include <cstdlib>
30 #include <cstddef>
31 #include <cstring>
32 #include <climits>
33 #include <cctype>
34 #include "Lexer.h"
35 #include "Error.h"
36 #include "XRef.h"
37 
38 //------------------------------------------------------------------------
39 
40 // A '1' in this array means the character is white space. A '1' or
41 // '2' means the character ends a name or command.
// The table is indexed by byte value; each row below covers 16 codes, and
// the trailing comment on a row names the non-zero entries in that row.
42 static const char specialChars[256] = {
43  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x: NUL, HT, LF, FF, CR are white space (1)
44  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
45  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x: space is white space (1); '%' '(' ')' '/' are 2
46  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x: '<' '>' are 2
47  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
48  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x: '[' ']' are 2
49  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x
50  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x: '{' '}' are 2
51  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
52  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
53  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax
54  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx
55  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx
56  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx
57  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex
58  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx
59 };
60 
// Thresholds for the decimal accumulators in the number parser: any value
// at or below the limit can take one more digit (value * 10 + 9) without
// exceeding INT_MAX / LLONG_MAX.  The parser only evaluates its exact
// per-digit overflow test once an accumulator is above these limits.
61 static const int IntegerSafeLimit = (INT_MAX - 9) / 10;
62 static const long long LongLongSafeLimit = (LLONG_MAX - 9) / 10;
63 
64 //------------------------------------------------------------------------
65 // Lexer
66 //------------------------------------------------------------------------
67 
68 Lexer::Lexer(XRef *xrefA, Stream *str)
69 {
71  xref = xrefA;
72 
73  curStr = Object(str);
74  streams = new Array(xref);
75  streams->add(curStr.copy());
76  strPtr = 0;
77  freeArray = true;
79 }
80 
81 Lexer::Lexer(XRef *xrefA, Object *obj)
82 {
84  xref = xrefA;
85 
86  if (obj->isStream()) {
87  streams = new Array(xref);
88  freeArray = true;
89  streams->add(obj->copy());
90  } else {
91  streams = obj->getArray();
92  freeArray = false;
93  }
94  strPtr = 0;
95  if (streams->getLength() > 0) {
97  if (curStr.isStream()) {
99  }
100  }
101 }
102 
104 {
105  if (curStr.isStream()) {
107  }
108  if (freeArray) {
109  delete streams;
110  }
111 }
112 
113 int Lexer::getChar(bool comesFromLook)
114 {
115  int c;
116 
120  return c;
121  }
122 
123  c = EOF;
124  while (curStr.isStream() && (c = curStr.streamGetChar()) == EOF) {
125  if (comesFromLook == true) {
126  return EOF;
127  } else {
129  curStr = Object();
130  ++strPtr;
131  if (strPtr < streams->getLength()) {
132  curStr = streams->get(strPtr);
133  if (curStr.isStream()) {
135  }
136  }
137  }
138  }
139  return c;
140 }
141 
142 int Lexer::lookChar()
143 {
144 
147  }
149  if (lookCharLastValueCached == EOF) {
151  return EOF;
152  } else {
154  }
155 }
156 
158 {
159  char *p;
160  int c, c2;
161  bool comment, neg, done, overflownInteger, overflownLongLong;
162  int numParen;
163  int xi;
164  long long xll = 0;
165  double xf = 0, scale;
166  GooString *s;
167  int n, m;
168 
169  // skip whitespace and comments
170  comment = false;
171  while (true) {
172  if ((c = getChar()) == EOF) {
173  return Object(objEOF);
174  }
175  if (comment) {
176  if (c == '\r' || c == '\n')
177  comment = false;
178  } else if (c == '%') {
179  comment = true;
180  } else if (specialChars[c] != 1) {
181  break;
182  }
183  }
184 
185  // start reading token
186  switch (c) {
187 
188  // number
189  case '0':
190  case '1':
191  case '2':
192  case '3':
193  case '4':
194  case '5':
195  case '6':
196  case '7':
197  case '8':
198  case '9':
199  case '+':
200  case '-':
201  case '.':
202  overflownInteger = false;
203  overflownLongLong = false;
204  neg = false;
205  xi = 0;
206  if (c == '-') {
207  neg = true;
208  } else if (c == '.') {
209  goto doReal;
210  } else if (c != '+') {
211  xi = c - '0';
212  }
213  while (true) {
214  c = lookChar();
215  if (isdigit(c)) {
216  getChar();
217  if (unlikely(overflownLongLong)) {
218  xf = xf * 10.0 + (c - '0');
219  } else if (unlikely(overflownInteger)) {
220  if (unlikely(xll > LongLongSafeLimit) && (xll > (LLONG_MAX - (c - '0')) / 10)) {
221  overflownLongLong = true;
222  xf = xll * 10.0 + (c - '0');
223  } else {
224  xll = xll * 10 + (c - '0');
225  }
226  } else {
227  if (unlikely(xi > IntegerSafeLimit) && (xi > (INT_MAX - (c - '0')) / 10.0)) {
228  overflownInteger = true;
229  xll = xi * 10LL + (c - '0');
230  } else {
231  xi = xi * 10 + (c - '0');
232  }
233  }
234  } else if (c == '.') {
235  getChar();
236  goto doReal;
237  } else {
238  break;
239  }
240  }
241  if (neg) {
242  xi = -xi;
243  xll = -xll;
244  xf = -xf;
245  }
246  if (unlikely(overflownInteger)) {
247  if (overflownLongLong) {
248  return Object(xf);
249  } else {
250  if (unlikely(xll == INT_MIN)) {
251  return Object(static_cast<int>(INT_MIN));
252  } else {
253  return Object(xll);
254  }
255  }
256  } else {
257  return Object(xi);
258  }
259  break;
260  doReal:
261  if (likely(!overflownInteger)) {
262  xf = xi;
263  } else if (!overflownLongLong) {
264  xf = xll;
265  }
266  scale = 0.1;
267  while (true) {
268  c = lookChar();
269  if (c == '-') {
270  // ignore minus signs in the middle of numbers to match
271  // Adobe's behavior
272  error(errSyntaxWarning, getPos(), "Badly formatted number");
273  getChar();
274  continue;
275  }
276  if (!isdigit(c)) {
277  break;
278  }
279  getChar();
280  xf = xf + scale * (c - '0');
281  scale *= 0.1;
282  }
283  if (neg) {
284  xf = -xf;
285  }
286  return Object(xf);
287  break;
288 
289  // string
290  case '(':
291  p = tokBuf;
292  n = 0;
293  numParen = 1;
294  done = false;
295  s = nullptr;
296  do {
297  c2 = EOF;
298  switch (c = getChar()) {
299 
300  case EOF:
301 #if 0
302  // This breaks some PDF files, e.g., ones from Photoshop.
303  case '\r':
304  case '\n':
305 #endif
306  error(errSyntaxError, getPos(), "Unterminated string");
307  done = true;
308  break;
309 
310  case '(':
311  ++numParen;
312  c2 = c;
313  break;
314 
315  case ')':
316  if (--numParen == 0) {
317  done = true;
318  } else {
319  c2 = c;
320  }
321  break;
322 
323  case '\\':
324  switch (c = getChar()) {
325  case 'n':
326  c2 = '\n';
327  break;
328  case 'r':
329  c2 = '\r';
330  break;
331  case 't':
332  c2 = '\t';
333  break;
334  case 'b':
335  c2 = '\b';
336  break;
337  case 'f':
338  c2 = '\f';
339  break;
340  case '\\':
341  case '(':
342  case ')':
343  c2 = c;
344  break;
345  case '0':
346  case '1':
347  case '2':
348  case '3':
349  case '4':
350  case '5':
351  case '6':
352  case '7':
353  c2 = c - '0';
354  c = lookChar();
355  if (c >= '0' && c <= '7') {
356  getChar();
357  c2 = (c2 << 3) + (c - '0');
358  c = lookChar();
359  if (c >= '0' && c <= '7') {
360  getChar();
361  c2 = (c2 << 3) + (c - '0');
362  }
363  }
364  break;
365  case '\r':
366  c = lookChar();
367  if (c == '\n') {
368  getChar();
369  }
370  break;
371  case '\n':
372  break;
373  case EOF:
374  error(errSyntaxError, getPos(), "Unterminated string");
375  done = true;
376  break;
377  default:
378  c2 = c;
379  break;
380  }
381  break;
382 
383  default:
384  c2 = c;
385  break;
386  }
387 
388  if (c2 != EOF) {
389  if (n == tokBufSize) {
390  if (!s)
391  s = new GooString(tokBuf, tokBufSize);
392  else
393  s->append(tokBuf, tokBufSize);
394  p = tokBuf;
395  n = 0;
396 
397  // the string keeps growing: check whether we have read past the object we started in, which would mean the document is malformed
398  if (objNum > 0 && xref != nullptr) {
399  const int newObjNum = xref->getNumEntry(getPos());
400  if (newObjNum != objNum) {
401  error(errSyntaxError, getPos(), "Unterminated string");
402  done = true;
403  delete s;
404  n = -2;
405  }
406  }
407  }
408  *p++ = (char)c2;
409  ++n;
410  }
411  } while (!done);
412  if (n >= 0) {
413  if (!s)
414  s = new GooString(tokBuf, n);
415  else
416  s->append(tokBuf, n);
417  return Object(s);
418  } else {
419  return Object(objEOF);
420  }
421  break;
422 
423  // name
424  case '/':
425  p = tokBuf;
426  n = 0;
427  s = nullptr;
428  while ((c = lookChar()) != EOF && !specialChars[c]) {
429  getChar();
430  if (c == '#') {
431  c2 = lookChar();
432  if (c2 >= '0' && c2 <= '9') {
433  c = c2 - '0';
434  } else if (c2 >= 'A' && c2 <= 'F') {
435  c = c2 - 'A' + 10;
436  } else if (c2 >= 'a' && c2 <= 'f') {
437  c = c2 - 'a' + 10;
438  } else {
439  goto notEscChar;
440  }
441  getChar();
442  c <<= 4;
443  c2 = getChar();
444  if (c2 >= '0' && c2 <= '9') {
445  c += c2 - '0';
446  } else if (c2 >= 'A' && c2 <= 'F') {
447  c += c2 - 'A' + 10;
448  } else if (c2 >= 'a' && c2 <= 'f') {
449  c += c2 - 'a' + 10;
450  } else {
451  error(errSyntaxError, getPos(), "Illegal digit in hex char in name");
452  }
453  }
454  notEscChar:
455  // the PDF spec claims that names are limited to 127 chars, but
456  // Distiller 8 will produce longer names, and Acrobat 8 will
457  // accept longer names
458  ++n;
459  if (n < tokBufSize) {
460  *p++ = c;
461  } else if (n == tokBufSize) {
462  error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
463  *p = c;
464  s = new GooString(tokBuf, n);
465  } else {
466  s->append((char)c);
467  }
468  }
469  if (n < tokBufSize) {
470  *p = '\0';
471  return Object(objName, tokBuf);
472  } else {
473  Object obj(objName, s->c_str());
474  delete s;
475  return obj;
476  }
477  break;
478 
479  // array punctuation
480  case '[':
481  case ']':
482  tokBuf[0] = c;
483  tokBuf[1] = '\0';
484  return Object(objCmd, tokBuf);
485  break;
486 
487  // hex string or dict punctuation
488  case '<':
489  c = lookChar();
490 
491  // dict punctuation
492  if (c == '<') {
493  getChar();
494  tokBuf[0] = tokBuf[1] = '<';
495  tokBuf[2] = '\0';
496  return Object(objCmd, tokBuf);
497 
498  // hex string
499  } else {
500  p = tokBuf;
501  m = n = 0;
502  c2 = 0;
503  s = nullptr;
504  while (true) {
505  c = getChar();
506  if (c == '>') {
507  break;
508  } else if (c == EOF) {
509  error(errSyntaxError, getPos(), "Unterminated hex string");
510  break;
511  } else if (specialChars[c] != 1) {
512  c2 = c2 << 4;
513  if (c >= '0' && c <= '9')
514  c2 += c - '0';
515  else if (c >= 'A' && c <= 'F')
516  c2 += c - 'A' + 10;
517  else if (c >= 'a' && c <= 'f')
518  c2 += c - 'a' + 10;
519  else
520  error(errSyntaxError, getPos(), "Illegal character <{0:02x}> in hex string", c);
521  if (++m == 2) {
522  if (n == tokBufSize) {
523  if (!s)
524  s = new GooString(tokBuf, tokBufSize);
525  else
526  s->append(tokBuf, tokBufSize);
527  p = tokBuf;
528  n = 0;
529  }
530  *p++ = (char)c2;
531  ++n;
532  c2 = 0;
533  m = 0;
534  }
535  }
536  }
537  if (!s)
538  s = new GooString(tokBuf, n);
539  else
540  s->append(tokBuf, n);
541  if (m == 1)
542  s->append((char)(c2 << 4));
543  return Object(s);
544  }
545  break;
546 
547  // dict punctuation
548  case '>':
549  c = lookChar();
550  if (c == '>') {
551  getChar();
552  tokBuf[0] = tokBuf[1] = '>';
553  tokBuf[2] = '\0';
554  return Object(objCmd, tokBuf);
555  } else {
556  error(errSyntaxError, getPos(), "Illegal character '>'");
557  return Object(objError);
558  }
559  break;
560 
561  // error
562  case ')':
563  case '{':
564  case '}':
565  error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
566  return Object(objError);
567  break;
568 
569  // command
570  default:
571  p = tokBuf;
572  *p++ = c;
573  n = 1;
574  while ((c = lookChar()) != EOF && !specialChars[c]) {
575  getChar();
576  if (++n == tokBufSize) {
577  error(errSyntaxError, getPos(), "Command token too long");
578  break;
579  }
580  *p++ = c;
581  }
582  *p = '\0';
583  if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
584  return Object(true);
585  } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
586  return Object(false);
587  } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
588  return Object(objNull);
589  } else {
590  return Object(objCmd, tokBuf);
591  }
592  break;
593  }
594 
595  return Object();
596 }
597 
// Scan forward for the command token cmdA and return the last token read.
//
// Tokens are read one at a time into tokBuf until tokBuf equals cmdA or the
// loop guard fails.  The guard lets the scan continue only while objNum is
// negative, or while xref is non-null and xref->getNumEntry(getPos()) still
// equals objNum.
//
// Returns Object(objEOF) if getChar() reports EOF while skipping white space
// and comments.  Otherwise returns Object(objCmd, tokBuf) holding whichever
// token was read last (an empty string if the loop body never ran), so the
// result is not guaranteed to equal cmdA.
598 Object Lexer::getObj(const char *cmdA, int objNum)
599 {
600  char *p;
601  int c;
602  bool comment;
603  int n;
604 
605  // skip whitespace and comments
606  comment = false;
607  const char *cmd1 = tokBuf;
608  *tokBuf = 0; // start from an empty token so the first strcmp() compares against ""
609  while (strcmp(cmdA, cmd1) && (objNum < 0 || (xref && xref->getNumEntry(getPos()) == objNum))) {
610  while (true) {
611  if ((c = getChar()) == EOF) {
612  return Object(objEOF);
613  }
614  if (comment) {
615  if (c == '\r' || c == '\n') { // a '%' comment runs to the end of the line
616  comment = false;
617  }
618  } else if (c == '%') {
619  comment = true;
620  } else if (specialChars[c] != 1) { // first non-white-space character outside a comment starts the token
621  break;
622  }
623  }
624  p = tokBuf;
625  *p++ = c; // the first character is stored unconditionally, even if it is a delimiter
626  n = 1;
627  while ((c = lookChar()) != EOF && specialChars[c] == 0) { // extend the token until EOF, white space or a delimiter
628  getChar();
629  if (++n == tokBufSize) { // buffer full: this character is consumed but not stored, leaving room for the terminator
630  break;
631  }
632  *p++ = c;
633  }
634  *p = '\0';
635  }
636 
637  return Object(objCmd, tokBuf);
638 }
639 
641 {
642  int c;
643 
644  while (true) {
645  c = getChar();
646  if (c == EOF || c == '\n') {
647  return;
648  }
649  if (c == '\r') {
650  if ((c = lookChar()) == '\n') {
651  getChar();
652  }
653  return;
654  }
655  }
656 }
657 
// Return true when c lies in 0..0xff and its specialChars entry is 1 (the
// value that table uses for white space).  Any c outside that range,
// including negative values, yields false without indexing the table.
658 bool Lexer::isSpace(int c)
659 {
660  return c >= 0 && c <= 0xff && specialChars[c] == 1;
661 }
void add(Object *elem)
Definition: Array.cc:41
Object * get(int i, Object *obj, int recursion=0)
Definition: Array.cc:54
int getLength()
Definition: Array.h:48
static const int LOOK_VALUE_NOT_CACHED
Definition: Lexer.h:93
char tokBuf[128]
Definition: Lexer.h:82
int getChar()
Definition: Lexer.cc:89
XRef * xref
Definition: Lexer.h:109
static GBool isSpace(int c)
Definition: Lexer.cc:553
void skipToNextLine()
Definition: Lexer.cc:532
int strPtr
Definition: Lexer.h:79
Object * getObj(Object *obj)
Definition: Lexer.cc:112
Lexer(XRef *xref, Stream *str)
Definition: Lexer.cc:50
Array * streams
Definition: Lexer.h:78
int lookCharLastValueCached
Definition: Lexer.h:94
GFileOffset getPos()
Definition: Lexer.h:63
~Lexer()
Definition: Lexer.cc:79
GBool freeArray
Definition: Lexer.h:81
int lookChar()
Definition: Lexer.cc:105
Object curStr
Definition: Lexer.h:80
Definition: Object.h:84
Array * getArray()
Definition: Object.h:160
void streamClose()
Definition: Object.h:296
GBool isStream()
Definition: Object.h:138
Object * copy(Object *obj)
Definition: Object.cc:80
void streamReset()
Definition: Object.h:293
int streamGetChar()
Definition: Object.h:299
Definition: Stream.h:67
Definition: XRef.h:58
int getNumEntry(Goffset offset)
Definition: XRef.cc:1326
#define n
Definition: t4ht.c:1290
int strcmp()
Definition: coll.cpp:143
#define error(a)
Definition: dviinfo.c:48
#define s
Definition: afcover.h:80
#define c(n)
Definition: gpos-common.c:150
char comment[255+1]
Definition: hbf2gf.c:350
#define likely(x)
Definition: jbig2arith.cc:115
#define unlikely(x)
Definition: jbig2arith.cc:116
small capitals from c petite p
Definition: afcover.h:72
#define EOF
Definition: afmparse.c:59
@ errSyntaxError
Definition: Error.h:25
@ errSyntaxWarning
Definition: Error.h:23
#define tokBufSize
Definition: Lexer.h:23
@ objCmd
Definition: Object.h:62
@ objNull
Definition: Object.h:53
@ objError
Definition: Object.h:63
@ objName
Definition: Object.h:52
@ objEOF
Definition: Object.h:64
#define INT_MIN
Definition: c-minmax.h:50
#define INT_MAX
Definition: c-minmax.h:53
#define LLONG_MAX
Definition: config.h:179
#define isdigit(c)
Definition: snprintf.c:177
int getLength(char *s)
Definition: lengths.c:99
struct array Array
double scale
Definition: pnmhistmap.c:38
static const int IntegerSafeLimit
Definition: Lexer.cc:61
static const long long LongLongSafeLimit
Definition: Lexer.cc:62
static const char specialChars[256]
Definition: Lexer.cc:42
#define str(s)
Definition: sh6.c:399
#define c2
Definition: t1io.c:53
m
Definition: tex4ht.c:3990