w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

PDFParser.cpp
Go to the documentation of this file.
1 /*************************************************************************
2 ** PDFParser.cpp **
3 ** **
4 ** This file is part of dvisvgm -- a fast DVI to SVG converter **
5 ** Copyright (C) 2005-2021 Martin Gieseking <martin.gieseking@uos.de> **
6 ** **
7 ** This program is free software; you can redistribute it and/or **
8 ** modify it under the terms of the GNU General Public License as **
9 ** published by the Free Software Foundation; either version 3 of **
10 ** the License, or (at your option) any later version. **
11 ** **
12 ** This program is distributed in the hope that it will be useful, but **
13 ** WITHOUT ANY WARRANTY; without even the implied warranty of **
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the **
15 ** GNU General Public License for more details. **
16 ** **
17 ** You should have received a copy of the GNU General Public License **
18 ** along with this program; if not, see <http://www.gnu.org/licenses/>. **
19 *************************************************************************/
20 
21 #include <cctype>
22 #include <istream>
23 #include <ostream>
24 #include <sstream>
25 #include <stdexcept>
26 #include "InputReader.hpp"
27 #include "PDFParser.hpp"
28 #include "utility.hpp"
29 #include <iterator>
30 
31 using namespace std;
32 
33 
34 /** Parses PDF from an input stream and returns the corresponding object representation.
35  * @param[in] is input stream the PDF data is read from
36  * @param[in] opHandler handler used to treat PDF operators
37  * @return the parsed objects */
38 vector<PDFObject> PDFParser::parse (std::istream &is, const PDFOperatorHandler &opHandler) {
40  return parse(ir, opHandler);
41 }
42 
43 
44 /** Parses PDF from a string and returns the corresponding object representation.
45  * @param[in] str string that contains the PDF data
46  * @param[in] opHandler handler used to treat PDF operators
47  * @return the parsed objects */
48 vector<PDFObject> PDFParser::parse (const std::string &str, const PDFOperatorHandler &opHandler) {
49  istringstream iss(str);
50  return parse(iss, opHandler);
51 }
52 
53 
54 /** Parses PDF from an InputReader object and returns the corresponding object representation.
55  * @param[in] ir InputReader the PDF data is read from
56  * @param[in] opHandler handler used to treat PDF operators
57  * @return the parsed objects */
58 vector<PDFObject> PDFParser::parse (InputReader &ir, const PDFOperatorHandler &opHandler) {
59  vector<PDFObject> objects;
60  while (!ir.eof()) {
61  ir.skipSpace();
62  if (ir.peek() == '%') // comment?
63  while (ir.get() != '\n' && !ir.eof());
64  else if (!ir.eof())
65  parse(ir, objects, opHandler);
66  }
67  return objects;
68 }
69 
70 
71 /** Default handler for PDF operators. Just adds the operators to the
72  * object vector without evaluating them.
73  * @param[in] opname name of the operator
74  * @param[in,out] objects vector holding the parsed objects */
75 static void append_operator (const string &opname, vector<PDFObject> &objects) {
76  objects.emplace_back(PDFOperator(opname));
77 }
78 
79 
80 /** Parses PDF from an input stream and returns the corresponding object representation.
81  * @param[in] is input stream the PDF data is read from
82  * @return the parsed objects */
83 vector<PDFObject> PDFParser::parse (std::istream &is) {
84  return parse(is, append_operator);
85 }
86 
87 
88 /** Parses PDF from a string and returns the corresponding object representation.
89  * @param[in] str string that contains the PDF data
90  * @return the parsed objects */
91 vector<PDFObject> PDFParser::parse (const std::string &str) {
92  return parse(str, append_operator);
93 }
94 
95 /** Parses PDF from an InputReader object and returns the corresponding object representation.
96  * @param[in] ir InputReader the PDF data is read from
97  * @return the parsed objects */
98 vector<PDFObject> PDFParser::parse (InputReader &ir) {
99  return parse(ir, append_operator);
100 }
101 
102 
103 /** Parses PDF from an InputReader object and appends the recognized objects to a vector.
104  * @param[in] ir InputReader the PDF data is read from
105  * @param[in,out] objects the parsed PDF objects are appended to this vector
106  * @return the parsed objects */
107 void PDFParser::parse (InputReader &ir, vector<PDFObject> &objects) {
109 }
110 
111 
/** Returns true if the given character code denotes one of the octal digits '0' to '7'. */
inline bool isoctaldigit (int c) {
	if (c < '0')
		return false;
	return c <= '7';
}
113 
114 /** Parses a PDF escape sequence of the form \FOO, where FOO is a single
115  * character or a sequence of 1-3 octal digits
116  * @return pair (s,c), s=true if c contains a parsed character */
117 static pair<bool,int> parse_escape_seq (InputReader &ir) {
118  // leading backslash has already been read
119  if (isoctaldigit(ir.peek())) {
120  string str;
121  for (int i=0; i < 3 && isoctaldigit(ir.peek()); i++)
122  str += static_cast<char>(ir.get());
123  return pair<bool,int>{true, stoi(str, nullptr, 8)};
124  }
125  char c = static_cast<char>(ir.get());
126  switch (c) {
127  case 'n': c = '\n'; break;
128  case 'r': c = '\r'; break;
129  case 't': c = '\t'; break;
130  case 'b': c = '\b'; break;
131  case 'f': c = '\f'; break;
132  case '\n':
133  case '\r':
134  if ((c == '\n' && ir.peek() == '\r') || (c == '\r' && ir.peek() == '\n'))
135  ir.get();
136  return pair<bool,int>{false, 0};
137  }
138  return pair<bool,int>{true, c};
139 }
140 
141 
142 /** Parses a literal PDF string of the form (FOO). */
143 static string parse_literal_string (InputReader &ir) {
144  string str;
145  ir.get(); // skip initial '('
146  int open_parens=1;
147  while (ir.peek() >= 0 && open_parens > 0) {
148  if (ir.peek() == '\n' || ir.peek() == '\r')
149  break;
150  int c = ir.get();
151  switch (c) {
152  case '(': open_parens++; break;
153  case ')': open_parens--; break;
154  case '\\':
155  pair<bool,int> state = parse_escape_seq(ir);
156  c = state.first ? state.second : -1;
157  break;
158  }
159  if (open_parens > 0 && c >= 0)
160  str += static_cast<char>(c);
161  }
162  if (open_parens > 0)
163  throw PDFException("missing ')' at end of literal string");
164  return str;
165 }
166 
167 
168 /** Gets a single hex digit from the InputReader. */
169 static char get_hex_digit (InputReader &ir) {
170  int c = ir.get();
171  if (isxdigit(c))
172  return char(c);
173  throw PDFException("invalid hexadecimal digit '" + string(1, char(c)) + "'");
174 }
175 
176 
177 /** Parses a PDF hex string of the form <FOO>, where FOO is a sequence of
178  * hex digits optionally separated by whitespace. */
179 static string parse_hex_string (InputReader &ir) {
180  // initial '<' has already been read
181  string str;
182  ir.skipSpace();
183  while (ir.peek() > 0 && ir.peek() != '>') {
184  string hexpair;
185  hexpair += get_hex_digit(ir);
186  ir.skipSpace();
187  if (ir.peek() > 0 && ir.peek() != '>')
188  hexpair += get_hex_digit(ir);
189  else if (ir.peek() == '>')
190  hexpair += '0';
191  ir.skipSpace();
192  str += static_cast<char>(stoi(hexpair, nullptr, 16));
193  }
194  if (ir.peek() != '>')
195  throw PDFException("missing '>' at end of hexadecimal string");
196  ir.get(); // skip closing '>'
197  return str;
198 }
199 
200 
202 
203 /** Parses a PDF number from a string. The number is either integer or real.
204  * @param[in] str string to parse
205  * @param[out] nv variant holding the numeric value
206  * @return true if entire string has been parsed succesfully */
207 static bool parse_number (const string &str, NumberVariant &nv) {
208  if (str.empty())
209  return false;
210  try {
211  size_t dotpos = str.find('.');
212  if (dotpos == string::npos) { // not a real number?
213  size_t count;
214  nv = NumberVariant(stoi(str, &count, 10)); // then try to convert str to int
215  return count == str.length(); // successful only if all characters have been processed
216  }
217  string postdot = str.substr(dotpos+1);
218  // ensure signless integer after dot to exclude exponental notation
219  // which is not allowed in PDF real number constants
220  if (!postdot.empty() && isdigit(postdot[0])) {
221  size_t count;
222  stoi(postdot, &count, 10);
223  if (count != postdot.length())
224  return false;
225  }
226  size_t count;
227  nv = NumberVariant(stod(str, &count));
228  return count == str.length();
229  }
230  catch (invalid_argument &e) {
231  return false;
232  }
233 }
234 
235 
236 /** Parses a PDF array from the input stream and returns a corresponding object. */
238  ir.get(); // skip '['
239  vector<PDFObject> localObjects;
240  while (!ir.eof() && ir.peek() != ']')
241  parse(ir, localObjects, opHandler);
242  ir.skipSpace();
243  if (ir.peek() != ']')
244  throw PDFException("missing ']' at end of array");
245  ir.get();
246  PDFArray arr;
247  std::move(localObjects.begin(), localObjects.end(), back_inserter(arr));
248  return arr;
249 }
250 
251 
252 /** Parses a PDF dictionary from the input stream and returns a corresponding object.
253  * The function expects that the first opening angle bracket denoting the start of an
254  * dictionary has already been swallowed from the stream. */
256  ir.get(); // skip second "<"
257  vector<PDFObject> localObjects;
258  while (!ir.eof() && ir.peek() != '>')
259  parse(ir, localObjects, opHandler);
260  if (ir.getString(2) != ">>")
261  throw PDFException("missing '>>' at end of dictionary");
262  PDFDict dict;
263  for (auto it=localObjects.begin(); it != localObjects.end(); ++it) {
264  if (!it->get<PDFName>())
265  throw PDFException("name key expected in dictionary");
266  const PDFName &key = *it->get<PDFName>();
267  if (++it == localObjects.end())
268  throw PDFException(string("missing dictionary value for key '")+key.str+"'");
269  dict.emplace(key.str, std::move(*it));
270  }
271  return dict;
272 }
273 
274 
275 static PDFStream parse_stream (InputReader &ir, const char *delim) {
276  do
277  ir.skipUntil("endstream");
278  while (ir.peek() >= 0 && !strchr(delim, ir.peek())); // ensure delimiter after "endstream"
279  return PDFStream();
280 }
281 
282 
283 static PDFIndirectObject parse_indirect_object (InputReader &ir, const char *delim, vector<PDFObject> &objects) {
284  do
285  ir.skipUntil("endobj");
286  while (ir.peek() >= 0 && !strchr(delim, ir.peek())); // ensure delimiter after "endobj"
287  if (objects.size() >= 2) {
288  const int *genno = objects.back().get<int>();
289  objects.pop_back();
290  const int *objno = objects.back().get<int>();
291  objects.pop_back();
292  if (objno && genno)
293  return PDFIndirectObject(*objno, *genno);
294  }
295  throw PDFException("object and generation number expected before 'obj'");
296 }
297 
298 
299 static PDFObjectRef parse_object_ref (vector<PDFObject> &objects) {
300  if (objects.size() >= 2) {
301  const int *genno = objects.back().get<int>();
302  objects.pop_back();
303  const int *objno = objects.back().get<int>();
304  objects.pop_back();
305  if (objno && genno)
306  return PDFObjectRef(*objno, *genno);
307  }
308  throw PDFException("object and generation number expected before 'R'");
309 }
310 
311 
312 /** Replaces all occurences of "#XX" (XX are two hex digits) with the corresponding character. */
313 static string& subst_numeric_chars (string &str) {
314  for (size_t pos=str.find('#'); pos != string::npos; pos=str.find('#', pos+1)) {
315  if (pos > str.length()-3)
316  throw PDFException("sign character # must be followed by two hexadecimal digits");
317  if (isxdigit(str[pos+1]) && isxdigit(str[pos+2])) {
318  int c = stoi(str.substr(pos+1, 2), nullptr, 16);
319  if (c == 0)
320  throw PDFException("null character not permitted in name");
321  str.replace(pos, 3, 1, static_cast<char>(c));
322  }
323  else
324  throw PDFException("sign character # must be followed by two hexadecimal digits");
325  }
326  return str;
327 }
328 
329 
330 /** Parses a single PDF object from an InputReader object.
331  * @param[in,out] ir reader object to read the PDF data from
332  * @param[out] objects the parsed object is appended to this vector
333  * @param[in] opHandler handler used to treat PDF operators
334  * @throws PDFException on failure */
335 void PDFParser::parse (InputReader &ir, vector<PDFObject> &objects, const PDFOperatorHandler &opHandler) {
336  static const char *delim = "()<>[]{}/% \t\n\r\f";
337  ir.skipSpace();
338  if (ir.peek() < 0)
339  return;
340  switch (ir.peek()) {
341  case '(':
342  objects.emplace_back(parse_literal_string(ir)); break;
343  case '[':
344  objects.emplace_back(util::make_unique<PDFArray>(parseArray(ir, opHandler))); break;
345  case '<':
346  ir.get();
347  if (ir.peek() != '<')
348  objects.emplace_back(parse_hex_string(ir));
349  else
350  objects.emplace_back(util::make_unique<PDFDict>(parseDict(ir, opHandler)));
351  break;
352  case '/': {
353  ir.get();
354  string name = ir.getString(delim);
355  objects.emplace_back(PDFName(subst_numeric_chars(name)));
356  break;
357  }
358  default: {
359  string str = ir.getString(delim);
360  if (str.empty())
361  break;
362  if (str == "null")
363  objects.emplace_back(PDFNull());
364  else if (str == "true")
365  objects.emplace_back(true);
366  else if (str == "false")
367  objects.emplace_back(false);
368  else if (str == "stream")
369  objects.emplace_back(parse_stream(ir, delim));
370  else if (str == "obj")
371  objects.emplace_back(parse_indirect_object(ir, delim, objects));
372  else if (str == "R")
373  objects.emplace_back(parse_object_ref(objects));
374  else {
376  if (!parse_number(str, number))
377  opHandler(str, objects);
378  else {
379  if (mpark::get_if<int>(&number))
380  objects.emplace_back(mpark::get<int>(number));
381  else
382  objects.emplace_back(mpark::get<double>(number));
383  }
384  }
385  }
386  }
387 }
388 
389 //////////////////////////////////////////////////////////////////////////
390 
392  template <typename V>
393  double operator () (const V &val) {return 0;}
394 };
395 
396 template<> double ToDoubleVisitor::operator () (const int &val) {return static_cast<double>(val);}
397 template<> double ToDoubleVisitor::operator () (const double &val) {return val;}
398 template<> double ToDoubleVisitor::operator () (const string &val) {
399  try {
400  return stod(val);
401  }
402  catch (exception &e) {
403  return 0;
404  }
405 }
406 
407 
408 PDFObject::operator double () const {
409  return mpark::visit(ToDoubleVisitor(), _value);
410 }
411 
412 
413 PDFObject::operator std::string () const {
414  ostringstream oss;
415  oss << *this;
416  return oss.str();
417 }
418 
419 
420 static std::ostream& operator << (std::ostream &os, const PDFName &name) {return os << name.str;}
421 static ostream& operator << (ostream &os, const PDFNull&) {return os << "null";}
422 static ostream& operator << (ostream &os, const PDFStream&) {return os << "stream";}
423 static ostream& operator << (ostream &os, const PDFOperator &op) {return os << op.opname;}
424 
425 static ostream& operator << (ostream &os, const PDFIndirectObject &obj) {
426  return os << "obj(" << obj.objnum << ", " << obj.gennum << ')';
427 }
428 
429 
430 static ostream& operator << (ostream &os, const PDFObjectRef &ref) {
431  return os << "obj(" << ref.objnum << ", " << ref.gennum << ')';
432 }
433 
434 
435 static ostream& operator << (ostream &os, const unique_ptr<vector<PDFObject>> &val) {
436  os << '[';
437  for (auto it=val->begin(); it != val->end(); ++it) {
438  if (it != val->begin())
439  os << ", ";
440  it->write(os);
441  }
442  os << ']';
443  return os;
444 }
445 
446 
447 static ostream& operator << (ostream &os, const unique_ptr<Dictionary<string,PDFObject>> &val) {
448  os << "<<";
449  for (auto it=val->begin(); it != val->end(); ++it) {
450  if (it != val->begin())
451  os << ", ";
452  os << it->first << ':' << it->second;
453  }
454  os << ">>";
455  return os;
456 }
457 
458 
459 struct WriteVisitor {
460  explicit WriteVisitor (ostream &os) : _os(os) {}
461  template <typename T> void operator () (const T &val) {_os << val;}
462  ostream &_os;
463 };
464 
465 
466 void PDFObject::write (ostream &os) const {
468 }
static PDFIndirectObject parse_indirect_object(InputReader &ir, const char *delim, vector< PDFObject > &objects)
Definition: PDFParser.cpp:283
static pair< bool, int > parse_escape_seq(InputReader &ir)
Definition: PDFParser.cpp:117
static PDFStream parse_stream(InputReader &ir, const char *delim)
Definition: PDFParser.cpp:275
static bool parse_number(const string &str, NumberVariant &nv)
Definition: PDFParser.cpp:207
static string & subst_numeric_chars(string &str)
Definition: PDFParser.cpp:313
static string parse_literal_string(InputReader &ir)
Definition: PDFParser.cpp:143
bool isoctaldigit(int c)
Definition: PDFParser.cpp:112
static char get_hex_digit(InputReader &ir)
Definition: PDFParser.cpp:169
static PDFObjectRef parse_object_ref(vector< PDFObject > &objects)
Definition: PDFParser.cpp:299
mpark::variant< int, double > NumberVariant
Definition: PDFParser.cpp:201
static string parse_hex_string(InputReader &ir)
Definition: PDFParser.cpp:179
static void append_operator(const string &opname, vector< PDFObject > &objects)
Definition: PDFParser.cpp:75
std::vector< PDFObject > PDFArray
Definition: PDFParser.hpp:94
#define count(a)
Definition: aptex-macros.h:781
#define name
static integer open_parens
Definition: aptex.h:419
std::pair< typename Map::iterator, bool > emplace(const K &key, V &&value)
Definition: PDFParser.hpp:57
virtual std::string getString()
virtual bool skipUntil(const char *str)
virtual bool eof() const =0
virtual int get()=0
virtual void skipSpace()
Definition: InputReader.cpp:94
virtual int peek() const =0
Value _value
Definition: PDFParser.hpp:132
void write(std::ostream &os) const
Definition: PDFParser.cpp:466
std::function< void(const std::string &, std::vector< PDFObject > &)> PDFOperatorHandler
Definition: PDFParser.hpp:161
PDFDict parseDict(InputReader &ir, const PDFOperatorHandler &opHandler)
Definition: PDFParser.cpp:255
std::vector< PDFObject > parse(std::istream &is)
Definition: PDFParser.cpp:83
PDFArray parseArray(InputReader &ir, const PDFOperatorHandler &opHandler)
Definition: PDFParser.cpp:237
StringAccum & operator<<(StringAccum &sa, char c)
Append character c to StringAccum sa.
Definition: straccum.hh:518
const char * delim
Definition: dvistuff.c:159
#define T
Definition: fmt.h:20
struct move_struct move
#define c(n)
Definition: gpos-common.c:150
#define strchr
Definition: gsftopk.c:59
small capitals from c petite p scientific i
Definition: afcover.h:80
#define string
Definition: ctangleboot.c:111
#define isdigit(c)
Definition: snprintf.c:177
const int * pos
Definition: combiners.h:905
constexpr auto visit(Visitor &&visitor, Vs &&... vs) -> decltype((detail::all(lib::array< bool, sizeof...(Vs)>{{!vs.valueless_by_exception()...}}) ?(void) 0 :throw_bad_variant_access()), detail::visitation::variant::visit_value(lib::forward< Visitor >(visitor), lib::forward< Vs >(vs)...))
Definition: variant.hpp:2697
def ref(x)
Definition: pdf-org.py:104
STL namespace.
#define V
Definition: pgmcrater.c:68
integer nv
Definition: pmxab.c:88
char * opname
Definition: psbi.c:676
#define objects(p)
Definition: gc.c:219
Slot * iss
Definition: opcodes.h:276
#define str(s)
Definition: sh6.c:399
ShellFileEnvironment e
Definition: sh6.c:388
double operator()(const V &val)
Definition: PDFParser.cpp:393
ostream & _os
Definition: PDFParser.cpp:462
void operator()(const T &val)
Definition: PDFParser.cpp:461
WriteVisitor(ostream &os)
Definition: PDFParser.cpp:460
Definition: sh.h:1226
Definition: strexpr.c:21
int number
Definition: t1part.c:207
#define is
Definition: tex2xindy.c:759
#define key
Definition: tex2xindy.c:753
val
Definition: tex4ht.c:3227