dnspython  1.16.0
About: dnspython is a DNS toolkit (for Python 2.x and 3.x) that supports almost all record types.
  Fossies Dox: dnspython-1.16.0.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)

tokenizer.py
Go to the documentation of this file.
1 # Copyright (C) Dnspython Contributors, see LICENSE for text of ISC license
2 
3 # Copyright (C) 2003-2017 Nominum, Inc.
4 #
5 # Permission to use, copy, modify, and distribute this software and its
6 # documentation for any purpose with or without fee is hereby granted,
7 # provided that the above copyright notice and this permission notice
8 # appear in all copies.
9 #
10 # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
11 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
13 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
16 # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 
18 """Tokenize DNS master file format"""
19 
20 from io import StringIO
21 import sys
22 
23 import dns.exception
24 import dns.name
25 import dns.ttl
26 from ._compat import long, text_type, binary_type
27 
28 _DELIMITERS = {
29  ' ': True,
30  '\t': True,
31  '\n': True,
32  ';': True,
33  '(': True,
34  ')': True,
35  '"': True}
36 
37 _QUOTING_DELIMITERS = {'"': True}
38 
39 EOF = 0
40 EOL = 1
41 WHITESPACE = 2
42 IDENTIFIER = 3
43 QUOTED_STRING = 4
44 COMMENT = 5
45 DELIMITER = 6
46 
class UngetBufferFull(dns.exception.DNSException):
    """An attempt was made to unget a token when the unget buffer was full."""
51 
class Token(object):
    """A DNS master file format token.

    ttype: The token type
    value: The token value
    has_escape: Does the token value contain escapes?
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance."""

        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        return self.ttype == EOF

    def is_eol(self):
        return self.ttype == EOL

    def is_whitespace(self):
        return self.ttype == WHITESPACE

    def is_identifier(self):
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        return self.ttype == COMMENT

    def is_delimiter(self):
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        return self.ttype == EOL or self.ttype == EOF

    def __eq__(self, other):
        if not isinstance(other, Token):
            return False
        # has_escape is deliberately ignored; two tokens with the same
        # type and text compare equal.
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a new Token with master-file escapes decoded.

        ``\\c`` becomes the literal character c, and ``\\DDD`` (three
        decimal digits) becomes chr(DDD).  If has_escape is false,
        self is returned unchanged.

        Raises dns.exception.UnexpectedEnd if the value ends in the
        middle of an escape, and dns.exception.SyntaxError if a
        ``\\DDD`` escape is not all digits.
        """

        if not self.has_escape:
            return self
        unescaped = ''
        l = len(self.value)
        i = 0
        while i < l:
            c = self.value[i]
            i += 1
            if c == '\\':
                if i >= l:
                    raise dns.exception.UnexpectedEnd
                c = self.value[i]
                i += 1
                if c.isdigit():
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c2 = self.value[i]
                    i += 1
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c3 = self.value[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
            unescaped += c
        return Token(self.ttype, unescaped)

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
150 
151 
class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token object is basically a (type, value) tuple.  The valid
    types are EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING,
    COMMENT, and DELIMITER.

    file: The file to tokenize

    ungotten_char: The most recently ungotten character, or None.

    ungotten_token: The most recently ungotten token, or None.

    multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.

    quoting: This variable is true if the tokenizer is currently
    reading a quoted string.

    eof: This variable is true if the tokenizer has encountered EOF.

    delimiters: The current delimiter dictionary.

    line_number: The current line number

    filename: A filename that will be returned by the where() method.
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.

        filename: the name of the filename that the where() method
        will return.
        """

        if isinstance(f, text_type):
            f = StringIO(f)
            if filename is None:
                filename = '<string>'
        elif isinstance(f, binary_type):
            # Bytes input is decoded with the default codec before
            # tokenizing.
            f = StringIO(f.decode())
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        Returns '' (not None) at EOF, mirroring file.read(1).
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        Returns a (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        c: the character to unget
        raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        Returns the number of characters skipped.
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.

        want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.

        Raises dns.exception.UnexpectedEnd: input ended prematurely

        Raises dns.exception.SyntaxError: input was badly formed

        Returns a Token.
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            if token.is_whitespace():
                if want_leading:
                    return token
                # otherwise fall through and tokenize afresh
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    # EOF inside a quoted string
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError(
                                'unbalanced parentheses')
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # empty quoted string; the '' token with
                            # ttype QUOTED_STRING is returned below
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # comment runs to end of line (or EOF)
                        while 1:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError(
                                    'unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    elif c == '':
                        if self.multiline:
                            raise dns.exception.SyntaxError(
                                'unbalanced parentheses')
                        return Token(EOF)
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    # decode \c and \DDD escapes inside quoted strings
                    # immediately
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # NOTE(fix): upstream 1.16 tested ``c == ''``
                        # here, which can never be true (c is a digit),
                        # so a string ending in '\\DD' was mis-handled
                        # instead of raising.  Test the character just
                        # read.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        token: the token to unget

        Raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Returns a Token.
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    __next__ = next

    def __iter__(self):
        return self

    # Helpers

    def get_int(self, base=10):
        """Read the next token and interpret it as an integer.

        Raises dns.exception.SyntaxError if not an integer.

        Returns an int.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        # NOTE(review): isdigit() rejects hex letters, so non-decimal
        # bases only work for values whose digits are 0-9 — presumably
        # intentional (only base 8 is used by callers); confirm before
        # relying on other bases.
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value, base)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        Raises dns.exception.SyntaxError if not an 8-bit unsigned integer.

        Returns an int.
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self, base=10):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        Raises dns.exception.SyntaxError if not a 16-bit unsigned integer.

        Returns an int.
        """

        value = self.get_int(base=base)
        if value < 0 or value > 65535:
            if base == 8:
                raise dns.exception.SyntaxError(
                    '%o is not an octal unsigned 16-bit integer' % value)
            else:
                raise dns.exception.SyntaxError(
                    '%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        Raises dns.exception.SyntaxError if not a 32-bit unsigned integer.

        Returns an int.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        value = long(token.value)
        # NOTE(fix): the upper bound was 4294967296 (2**32), which
        # wrongly accepted a value one past the uint32 maximum.
        if value < 0 or value > long(4294967295):
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        Raises dns.exception.SyntaxError if not a string.

        Returns a string.
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token, which should be an identifier.

        Raises dns.exception.SyntaxError if not an identifier.

        Returns a string.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        Raises dns.exception.SyntaxError if not a name.

        Returns a dns.name.Name.
        """

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        Returns a string.
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError(
                'expected EOL or EOF, got %d "%s"' % (token.ttype,
                                                      token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a DNS TTL.

        Raises dns.exception.SyntaxError or dns.ttl.BadTTL if not an
        identifier or badly formed.

        Returns an int.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)
dns.tokenizer.Tokenizer.get_uint32
def get_uint32(self)
Definition: tokenizer.py:486
dns.tokenizer.Tokenizer.get
def get(self, want_leading=False, want_comment=False)
Definition: tokenizer.py:278
dns.tokenizer.Tokenizer._get_char
def _get_char(self)
Definition: tokenizer.py:216
dns.tokenizer.Token.unescape
def unescape(self)
Definition: tokenizer.py:106
dns.exception.SyntaxError
Definition: exception.py:113
dns.tokenizer.Token.has_escape
has_escape
Definition: tokenizer.py:65
dns.tokenizer.Tokenizer.get_string
def get_string(self, origin=None)
Definition: tokenizer.py:506
dns.tokenizer.Tokenizer.ungotten_token
ungotten_token
Definition: tokenizer.py:208
dns.tokenizer.Tokenizer.multiline
multiline
Definition: tokenizer.py:209
dns.tokenizer.Tokenizer.filename
filename
Definition: tokenizer.py:214
dns.name.from_text
def from_text(text, origin=root, idna_codec=None)
Definition: name.py:873
dns.tokenizer.Token.__len__
def __len__(self)
Definition: tokenizer.py:137
dns.tokenizer.Token
Definition: tokenizer.py:52
dns.tokenizer.Token.is_identifier
def is_identifier(self)
Definition: tokenizer.py:76
dns.tokenizer.Token.is_eol_or_eof
def is_eol_or_eof(self)
Definition: tokenizer.py:88
dns.exception.DNSException
Definition: exception.py:24
dns.exception.UnexpectedEnd
Definition: exception.py:117
dns.tokenizer.Token.__init__
def __init__(self, ttype, value='', has_escape=False)
Definition: tokenizer.py:60
dns.tokenizer.Tokenizer.where
def where(self)
Definition: tokenizer.py:234
dns.tokenizer.Token.is_quoted_string
def is_quoted_string(self)
Definition: tokenizer.py:79
dns.tokenizer.Token.ttype
ttype
Definition: tokenizer.py:63
dns.ttl.from_text
def from_text(text)
Definition: ttl.py:28
dns.tokenizer.Token.__ne__
def __ne__(self, other)
Definition: tokenizer.py:97
dns.tokenizer.Tokenizer.skip_whitespace
def skip_whitespace(self)
Definition: tokenizer.py:258
dns.tokenizer.Token.value
value
Definition: tokenizer.py:64
dns.tokenizer.Tokenizer.unget
def unget(self, token)
Definition: tokenizer.py:403
dns.tokenizer.Tokenizer.__iter__
def __iter__(self)
Definition: tokenizer.py:432
dns.tokenizer.Token.is_comment
def is_comment(self)
Definition: tokenizer.py:82
dns.tokenizer.Tokenizer.get_identifier
def get_identifier(self, origin=None)
Definition: tokenizer.py:519
dns.tokenizer.Tokenizer.line_number
line_number
Definition: tokenizer.py:213
dns.tokenizer.UngetBufferFull
Definition: tokenizer.py:48
dns.tokenizer.Tokenizer.get_int
def get_int(self, base=10)
Definition: tokenizer.py:437
dns.tokenizer.Token.__getitem__
def __getitem__(self, i)
Definition: tokenizer.py:143
dns.tokenizer.Token.__eq__
def __eq__(self, other)
Definition: tokenizer.py:91
dns.tokenizer.Token.is_delimiter
def is_delimiter(self)
Definition: tokenizer.py:85
dns.tokenizer.Token.is_eol
def is_eol(self)
Definition: tokenizer.py:70
dns.tokenizer.Tokenizer
Definition: tokenizer.py:152
dns.tokenizer.Tokenizer.get_uint8
def get_uint8(self)
Definition: tokenizer.py:452
dns.name
Definition: name.py:1
dns.tokenizer.Tokenizer.file
file
Definition: tokenizer.py:206
dns.tokenizer.Tokenizer.get_eol
def get_eol(self)
Definition: tokenizer.py:545
dns.tokenizer.Token.is_eof
def is_eof(self)
Definition: tokenizer.py:67
dns.tokenizer.Tokenizer.get_uint16
def get_uint16(self, base=10)
Definition: tokenizer.py:467
dns.tokenizer.Token.is_whitespace
def is_whitespace(self)
Definition: tokenizer.py:73
dns.tokenizer.Tokenizer.get_name
def get_name(self, origin=None)
Definition: tokenizer.py:532
dns.tokenizer.Tokenizer.next
def next(self)
Definition: tokenizer.py:419
dns.tokenizer.Token.__iter__
def __iter__(self)
Definition: tokenizer.py:140
dns.tokenizer.Tokenizer.delimiters
delimiters
Definition: tokenizer.py:212
dns.tokenizer.Tokenizer.eof
eof
Definition: tokenizer.py:211
dns.ttl
Definition: ttl.py:1
dns.tokenizer.Tokenizer.get_ttl
def get_ttl(self)
Definition: tokenizer.py:559
dns.tokenizer.Tokenizer.ungotten_char
ungotten_char
Definition: tokenizer.py:207
dns.tokenizer.Tokenizer._unget_char
def _unget_char(self, c)
Definition: tokenizer.py:243
dns.tokenizer.Tokenizer.__init__
def __init__(self, f=sys.stdin, filename=None)
Definition: tokenizer.py:181
dns.tokenizer.Token.__str__
def __str__(self)
Definition: tokenizer.py:103
dns._compat.long
long
Definition: _compat.py:10
dns.exception
Definition: exception.py:1
dns.tokenizer.Tokenizer.quoting
quoting
Definition: tokenizer.py:210