lib/dnspython/dns/tokenizer.py

   1 # Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
   2 #
   3 # Permission to use, copy, modify, and distribute this software and its
   4 # documentation for any purpose with or without fee is hereby granted,
   5 # provided that the above copyright notice and this permission notice
   6 # appear in all copies.
   7 #
   8 # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
   9 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
  11 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  14 # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15
  16 """Tokenize DNS master file format"""
  17
  18 import cStringIO
  19 import sys
  20
  21 import dns.exception
  22 import dns.name
  23 import dns.ttl
  24
  25 _DELIMITERS = {
  26     ' ' : True,
  27     '\t' : True,
  28     '\n' : True,
  29     ';' : True,
  30     '(' : True,
  31     ')' : True,
  32     '"' : True }
  33
  34 _QUOTING_DELIMITERS = { '"' : True }
  35
  36 EOF = 0
  37 EOL = 1
  38 WHITESPACE = 2
  39 IDENTIFIER = 3
  40 QUOTED_STRING = 4
  41 COMMENT = 5
  42 DELIMITER = 6
  43
  44 class UngetBufferFull(dns.exception.DNSException):
  45     """Raised when an attempt is made to unget a token when the unget
  46     buffer is full."""
  47     pass
  48
  49 class Token(object):
  50     """A DNS master file format token.
  51
  52     @ivar ttype: The token type
  53     @type ttype: int
  54     @ivar value: The token value
  55     @type value: string
  56     @ivar has_escape: Does the token value contain escapes?
  57     @type has_escape: bool
  58     """
  59
  60     def __init__(self, ttype, value='', has_escape=False):
  61         """Initialize a token instance.
  62
  63         @param ttype: The token type
  64         @type ttype: int
  65         @ivar value: The token value
  66         @type value: string
  67         @ivar has_escape: Does the token value contain escapes?
  68         @type has_escape: bool
  69         """
  70         self.ttype = ttype
  71         self.value = value
  72         self.has_escape = has_escape
  73
  74     def is_eof(self):
  75         return self.ttype == EOF
  76
  77     def is_eol(self):
  78         return self.ttype == EOL
  79
  80     def is_whitespace(self):
  81         return self.ttype == WHITESPACE
  82
  83     def is_identifier(self):
  84         return self.ttype == IDENTIFIER
  85
  86     def is_quoted_string(self):
  87         return self.ttype == QUOTED_STRING
  88
  89     def is_comment(self):
  90         return self.ttype == COMMENT
  91
  92     def is_delimiter(self):
  93         return self.ttype == DELIMITER
  94
  95     def is_eol_or_eof(self):
  96         return (self.ttype == EOL or self.ttype == EOF)
  97
  98     def __eq__(self, other):
  99         if not isinstance(other, Token):
 100             return False
 101         return (self.ttype == other.ttype and
 102                 self.value == other.value)
 103
 104     def __ne__(self, other):
 105         if not isinstance(other, Token):
 106             return True
 107         return (self.ttype != other.ttype or
 108                 self.value != other.value)
 109
 110     def __str__(self):
 111         return '%d "%s"' % (self.ttype, self.value)
 112
 113     def unescape(self):
 114         if not self.has_escape:
 115             return self
 116         unescaped = ''
 117         l = len(self.value)
 118         i = 0
 119         while i < l:
 120             c = self.value[i]
 121             i += 1
 122             if c == '\\':
 123                 if i >= l:
 124                     raise dns.exception.UnexpectedEnd
 125                 c = self.value[i]
 126                 i += 1
 127                 if c.isdigit():
 128                     if i >= l:
 129                         raise dns.exception.UnexpectedEnd
 130                     c2 = self.value[i]
 131                     i += 1
 132                     if i >= l:
 133                         raise dns.exception.UnexpectedEnd
 134                     c3 = self.value[i]
 135                     i += 1
 136                     if not (c2.isdigit() and c3.isdigit()):
 137                         raise dns.exception.SyntaxError
 138                     c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
 139             unescaped += c
 140         return Token(self.ttype, unescaped)
 141
 142     # compatibility for old-style tuple tokens
 143
 144     def __len__(self):
 145         return 2
 146
 147     def __iter__(self):
 148         return iter((self.ttype, self.value))
 149
 150     def __getitem__(self, i):
 151         if i == 0:
 152             return self.ttype
 153         elif i == 1:
 154             return self.value
 155         else:
 156             raise IndexError
 157
 158 class Tokenizer(object):
 159     """A DNS master file format tokenizer.
 160
 161     A token is a (type, value) tuple, where I{type} is an int, and
 162     I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
 163     IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.
 164
 165     @ivar file: The file to tokenize
 166     @type file: file
 167     @ivar ungotten_char: The most recently ungotten character, or None.
 168     @type ungotten_char: string
 169     @ivar ungotten_token: The most recently ungotten token, or None.
 170     @type ungotten_token: (int, string) token tuple
 171     @ivar multiline: The current multiline level.  This value is increased
 172     by one every time a '(' delimiter is read, and decreased by one every time
 173     a ')' delimiter is read.
 174     @type multiline: int
 175     @ivar quoting: This variable is true if the tokenizer is currently
 176     reading a quoted string.
 177     @type quoting: bool
 178     @ivar eof: This variable is true if the tokenizer has encountered EOF.
 179     @type eof: bool
 180     @ivar delimiters: The current delimiter dictionary.
 181     @type delimiters: dict
 182     @ivar line_number: The current line number
 183     @type line_number: int
 184     @ivar filename: A filename that will be returned by the L{where} method.
 185     @type filename: string
 186     """
 187
 188     def __init__(self, f=sys.stdin, filename=None):
 189         """Initialize a tokenizer instance.
 190
 191         @param f: The file to tokenize.  The default is sys.stdin.
 192         This parameter may also be a string, in which case the tokenizer
 193         will take its input from the contents of the string.
 194         @type f: file or string
 195         @param filename: the name of the filename that the L{where} method
 196         will return.
 197         @type filename: string
 198         """
 199
 200         if isinstance(f, str):
 201             f = cStringIO.StringIO(f)
 202             if filename is None:
 203                 filename = '<string>'
 204         else:
 205             if filename is None:
 206                 if f is sys.stdin:
 207                     filename = '<stdin>'
 208                 else:
 209                     filename = '<file>'
 210         self.file = f
 211         self.ungotten_char = None
 212         self.ungotten_token = None
 213         self.multiline = 0
 214         self.quoting = False
 215         self.eof = False
 216         self.delimiters = _DELIMITERS
 217         self.line_number = 1
 218         self.filename = filename
 219
 220     def _get_char(self):
 221         """Read a character from input.
 222         @rtype: string
 223         """
 224
 225         if self.ungotten_char is None:
 226             if self.eof:
 227                 c = ''
 228             else:
 229                 c = self.file.read(1)
 230                 if c == '':
 231                     self.eof = True
 232                 elif c == '\n':
 233                     self.line_number += 1
 234         else:
 235             c = self.ungotten_char
 236             self.ungotten_char = None
 237         return c
 238
 239     def where(self):
 240         """Return the current location in the input.
 241
 242         @rtype: (string, int) tuple.  The first item is the filename of
 243         the input, the second is the current line number.
 244         """
 245
 246         return (self.filename, self.line_number)
 247
 248     def _unget_char(self, c):
 249         """Unget a character.
 250
 251         The unget buffer for characters is only one character large; it is
 252         an error to try to unget a character when the unget buffer is not
 253         empty.
 254
 255         @param c: the character to unget
 256         @type c: string
 257         @raises UngetBufferFull: there is already an ungotten char
 258         """
 259
 260         if not self.ungotten_char is None:
 261             raise UngetBufferFull
 262         self.ungotten_char = c
 263
 264     def skip_whitespace(self):
 265         """Consume input until a non-whitespace character is encountered.
 266
 267         The non-whitespace character is then ungotten, and the number of
 268         whitespace characters consumed is returned.
 269
 270         If the tokenizer is in multiline mode, then newlines are whitespace.
 271
 272         @rtype: int
 273         """
 274
 275         skipped = 0
 276         while True:
 277             c = self._get_char()
 278             if c != ' ' and c != '\t':
 279                 if (c != '\n') or not self.multiline:
 280                     self._unget_char(c)
 281                     return skipped
 282             skipped += 1
 283
 284     def get(self, want_leading = False, want_comment = False):
 285         """Get the next token.
 286
 287         @param want_leading: If True, return a WHITESPACE token if the
 288         first character read is whitespace.  The default is False.
 289         @type want_leading: bool
 290         @param want_comment: If True, return a COMMENT token if the
 291         first token read is a comment.  The default is False.
 292         @type want_comment: bool
 293         @rtype: Token object
 294         @raises dns.exception.UnexpectedEnd: input ended prematurely
 295         @raises dns.exception.SyntaxError: input was badly formed
 296         """
 297
 298         if not self.ungotten_token is None:
 299             token = self.ungotten_token
 300             self.ungotten_token = None
 301             if token.is_whitespace():
 302                 if want_leading:
 303                     return token
 304             elif token.is_comment():
 305                 if want_comment:
 306                     return token
 307             else:
 308                 return token
 309         skipped = self.skip_whitespace()
 310         if want_leading and skipped > 0:
 311             return Token(WHITESPACE, ' ')
 312         token = ''
 313         ttype = IDENTIFIER
 314         has_escape = False
 315         while True:
 316             c = self._get_char()
 317             if c == '' or c in self.delimiters:
 318                 if c == '' and self.quoting:
 319                     raise dns.exception.UnexpectedEnd
 320                 if token == '' and ttype != QUOTED_STRING:
 321                     if c == '(':
 322                         self.multiline += 1
 323                         self.skip_whitespace()
 324                         continue
 325                     elif c == ')':
 326                         if not self.multiline > 0:
 327                             raise dns.exception.SyntaxError
 328                         self.multiline -= 1
 329                         self.skip_whitespace()
 330                         continue
 331                     elif c == '"':
 332                         if not self.quoting:
 333                             self.quoting = True
 334                             self.delimiters = _QUOTING_DELIMITERS
 335                             ttype = QUOTED_STRING
 336                             continue
 337                         else:
 338                             self.quoting = False
 339                             self.delimiters = _DELIMITERS
 340                             self.skip_whitespace()
 341                             continue
 342                     elif c == '\n':
 343                         return Token(EOL, '\n')
 344                     elif c == ';':
 345                         while 1:
 346                             c = self._get_char()
 347                             if c == '\n' or c == '':
 348                                 break
 349                             token += c
 350                         if want_comment:
 351                             self._unget_char(c)
 352                             return Token(COMMENT, token)
 353                         elif c == '':
 354                             if self.multiline:
 355                                 raise dns.exception.SyntaxError('unbalanced parentheses')
 356                             return Token(EOF)
 357                         elif self.multiline:
 358                             self.skip_whitespace()
 359                             token = ''
 360                             continue
 361                         else:
 362                             return Token(EOL, '\n')
 363                     else:
 364                         # This code exists in case we ever want a
 365                         # delimiter to be returned.  It never produces
 366                         # a token currently.
 367                         token = c
 368                         ttype = DELIMITER
 369                 else:
 370                     self._unget_char(c)
 371                 break
 372             elif self.quoting:
 373                 if c == '\\':
 374                     c = self._get_char()
 375                     if c == '':
 376                         raise dns.exception.UnexpectedEnd
 377                     if c.isdigit():
 378                         c2 = self._get_char()
 379                         if c2 == '':
 380                             raise dns.exception.UnexpectedEnd
 381                         c3 = self._get_char()
 382                         if c == '':
 383                             raise dns.exception.UnexpectedEnd
 384                         if not (c2.isdigit() and c3.isdigit()):
 385                             raise dns.exception.SyntaxError
 386                         c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
 387                 elif c == '\n':
 388                     raise dns.exception.SyntaxError('newline in quoted string')
 389             elif c == '\\':
 390                 #
 391                 # It's an escape.  Put it and the next character into
 392                 # the token; it will be checked later for goodness.
 393                 #
 394                 token += c
 395                 has_escape = True
 396                 c = self._get_char()
 397                 if c == '' or c == '\n':
 398                     raise dns.exception.UnexpectedEnd
 399             token += c
 400         if token == '' and ttype != QUOTED_STRING:
 401             if self.multiline:
 402                 raise dns.exception.SyntaxError('unbalanced parentheses')
 403             ttype = EOF
 404         return Token(ttype, token, has_escape)
 405
 406     def unget(self, token):
 407         """Unget a token.
 408
 409         The unget buffer for tokens is only one token large; it is
 410         an error to try to unget a token when the unget buffer is not
 411         empty.
 412
 413         @param token: the token to unget
 414         @type token: Token object
 415         @raises UngetBufferFull: there is already an ungotten token
 416         """
 417
 418         if not self.ungotten_token is None:
 419             raise UngetBufferFull
 420         self.ungotten_token = token
 421
 422     def next(self):
 423         """Return the next item in an iteration.
 424         @rtype: (int, string)
 425         """
 426
 427         token = self.get()
 428         if token.is_eof():
 429             raise StopIteration
 430         return token
 431
 432     def __iter__(self):
 433         return self
 434
 435     # Helpers
 436
 437     def get_int(self):
 438         """Read the next token and interpret it as an integer.
 439
 440         @raises dns.exception.SyntaxError:
 441         @rtype: int
 442         """
 443
 444         token = self.get().unescape()
 445         if not token.is_identifier():
 446             raise dns.exception.SyntaxError('expecting an identifier')
 447         if not token.value.isdigit():
 448             raise dns.exception.SyntaxError('expecting an integer')
 449         return int(token.value)
 450
 451     def get_uint8(self):
 452         """Read the next token and interpret it as an 8-bit unsigned
 453         integer.
 454
 455         @raises dns.exception.SyntaxError:
 456         @rtype: int
 457         """
 458
 459         value = self.get_int()
 460         if value < 0 or value > 255:
 461             raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
 462         return value
 463
 464     def get_uint16(self):
 465         """Read the next token and interpret it as a 16-bit unsigned
 466         integer.
 467
 468         @raises dns.exception.SyntaxError:
 469         @rtype: int
 470         """
 471
 472         value = self.get_int()
 473         if value < 0 or value > 65535:
 474             raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
 475         return value
 476
 477     def get_uint32(self):
 478         """Read the next token and interpret it as a 32-bit unsigned
 479         integer.
 480
 481         @raises dns.exception.SyntaxError:
 482         @rtype: int
 483         """
 484
 485         token = self.get().unescape()
 486         if not token.is_identifier():
 487             raise dns.exception.SyntaxError('expecting an identifier')
 488         if not token.value.isdigit():
 489             raise dns.exception.SyntaxError('expecting an integer')
 490         value = long(token.value)
 491         if value < 0 or value > 4294967296L:
 492             raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
 493         return value
 494
 495     def get_string(self, origin=None):
 496         """Read the next token and interpret it as a string.
 497
 498         @raises dns.exception.SyntaxError:
 499         @rtype: string
 500         """
 501
 502         token = self.get().unescape()
 503         if not (token.is_identifier() or token.is_quoted_string()):
 504             raise dns.exception.SyntaxError('expecting a string')
 505         return token.value
 506
 507     def get_identifier(self, origin=None):
 508         """Read the next token and raise an exception if it is not an identifier.
 509
 510         @raises dns.exception.SyntaxError:
 511         @rtype: string
 512         """
 513
 514         token = self.get().unescape()
 515         if not token.is_identifier():
 516             raise dns.exception.SyntaxError('expecting an identifier')
 517         return token.value
 518
 519     def get_name(self, origin=None):
 520         """Read the next token and interpret it as a DNS name.
 521
 522         @raises dns.exception.SyntaxError:
 523         @rtype: dns.name.Name object"""
 524
 525         token = self.get()
 526         if not token.is_identifier():
 527             raise dns.exception.SyntaxError('expecting an identifier')
 528         return dns.name.from_text(token.value, origin)
 529
 530     def get_eol(self):
 531         """Read the next token and raise an exception if it isn't EOL or
 532         EOF.
 533
 534         @raises dns.exception.SyntaxError:
 535         @rtype: string
 536         """
 537
 538         token = self.get()
 539         if not token.is_eol_or_eof():
 540             raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
 541         return token.value
 542
 543     def get_ttl(self):
 544         token = self.get().unescape()
 545         if not token.is_identifier():
 546             raise dns.exception.SyntaxError('expecting an identifier')
 547         return dns.ttl.from_text(token.value)