gitignore: add some hidden files
[Samba.git] / third_party / dnspython / dns / tokenizer.py
blob4bff7b6c72d552dc5441d6420221c54fd03b2648
1 # Copyright (C) 2003-2007, 2009-2011 Nominum, Inc.
3 # Permission to use, copy, modify, and distribute this software and its
4 # documentation for any purpose with or without fee is hereby granted,
5 # provided that the above copyright notice and this permission notice
6 # appear in all copies.
8 # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
9 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
11 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
14 # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 """Tokenize DNS master file format"""
18 import cStringIO
19 import sys
21 import dns.exception
22 import dns.name
23 import dns.ttl
# Characters that terminate an identifier outside of a quoted string.
# Kept as a dict (character -> True) so membership tests stay O(1).
_DELIMITERS = dict.fromkeys(' \t\n;()"', True)

# Inside a quoted string only the closing double quote is a delimiter.
_QUOTING_DELIMITERS = dict.fromkeys('"', True)

# Token type codes, numbered consecutively from zero.
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)
class UngetBufferFull(dns.exception.DNSException):
    """An unget was attempted while the one-slot unget buffer was
    already occupied."""
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an EOF token?"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Equality looks at type and value only; has_escape is ignored.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token with backslash escapes in the value resolved.

        A backslash followed by a digit must be followed by two more
        digits; the three digits are read as a decimal character code.
        A backslash followed by any other character yields that
        character.  If the token has no escapes, the token itself is
        returned; otherwise a new token (with has_escape False) is
        returned.

        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape contains a
        non-digit, or its value is greater than 255
        @rtype: Token object
        """
        if not self.has_escape:
            return self
        pieces = []
        text = self.value
        l = len(text)
        i = 0
        while i < l:
            c = text[i]
            i += 1
            if c == '\\':
                if i >= l:
                    raise dns.exception.UnexpectedEnd
                c = text[i]
                i += 1
                if c.isdigit():
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c2 = text[i]
                    i += 1
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c3 = text[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # A decimal escape denotes a single octet.  Reject
                    # out-of-range values here instead of letting chr()
                    # fail with a bare ValueError.
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token is a (type, value) tuple, where I{type} is an int, and
    I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
    IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: (int, string) token tuple
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        Returns the ungotten character if there is one; otherwise reads
        one character from the file.  An empty string is returned at (and
        after) end of input.  The line number is advanced when a newline
        is read from the file, not when it is re-read from the unget
        buffer.

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading = False, want_comment = False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed back
            # if the caller asked for that kind; otherwise it is dropped
            # and tokenizing continues below.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: switch to the quoting
                            # delimiter set and start collecting.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote (ungotten by the previous
                            # call): restore normal delimiters.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of line or end of input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the current token; leave it for
                    # the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Check the character just read (c3), not c, so
                        # truncated input is reported as UnexpectedEnd.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # A decimal escape denotes a single octet; reject
                        # out-of-range values rather than letting chr()
                        # fail with a bare ValueError.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        @raises StopIteration: the next token is EOF
        @rtype: (int, string)
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError: the token is not an identifier
        consisting only of digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..4294967295
        @rtype: int
        """

        # int() promotes to a long automatically when needed, so the
        # shared get_int() helper is sufficient here.
        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: accepted but not used by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: accepted but not used by this method
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token text is not unescaped here; it is passed, together with
        I{origin}, to dns.name.from_text().

        @param origin: passed through to dns.name.from_text()
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and convert it with dns.ttl.from_text().

        @raises dns.exception.SyntaxError: the token is not an identifier
        @return: whatever dns.ttl.from_text() returns for the token text
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)