Package dns :: Module tokenizer
[hide private]
[frames] | no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) 2003-2007, 2009-2011 Nominum, Inc. 
  2  # 
  3  # Permission to use, copy, modify, and distribute this software and its 
  4  # documentation for any purpose with or without fee is hereby granted, 
  5  # provided that the above copyright notice and this permission notice 
  6  # appear in all copies. 
  7  # 
  8  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
  9  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 10  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 11  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 12  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 13  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 14  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 15   
 16  """Tokenize DNS master file format""" 
 17   
 18  import cStringIO 
 19  import sys 
 20   
 21  import dns.exception 
 22  import dns.name 
 23  import dns.ttl 
 24   
 25  _DELIMITERS = { 
 26      ' ' : True, 
 27      '\t' : True, 
 28      '\n' : True, 
 29      ';' : True, 
 30      '(' : True, 
 31      ')' : True, 
 32      '"' : True } 
 33   
 34  _QUOTING_DELIMITERS = { '"' : True } 
 35   
 36  EOF = 0 
 37  EOL = 1 
 38  WHITESPACE = 2 
 39  IDENTIFIER = 3 
 40  QUOTED_STRING = 4 
 41  COMMENT = 5 
 42  DELIMITER = 6 
 43   
# Raised by Tokenizer._unget_char() and Tokenizer.unget(), each of which keeps
# a buffer that holds only a single item.
class UngetBufferFull(dns.exception.DNSException):
    """An attempt was made to unget a token when the unget buffer was full."""
46
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    # Type predicates: each compares ttype with one module-level type code.

    def is_eof(self):
        """Is this an EOF token?
        @rtype: bool
        """
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?
        @rtype: bool
        """
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?
        @rtype: bool
        """
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?
        @rtype: bool
        """
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?
        @rtype: bool
        """
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?
        @rtype: bool
        """
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?
        @rtype: bool
        """
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?
        @rtype: bool
        """
        return self.ttype in (EOL, EOF)

    def __eq__(self, other):
        """Tokens are equal when both type and value match; has_escape is
        not part of the comparison.  Non-Token objects never compare equal.
        """
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        """The exact negation of L{__eq__}."""
        return not self.__eq__(other)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token with backslash escapes in the value resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that code; a backslash followed by any other
        character is replaced by that character.  If has_escape is false
        the token itself is returned unchanged; otherwise a new token of
        the same type (with has_escape false) is returned.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape is not three
        digits long, or encodes a value greater than 255
        """
        if not self.has_escape:
            return self
        pieces = []
        text = self.value
        end = len(text)
        pos = 0
        while pos < end:
            c = text[pos]
            pos += 1
            if c == '\\':
                if pos >= end:
                    raise dns.exception.UnexpectedEnd
                c = text[pos]
                pos += 1
                if c.isdigit():
                    # Decimal escape: exactly two more digits must follow.
                    if pos >= end:
                        raise dns.exception.UnexpectedEnd
                    c2 = text[pos]
                    pos += 1
                    if pos >= end:
                        raise dns.exception.UnexpectedEnd
                    c3 = text[pos]
                    pos += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # Reject values that do not fit in one octet instead of
                    # letting chr() fail with a bare ValueError.
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
155
156 -class Tokenizer(object):
157 """A DNS master file format tokenizer. 158 159 A token is a (type, value) tuple, where I{type} is an int, and 160 I{value} is a string. The valid types are EOF, EOL, WHITESPACE, 161 IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER. 162 163 @ivar file: The file to tokenize 164 @type file: file 165 @ivar ungotten_char: The most recently ungotten character, or None. 166 @type ungotten_char: string 167 @ivar ungotten_token: The most recently ungotten token, or None. 168 @type ungotten_token: (int, string) token tuple 169 @ivar multiline: The current multiline level. This value is increased 170 by one every time a '(' delimiter is read, and decreased by one every time 171 a ')' delimiter is read. 172 @type multiline: int 173 @ivar quoting: This variable is true if the tokenizer is currently 174 reading a quoted string. 175 @type quoting: bool 176 @ivar eof: This variable is true if the tokenizer has encountered EOF. 177 @type eof: bool 178 @ivar delimiters: The current delimiter dictionary. 179 @type delimiters: dict 180 @ivar line_number: The current line number 181 @type line_number: int 182 @ivar filename: A filename that will be returned by the L{where} method. 183 @type filename: string 184 """ 185
186 - def __init__(self, f=sys.stdin, filename=None):
187 """Initialize a tokenizer instance. 188 189 @param f: The file to tokenize. The default is sys.stdin. 190 This parameter may also be a string, in which case the tokenizer 191 will take its input from the contents of the string. 192 @type f: file or string 193 @param filename: the name of the filename that the L{where} method 194 will return. 195 @type filename: string 196 """ 197 198 if isinstance(f, (str, unicode)): 199 f = cStringIO.StringIO(f) 200 if filename is None: 201 filename = '<string>' 202 else: 203 if filename is None: 204 if f is sys.stdin: 205 filename = '<stdin>' 206 else: 207 filename = '<file>' 208 self.file = f 209 self.ungotten_char = None 210 self.ungotten_token = None 211 self.multiline = 0 212 self.quoting = False 213 self.eof = False 214 self.delimiters = _DELIMITERS 215 self.line_number = 1 216 self.filename = filename
217
218 - def _get_char(self):
219 """Read a character from input. 220 @rtype: string 221 """ 222 223 if self.ungotten_char is None: 224 if self.eof: 225 c = '' 226 else: 227 c = self.file.read(1) 228 if c == '': 229 self.eof = True 230 elif c == '\n': 231 self.line_number += 1 232 else: 233 c = self.ungotten_char 234 self.ungotten_char = None 235 return c
236
237 - def where(self):
238 """Return the current location in the input. 239 240 @rtype: (string, int) tuple. The first item is the filename of 241 the input, the second is the current line number. 242 """ 243 244 return (self.filename, self.line_number)
245
246 - def _unget_char(self, c):
247 """Unget a character. 248 249 The unget buffer for characters is only one character large; it is 250 an error to try to unget a character when the unget buffer is not 251 empty. 252 253 @param c: the character to unget 254 @type c: string 255 @raises UngetBufferFull: there is already an ungotten char 256 """ 257 258 if not self.ungotten_char is None: 259 raise UngetBufferFull 260 self.ungotten_char = c
261
262 - def skip_whitespace(self):
263 """Consume input until a non-whitespace character is encountered. 264 265 The non-whitespace character is then ungotten, and the number of 266 whitespace characters consumed is returned. 267 268 If the tokenizer is in multiline mode, then newlines are whitespace. 269 270 @rtype: int 271 """ 272 273 skipped = 0 274 while True: 275 c = self._get_char() 276 if c != ' ' and c != '\t': 277 if (c != '\n') or not self.multiline: 278 self._unget_char(c) 279 return skipped 280 skipped += 1
281
282 - def get(self, want_leading = False, want_comment = False):
283 """Get the next token. 284 285 @param want_leading: If True, return a WHITESPACE token if the 286 first character read is whitespace. The default is False. 287 @type want_leading: bool 288 @param want_comment: If True, return a COMMENT token if the 289 first token read is a comment. The default is False. 290 @type want_comment: bool 291 @rtype: Token object 292 @raises dns.exception.UnexpectedEnd: input ended prematurely 293 @raises dns.exception.SyntaxError: input was badly formed 294 """ 295 296 if not self.ungotten_token is None: 297 token = self.ungotten_token 298 self.ungotten_token = None 299 if token.is_whitespace(): 300 if want_leading: 301 return token 302 elif token.is_comment(): 303 if want_comment: 304 return token 305 else: 306 return token 307 skipped = self.skip_whitespace() 308 if want_leading and skipped > 0: 309 return Token(WHITESPACE, ' ') 310 token = '' 311 ttype = IDENTIFIER 312 has_escape = False 313 while True: 314 c = self._get_char() 315 if c == '' or c in self.delimiters: 316 if c == '' and self.quoting: 317 raise dns.exception.UnexpectedEnd 318 if token == '' and ttype != QUOTED_STRING: 319 if c == '(': 320 self.multiline += 1 321 self.skip_whitespace() 322 continue 323 elif c == ')': 324 if not self.multiline > 0: 325 raise dns.exception.SyntaxError 326 self.multiline -= 1 327 self.skip_whitespace() 328 continue 329 elif c == '"': 330 if not self.quoting: 331 self.quoting = True 332 self.delimiters = _QUOTING_DELIMITERS 333 ttype = QUOTED_STRING 334 continue 335 else: 336 self.quoting = False 337 self.delimiters = _DELIMITERS 338 self.skip_whitespace() 339 continue 340 elif c == '\n': 341 return Token(EOL, '\n') 342 elif c == ';': 343 while 1: 344 c = self._get_char() 345 if c == '\n' or c == '': 346 break 347 token += c 348 if want_comment: 349 self._unget_char(c) 350 return Token(COMMENT, token) 351 elif c == '': 352 if self.multiline: 353 raise dns.exception.SyntaxError('unbalanced parentheses') 354 return Token(EOF) 355 elif 
self.multiline: 356 self.skip_whitespace() 357 token = '' 358 continue 359 else: 360 return Token(EOL, '\n') 361 else: 362 # This code exists in case we ever want a 363 # delimiter to be returned. It never produces 364 # a token currently. 365 token = c 366 ttype = DELIMITER 367 else: 368 self._unget_char(c) 369 break 370 elif self.quoting: 371 if c == '\\': 372 c = self._get_char() 373 if c == '': 374 raise dns.exception.UnexpectedEnd 375 if c.isdigit(): 376 c2 = self._get_char() 377 if c2 == '': 378 raise dns.exception.UnexpectedEnd 379 c3 = self._get_char() 380 if c == '': 381 raise dns.exception.UnexpectedEnd 382 if not (c2.isdigit() and c3.isdigit()): 383 raise dns.exception.SyntaxError 384 c = chr(int(c) * 100 + int(c2) * 10 + int(c3)) 385 elif c == '\n': 386 raise dns.exception.SyntaxError('newline in quoted string') 387 elif c == '\\': 388 # 389 # It's an escape. Put it and the next character into 390 # the token; it will be checked later for goodness. 391 # 392 token += c 393 has_escape = True 394 c = self._get_char() 395 if c == '' or c == '\n': 396 raise dns.exception.UnexpectedEnd 397 token += c 398 if token == '' and ttype != QUOTED_STRING: 399 if self.multiline: 400 raise dns.exception.SyntaxError('unbalanced parentheses') 401 ttype = EOF 402 return Token(ttype, token, has_escape)
403
404 - def unget(self, token):
405 """Unget a token. 406 407 The unget buffer for tokens is only one token large; it is 408 an error to try to unget a token when the unget buffer is not 409 empty. 410 411 @param token: the token to unget 412 @type token: Token object 413 @raises UngetBufferFull: there is already an ungotten token 414 """ 415 416 if not self.ungotten_token is None: 417 raise UngetBufferFull 418 self.ungotten_token = token
419
420 - def next(self):
421 """Return the next item in an iteration. 422 @rtype: (int, string) 423 """ 424 425 token = self.get() 426 if token.is_eof(): 427 raise StopIteration 428 return token
429
430 - def __iter__(self):
431 return self
432 433 # Helpers 434
435 - def get_int(self):
436 """Read the next token and interpret it as an integer. 437 438 @raises dns.exception.SyntaxError: 439 @rtype: int 440 """ 441 442 token = self.get().unescape() 443 if not token.is_identifier(): 444 raise dns.exception.SyntaxError('expecting an identifier') 445 if not token.value.isdigit(): 446 raise dns.exception.SyntaxError('expecting an integer') 447 return int(token.value)
448
449 - def get_uint8(self):
450 """Read the next token and interpret it as an 8-bit unsigned 451 integer. 452 453 @raises dns.exception.SyntaxError: 454 @rtype: int 455 """ 456 457 value = self.get_int() 458 if value < 0 or value > 255: 459 raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value) 460 return value
461
462 - def get_uint16(self):
463 """Read the next token and interpret it as a 16-bit unsigned 464 integer. 465 466 @raises dns.exception.SyntaxError: 467 @rtype: int 468 """ 469 470 value = self.get_int() 471 if value < 0 or value > 65535: 472 raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value) 473 return value
474
475 - def get_uint32(self):
476 """Read the next token and interpret it as a 32-bit unsigned 477 integer. 478 479 @raises dns.exception.SyntaxError: 480 @rtype: int 481 """ 482 483 token = self.get().unescape() 484 if not token.is_identifier(): 485 raise dns.exception.SyntaxError('expecting an identifier') 486 if not token.value.isdigit(): 487 raise dns.exception.SyntaxError('expecting an integer') 488 value = long(token.value) 489 if value < 0 or value > 4294967296L: 490 raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value) 491 return value
492
493 - def get_string(self, origin=None):
494 """Read the next token and interpret it as a string. 495 496 @raises dns.exception.SyntaxError: 497 @rtype: string 498 """ 499 500 token = self.get().unescape() 501 if not (token.is_identifier() or token.is_quoted_string()): 502 raise dns.exception.SyntaxError('expecting a string') 503 return token.value
504
505 - def get_identifier(self, origin=None):
506 """Read the next token and raise an exception if it is not an identifier. 507 508 @raises dns.exception.SyntaxError: 509 @rtype: string 510 """ 511 512 token = self.get().unescape() 513 if not token.is_identifier(): 514 raise dns.exception.SyntaxError('expecting an identifier') 515 return token.value
516
517 - def get_name(self, origin=None):
518 """Read the next token and interpret it as a DNS name. 519 520 @raises dns.exception.SyntaxError: 521 @rtype: dns.name.Name object""" 522 523 token = self.get() 524 if not token.is_identifier(): 525 raise dns.exception.SyntaxError('expecting an identifier') 526 return dns.name.from_text(token.value, origin)
527
528 - def get_eol(self):
529 """Read the next token and raise an exception if it isn't EOL or 530 EOF. 531 532 @raises dns.exception.SyntaxError: 533 @rtype: string 534 """ 535 536 token = self.get() 537 if not token.is_eol_or_eof(): 538 raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value)) 539 return token.value
540
541 - def get_ttl(self):
542 token = self.get().unescape() 543 if not token.is_identifier(): 544 raise dns.exception.SyntaxError('expecting an identifier') 545 return dns.ttl.from_text(token.value)
546