# Source code for pytomo.dns.tokenizer

# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission notice
# appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

"""Tokenize DNS master file format"""

from __future__ import absolute_import

import cStringIO
import sys

from . import exception as dns_exception
from . import name as dns_name
from . import ttl as dns_ttl

# Characters that terminate an identifier in normal (unquoted) mode.
# The mapping is used purely for membership tests; every value is True.
_DELIMITERS = dict.fromkeys((' ', '\t', '\n', ';', '(', ')', '"'), True)

# While reading a quoted string only the double quote ends the token.
_QUOTING_DELIMITERS = dict.fromkeys(('"',), True)

# Token type codes, numbered consecutively from zero.
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)

class UngetBufferFull(dns_exception.DNSException):
    """Raised when an attempt is made to unget a token when the unget
    buffer is full."""
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Return True if the token type is EOF."""
        return self.ttype == EOF

    def is_eol(self):
        """Return True if the token type is EOL."""
        return self.ttype == EOL

    def is_whitespace(self):
        """Return True if the token type is WHITESPACE."""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Return True if the token type is IDENTIFIER."""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Return True if the token type is QUOTED_STRING."""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Return True if the token type is COMMENT."""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Return True if the token type is DELIMITER."""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Return True if the token type is either EOL or EOF."""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Only the type and value take part in comparisons; has_escape
        # is deliberately ignored.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that code; a backslash followed by any other
        character is replaced by that character.  If the token has no
        escapes, the token itself is returned.

        @rtype: Token object
        @raises dns_exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns_exception.SyntaxError: a decimal escape is malformed
        or denotes a value greater than 255
        """
        if not self.has_escape:
            return self
        text = self.value
        end = len(text)
        pieces = []
        i = 0
        while i < end:
            c = text[i]
            i += 1
            if c == '\\':
                if i >= end:
                    raise dns_exception.UnexpectedEnd
                c = text[i]
                i += 1
                if c.isdigit():
                    # A decimal escape is exactly three digits long.
                    if i >= end:
                        raise dns_exception.UnexpectedEnd
                    c2 = text[i]
                    i += 1
                    if i >= end:
                        raise dns_exception.UnexpectedEnd
                    c3 = text[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns_exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # An escape denotes a single octet; reject larger
                    # values here instead of letting chr() fail with a
                    # ValueError.
                    if codepoint > 255:
                        raise dns_exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are returned as L{Token} objects, which carry a type and a
    string value.  The valid types are EOF, EOL, WHITESPACE, IDENTIFIER,
    QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.
        @type filename: string
        """
        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        elif filename is None:
            if f is sys.stdin:
                filename = '<stdin>'
            else:
                filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if there is one, is returned first.  Once
        end of input has been seen, the empty string is returned without
        reading the file again.

        @rtype: string
        """
        if self.ungotten_char is not None:
            c = self.ungotten_char
            self.ungotten_char = None
            return c
        if self.eof:
            return ''
        c = self.file.read(1)
        if c == '':
            self.eof = True
        elif c == '\n':
            self.line_number += 1
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """
        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """
        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """
        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                # A newline only counts as whitespace inside parentheses.
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns_exception.UnexpectedEnd: input ended prematurely
        @raises dns_exception.SyntaxError: input was badly formed
        """
        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is dropped unless
            # the caller asked for that kind of token.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns_exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns_exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: from here on only '"' is
                            # a delimiter.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote left over from the previous
                            # quoted-string token.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to end of line or
                        # end of input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns_exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns_exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns_exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Test the character just read (c3), so that
                        # input ending here is reported as
                        # UnexpectedEnd.
                        if c3 == '':
                            raise dns_exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns_exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # An escape denotes a single octet; reject
                        # larger values here instead of letting chr()
                        # fail with a ValueError.
                        if codepoint > 255:
                            raise dns_exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns_exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns_exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns_exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """
        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        @rtype: Token object
        @raises StopIteration: the next token is EOF
        """
        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns_exception.SyntaxError:
        @rtype: int
        """
        token = self.get().unescape()
        if not token.is_identifier():
            raise dns_exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns_exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns_exception.SyntaxError:
        @rtype: int
        """
        value = self.get_int()
        if value < 0 or value > 255:
            raise dns_exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns_exception.SyntaxError:
        @rtype: int
        """
        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns_exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns_exception.SyntaxError:
        @rtype: int
        """
        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns_exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: not used by this method
        @raises dns_exception.SyntaxError:
        @rtype: string
        """
        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns_exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an
        identifier.

        @param origin: not used by this method
        @raises dns_exception.SyntaxError:
        @rtype: string
        """
        token = self.get().unescape()
        if not token.is_identifier():
            raise dns_exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token value is handed to dns_name.from_text() without being
        unescaped first.

        @param origin: passed through to dns_name.from_text()
        @raises dns_exception.SyntaxError:
        @rtype: dns_name.Name object
        """
        token = self.get()
        if not token.is_identifier():
            raise dns_exception.SyntaxError('expecting an identifier')
        return dns_name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns_exception.SyntaxError:
        @rtype: string
        """
        token = self.get()
        if not token.is_eol_or_eof():
            raise dns_exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and convert it with dns_ttl.from_text().

        @raises dns_exception.SyntaxError: the token is not an identifier
        """
        token = self.get().unescape()
        if not token.is_identifier():
            raise dns_exception.SyntaxError('expecting an identifier')
        return dns_ttl.from_text(token.value)