Source code for ptk.lexer

# -*- coding: UTF-8 -*-

# (c) Jérôme Laheurte 2015
# See LICENSE.txt

import six
import inspect
import re
import collections

from ptk.regex import buildRegex, DeadState, RegexTokenizer
from ptk.utils import Singleton, callbackByName


# In Python 3 we'd use __prepare__ and an ordered dict...
_TOKREGISTER = list()


class _LexerMeta(type):
    def __new__(metacls, name, bases, attrs):
        global _TOKREGISTER # pylint: disable=W0603
        try:
            attrs['__tokens__'] = (set(), list()) # Set of token names, list of (rx, callback, defaultType)
            klass = super(_LexerMeta, metacls).__new__(metacls, name, bases, attrs)
            for func, rx, toktypes in _TOKREGISTER:
                klass.addTokenType(func.__name__, callbackByName(func.__name__), rx, toktypes)
            return klass
        finally:
            _TOKREGISTER = list()


def token(rx, types=None):
    def _wrap(func):
        if any([func.__name__ == aFunc.__name__ and func != aFunc for aFunc, _, _ in _TOKREGISTER]):
            raise TypeError('Duplicate token method name "%s"' % func.__name__)
        _TOKREGISTER.append((func, rx, types))
        return func
    return _wrap
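

# Note (illustrative, not part of the original module): @token is meant to
# decorate methods of a LexerBase subclass; _LexerMeta collects them from
# _TOKREGISTER in declaration order. A sketch of such a method, with assumed
# names, would look like:
#
#     @token(r'[0-9]+')
#     def number(self, tok):
#         tok.value = int(tok.value)
#
# When the optional 'types' argument is given, the default token type is None
# and the callback is expected to set tok.type itself.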


class LexerError(Exception):
    """
    Unrecognized token in input

    :ivar lineno: Line in input
    :ivar colno: Column in input
    """
    def __init__(self, char, colno, lineno):
        super(LexerError, self).__init__('Unrecognized token "%s" at line %d, column %d' % (char, lineno, colno))
        self.lineno = lineno
        self.colno = colno


class EOF(six.with_metaclass(Singleton, object)):
    """
    End symbol
    """
    __reprval__ = six.u('$')

    @property
    def type(self):
        """Read-only attribute for Token duck-typing"""
        return self

    @property
    def value(self):
        """Read-only attribute for Token duck-typing"""
        return self

_LexerPosition = collections.namedtuple('_LexerPosition', ['column', 'line'])


class LexerBase(six.with_metaclass(_LexerMeta, object)):
    """
    This defines the interface for lexer classes. For concrete implementations,
    see :py:class:`ProgressiveLexer` and :py:class:`ReLexer`.
    """
    Token = RegexTokenizer.Token

    # Shut up pychecker. Those are actually set by the metaclass.
    __tokens__ = ()

    class _MutableToken(object):
        def __init__(self, type_, value):
            self.type = type_
            self.value = value

        def token(self):
            """Returns the immutable equivalent"""
            return EOF if EOF in [self.type, self.value] else RegexTokenizer.Token(self.type, self.value)

    def __init__(self):
        super(LexerBase, self).__init__()
        self.restartLexer()

    def restartLexer(self, resetPos=True):
        if resetPos:
            self.__pos = _LexerPosition(0, 1)
        self._input = list()
        self.__consumer = None

    def position(self):
        """
        :return: The current position in stream as a 2-tuple (column, line).
        """
        return self.__pos

    def advanceColumn(self, count=1):
        """
        Advances the current position by *count* columns.
        """
        col, row = self.__pos
        self.__pos = _LexerPosition(col + count, row)

    def advanceLine(self, count=1):
        """
        Advances the current position by *count* lines.
        """
        _, row = self.__pos
        self.__pos = _LexerPosition(0, row + count)

    @staticmethod
    def ignore(char):
        """
        Override this to ignore characters in the input stream. The default
        is to ignore spaces and tabs.

        :param char: The character to test
        :return: True if *char* should be ignored
        """
        return char in [six.b(' '), six.u(' '), six.b('\t'), six.u('\t')]

    def setConsumer(self, consumer):
        """
        Sets the current consumer. A consumer is an object with a *feed*
        method; all characters seen on the input stream after the consumer is
        set are passed directly to it. When the *feed* method returns a
        2-tuple (type, value), the corresponding token is generated and the
        consumer reset to None. This may be handy to parse tokens that are not
        easily recognized by a regular expression but easily by code; for
        instance the following lexer recognizes C strings without having to
        use negative lookahead:

        .. code-block:: python

           class MyLexer(ReLexer):
               @token('"')
               def cstring(self, tok):
                   class CString(object):
                       def __init__(self):
                           self.state = 0
                           self.value = StringIO.StringIO()
                       def feed(self, char):
                           if self.state == 0:
                               if char == '"':
                                   return 'cstring', self.value.getvalue()
                               if char == '\\\\':
                                   self.state = 1
                               else:
                                   self.value.write(char)
                           elif self.state == 1:
                               self.value.write(char)
                               self.state = 0
                   self.setConsumer(CString())
        """
        self.__consumer = consumer

    def consumer(self):
        return self.__consumer

    def parse(self, string): # pragma: no cover
        """
        Parses the whole *string*
        """
        raise NotImplementedError

    def newToken(self, tok): # pragma: no cover
        """
        This method will be invoked as soon as a token is recognized on input.

        :param tok: The token. This is a named tuple with *type* and *value* attributes.
        """
        raise NotImplementedError

    @classmethod
    def addTokenType(cls, name, callback, regex, types=None):
        for typeName in [name] if types is None else types:
            if typeName is not EOF:
                cls.__tokens__[0].add(typeName)
        cls.__tokens__[1].append((regex, callback, name if types is None else None))

    @classmethod
    def _allTokens(cls):
        tokens = (set(), list())
        for base in inspect.getmro(cls):
            if issubclass(base, LexerBase):
                tokens[0].update(base.__tokens__[0])
                tokens[1].extend(base.__tokens__[1])
        return tokens

    @classmethod
    def tokenTypes(cls):
        """
        :return: the set of all token names, as strings.
        """
        return cls._allTokens()[0]


class ReLexer(LexerBase): # pylint: disable=W0223
    """
    Concrete lexer based on Python regular expressions. This is **way**
    faster than :py:class:`ProgressiveLexer` but it can only tokenize whole
    strings.
    """
    def __init__(self):
        self.__regexes = list()
        for rx, callback, defaultType in self._allTokens()[1]:
            if six.PY2 and isinstance(rx, str) or six.PY3 and isinstance(rx, bytes):
                crx = re.compile(six.b('^') + rx)
            else:
                crx = re.compile(six.u('^') + rx)
            self.__regexes.append((crx, callback, defaultType))
        super(ReLexer, self).__init__()

    def parse(self, string):
        pos = 0
        while pos < len(string):
            char = string[pos]
            if char == '\n':
                self.advanceLine()
            else:
                self.advanceColumn()

            if self.consumer() is None:
                if self.ignore(char):
                    pos += 1
                    continue
                pos = self.__findMatch(string, pos)
            else:
                tok = self.consumer().feed(char)
                if tok is not None:
                    self.setConsumer(None)
                    if tok[0] is not None:
                        self.newToken(self.Token(*tok))
                pos += 1
        self.newToken(EOF)

    def __findMatch(self, string, pos):
        match = None
        matchlen = 0
        for rx, callback, defaultType in self.__regexes:
            mtc = rx.search(string[pos:])
            if mtc:
                value = mtc.group(0)
                if len(value) > matchlen:
                    match = value, callback, defaultType
                    matchlen = len(value)

        if match:
            value, callback, defaultType = match
            tok = self._MutableToken(defaultType, value)
            callback(self, tok)
            pos += matchlen
            if self.consumer() is None and tok.type is not None:
                self.newToken(tok.token())
            return pos
        else:
            raise LexerError(string[pos:pos+10], *self.position())
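

# Illustrative sketch, not part of the original module: a minimal ReLexer
# subclass exercising the @token decorator, the optional 'types' argument and
# the newToken() override. All token names and regular expressions below are
# assumptions made for the example.
def _exampleReLexerUsage(): # pragma: no cover - documentation sketch only
    class ArithLexer(ReLexer):
        @token(r'[0-9]+')
        def number(self, tok):
            # Callbacks may mutate the token value before it is emitted.
            tok.value = int(tok.value)

        @token(r'[a-z]+', types=['identifier', 'let'])
        def word(self, tok):
            # With 'types', the default type is None; the callback chooses it.
            tok.type = 'let' if tok.value == 'let' else 'identifier'

        @token(r'[+*/()=-]')
        def op(self, tok):
            pass

        def newToken(self, tok):
            # Invoked for each recognized token, and finally with EOF.
            if tok is not EOF:
                print('%s %s' % (tok.type, tok.value))

    ArithLexer().parse('let x = 2+3*4')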


class ProgressiveLexer(LexerBase): # pylint: disable=W0223
    """
    Concrete lexer based on a simple pure-Python regular expression engine.
    This lexer is able to tokenize an input stream in a progressive fashion;
    just call the :py:func:`ProgressiveLexer.feed` method with whatever bytes
    are available when they're available. Useful for asynchronous contexts.
    Starting with Python 3.5 there is also an asynchronous version, see
    :py:class:`AsyncLexer`.

    This is **slow as hell**.
    """
    def restartLexer(self, resetPos=True):
        self.__currentState = [(buildRegex(rx).start(), callback, defaultType, [0])
                               for rx, callback, defaultType in self._allTokens()[1]]
        self.__currentMatch = list()
        self.__matches = list()
        self.__maxPos = 0
        self.__state = 0
        self._input = list()
        super(ProgressiveLexer, self).restartLexer(resetPos=resetPos)

    def parse(self, string):
        if six.PY3 and isinstance(string, bytes):
            string = [chr(c).encode('ascii') for c in string]
        for char in string:
            self.feed(char)
        self.feed(EOF)

    def feed(self, char, charPos=None):
        """
        Handle a single input character. When you're finished, call this with
        EOF as argument.
        """
        self._input.append((char, charPos))
        while self._input:
            char, charPos = self._input.pop(0)
            for tok in self._feed(char, charPos):
                self.newToken(tok)

    def _feed(self, char, charPos): # pylint: disable=R0912,R0915
        if char == '\n':
            self.advanceLine()
        else:
            self.advanceColumn()

        if self.consumer() is not None:
            tok = self.consumer().feed(char)
            if tok is not None:
                self.setConsumer(None)
                if tok[0] is not None:
                    yield self.Token(*tok)
            return

        try:
            if char is EOF:
                if self.__state == 0:
                    self.restartLexer()
                    yield EOF
                    return
                self.__maxPos = max(self.__maxPos, max(pos[0] for regex, callback, defaultType, pos in self.__currentState))
                if self.__maxPos == 0 and self.__currentMatch:
                    raise LexerError(self.__currentMatch[0][0], *self.__currentMatch[0][1])
                self.__matches.extend([(pos[0], callback) for regex, callback, defaultType, pos in self.__currentState if pos[0] == self.__maxPos])
                self.__matches = [(pos, callback) for pos, callback in self.__matches if pos == self.__maxPos]
            else:
                if self.__state == 0 and self.ignore(char):
                    return
                self.__state = 1

                newState = list()
                for regex, callback, defaultType, pos in self.__currentState:
                    try:
                        if regex.feed(char):
                            pos[0] = len(self.__currentMatch) + 1
                    except DeadState:
                        if pos[0]:
                            self.__matches.append((pos[0], callback))
                            self.__maxPos = max(self.__maxPos, pos[0])
                    else:
                        newState.append((regex, callback, defaultType, pos))

                if all([regex.isDeadEnd() for regex, callback, defaultType, pos in newState]):
                    for regex, callback, defaultType, pos in newState:
                        self.__matches.append((len(self.__currentMatch) + 1, callback))
                        self.__maxPos = max(self.__maxPos, len(self.__currentMatch) + 1)
                    newState = list()

                self.__matches = [(pos, callback) for pos, callback in self.__matches if pos == self.__maxPos]

                self.__currentState = newState
                self.__currentMatch.append((char, self.position() if charPos is None else charPos))
                if self.__currentState:
                    return

                if self.__maxPos == 0:
                    raise LexerError(char, *self.position())
        except LexerError:
            self.restartLexer()
            raise

        tok = self.__finalizeMatch()
        if tok is not None:
            yield tok

        if char is EOF:
            self.restartLexer()
            yield EOF

    def __finalizeMatch(self):
        # First declared token method matches
        matches = set([callback for _, callback in self.__matches])
        match = type(self.__currentMatch[0][0])().join([char for char, pos in self.__currentMatch[:self.__maxPos]]) # byte or unicode
        remain = self.__currentMatch[self.__maxPos:]
        self.restartLexer(False)
        self._input.extend(remain)
        for _, callback, defaultType in self._allTokens()[1]:
            if callback in matches:
                tok = self._MutableToken(defaultType, match)
                callback(self, tok)
                if tok.type is None or self.consumer() is not None:
                    break
                return tok.token()
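

# Illustrative sketch, not part of the original module: feeding a
# ProgressiveLexer one character at a time, as one would in a streaming or
# asynchronous context. The token name and regular expression are assumptions
# made for the example.
def _exampleProgressiveUsage(): # pragma: no cover - documentation sketch only
    class WordLexer(ProgressiveLexer):
        @token(r'[a-zA-Z]+')
        def word(self, tok):
            pass

        def newToken(self, tok):
            if tok is not EOF:
                print('%s %s' % (tok.type, tok.value))

    lexer = WordLexer()
    for char in 'hello world':
        # Tokens are emitted as soon as they cannot grow any further.
        lexer.feed(char)
    lexer.feed(EOF) # flushes the last pending token, then emits EOF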