view python/baselex.py @ 382:0c44e494ef58

Made lexer more generic
author Windel Bouwman
date Sun, 27 Apr 2014 12:24:21 +0200
parents 818be710e13d
children fb3c1f029b30


import re
from ppci import Token, CompilerError
from pyyacc import EOF


class BaseLexer:
    """ Base class for a lexer """
    def __init__(self, tok_spec):
        # Combine all patterns into a single regex with named groups:
        tok_re = '|'.join(
            '(?P<{}>{})'.format(name, regexp) for name, regexp, _ in tok_spec)
        self.gettok = re.compile(tok_re).match
        # Map each token name to its handler (None means: drop the match):
        self.func_map = {name: handler for name, _, handler in tok_spec}

    def feed(self, txt):
        """ Feeds the lexer with extra input """
        self.tokens = self.tokenize(txt)

    def tokenize(self, txt):
        """ Generator that generates tokens from text """
        mo = self.gettok(txt)
        pos = 0
        while mo:
            typ = mo.lastgroup
            val = mo.group(typ)
            func = self.func_map[typ]
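            # Tokens whose handler is None (e.g. whitespace) are dropped;
            # otherwise the handler may rewrite the token type and value.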
            if func:
                typ, val = func(typ, val)
                yield Token(typ, val)
            pos = mo.end()
            mo = self.gettok(txt, pos)
        if len(txt) != pos:
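            # Matching stopped before the end of the input: lexer error.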
            raise CompilerError('Lex fault at {}'.format(txt[pos:]))

    def next_token(self):
        """ Return the next token, or an EOF token when input is exhausted """
        try:
            return next(self.tokens)
        except StopIteration:
            return Token(EOF, EOF)
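

# Minimal usage sketch (not part of the original module): the token names,
# patterns and handlers below are made up for illustration, and it assumes
# Token(typ, val) can be constructed and printed as imported above.
if __name__ == '__main__':
    spec = [
        ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
        ('ID', r'[A-Za-z_]\w*', lambda typ, val: (typ, val)),
        ('SKIP', r'\s+', None),   # whitespace is matched but never emitted
    ]
    lexer = BaseLexer(spec)
    lexer.feed('answer 42')
    print(lexer.next_token())   # ID token 'answer'
    print(lexer.next_token())   # NUMBER token 42
    print(lexer.next_token())   # EOF token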