Mercurial > lcfOS
diff python/ppci/assembler.py @ 382:0c44e494ef58
Made lexer more generic
author | Windel Bouwman |
---|---|
date | Sun, 27 Apr 2014 12:24:21 +0200 |
parents | 6df89163e114 |
children | 94f5b719ad0b |
line wrap: on
line diff
```diff
--- a/python/ppci/assembler.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/assembler.py	Sun Apr 27 12:24:21 2014 +0200
@@ -1,6 +1,7 @@
 
 import re
 import pyyacc
+from baselex import BaseLexer
 from . import Token, CompilerError, SourceLocation
 from .target import Target, Label
 
@@ -14,73 +15,34 @@
             t = 'val{}'.format(n)
     return t
 
-def tokenize(s, kws):
-    """
-    Tokenizer, generates an iterator that
-    returns tokens!
 
-    This GREAT example was taken from python re doc page!
-    """
-    tok_spec = [
-        ('REAL', r'\d+\.\d+'),
-        ('HEXNUMBER', r'0x[\da-fA-F]+'),
-        ('NUMBER', r'\d+'),
-        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-        ('SKIP', r'[ \t]'),
-        ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{'),
-        ('STRING', r"'.*?'"),
-        ('COMMENT', r";.*")
-    ]
-    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    line = 1
-    pos = line_start = 0
-    mo = gettok(s)
-    while mo is not None:
-        typ = mo.lastgroup
-        val = mo.group(typ)
-        if typ == 'NEWLINE':
-            line_start = pos
-            line += 1
-        elif typ != 'SKIP':
-            if typ == 'LEESTEKEN':
-                typ = val
-            elif typ == 'NUMBER':
-                val = int(val)
-            elif typ == 'HEXNUMBER':
-                val = int(val[2:], 16)
-                typ = 'NUMBER'
-            elif typ == 'REAL':
-                val = float(val)
-            elif typ == 'STRING':
-                val = val[1:-1]
-            elif typ == 'ID':
-                if val.lower() in kws: # ['r3', 'sp', 'add', 'yield', 'r4', 'r0', 'r1', 'sub', 'r5', 'r6', 'r2']:
-                    typ = val.lower()
-            col = mo.start() - line_start
-            loc = SourceLocation('', line, col, 0) # TODO retrieve length?
-            if typ == 'NUMBER':
-                typ = bit_type(val)
-            yield Token(typ, val, loc)
-        pos = mo.end()
-        mo = gettok(s, pos)
-    if pos != len(s):
-        col = pos - line_start
-        loc = SourceLocation('', line, col, 0)
-        raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)
-    yield Token('EOF', pyyacc.EOF)
+class AsmLexer(BaseLexer):
+    def __init__(self, kws):
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', self.handle_number),
+            ('NUMBER', r'\d+', self.handle_number),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('SKIP', r'[ \t]', None),
+            ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{', lambda typ, val: (val, val)),
+            ('STRING', r"'.*?'", lambda typ, val: (typ, val[1:-1])),
+            ('COMMENT', r";.*", None)
+        ]
+        super().__init__(tok_spec)
+        self.kws = kws
 
-
-class Lexer:
-    def __init__(self, src, kws):
-        self.tokens = tokenize(src, kws)
-        self.curTok = self.tokens.__next__()
+    def handle_id(self, typ, val):
+        if val.lower() in self.kws:
+            typ = val.lower()
+        return (typ, val)
 
-    def next_token(self):
-        t = self.curTok
-        if t.typ != 'EOF':
-            self.curTok = self.tokens.__next__()
-        return t
+    def handle_number(self, typ, val):
+        if val.startswith('0x'):
+            val = int(val[2:], 16)
+        else:
+            val = int(val)
+        typ = bit_type(val)
+        return typ, val
 
 
 class Parser:
@@ -142,6 +104,7 @@
 
     def make_parser(self):
         self.parser = Parser(self.target.asm_keywords, self.target.assembler_rules, self.emit)
+        self.lexer = AsmLexer(self.target.asm_keywords)
 
     def emit(self, *args):
         self.stream.emit(*args)
@@ -149,8 +112,8 @@
     # Top level interface:
     def parse_line(self, line):
         """ Parse line into assembly instructions """
-        tokens = Lexer(line, self.target.asm_keywords)
-        self.parser.parse(tokens)
+        self.lexer.feed(line)
+        self.parser.parse(self.lexer)
 
     def assemble(self, asmsrc, stream):
         """ Assemble this source snippet """
```