diff python/ppci/assembler.py @ 382:0c44e494ef58

Made lexer more generic
author Windel Bouwman
date Sun, 27 Apr 2014 12:24:21 +0200
parents 6df89163e114
children 94f5b719ad0b
--- a/python/ppci/assembler.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/assembler.py	Sun Apr 27 12:24:21 2014 +0200
@@ -1,6 +1,7 @@
 
 import re
 import pyyacc
+from baselex import BaseLexer
 from . import Token, CompilerError, SourceLocation
 from .target import Target, Label
 
@@ -14,73 +15,34 @@
             t = 'val{}'.format(n)
     return t
 
-def tokenize(s, kws):
-    """
-       Tokenizer, generates an iterator that
-       returns tokens!
 
-       This GREAT example was taken from python re doc page!
-    """
-    tok_spec = [
-       ('REAL', r'\d+\.\d+'),
-       ('HEXNUMBER', r'0x[\da-fA-F]+'),
-       ('NUMBER', r'\d+'),
-       ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-       ('SKIP', r'[ \t]'),
-       ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{'),
-       ('STRING', r"'.*?'"),
-       ('COMMENT', r";.*")
-    ]
-    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    line = 1
-    pos = line_start = 0
-    mo = gettok(s)
-    while mo is not None:
-       typ = mo.lastgroup
-       val = mo.group(typ)
-       if typ == 'NEWLINE':
-         line_start = pos
-         line += 1
-       elif typ != 'SKIP':
-         if typ == 'LEESTEKEN':
-           typ = val
-         elif typ == 'NUMBER':
-           val = int(val)
-         elif typ == 'HEXNUMBER':
-           val = int(val[2:], 16)
-           typ = 'NUMBER'
-         elif typ == 'REAL':
-           val = float(val)
-         elif typ == 'STRING':
-           val = val[1:-1]
-         elif typ == 'ID':
-            if val.lower() in kws: # ['r3', 'sp', 'add', 'yield', 'r4', 'r0', 'r1', 'sub', 'r5', 'r6', 'r2']:
-                typ = val.lower()
-         col = mo.start() - line_start
-         loc = SourceLocation('', line, col, 0)   # TODO retrieve length?
-         if typ == 'NUMBER':
-            typ = bit_type(val)
-         yield Token(typ, val, loc)
-       pos = mo.end()
-       mo = gettok(s, pos)
-    if pos != len(s):
-       col = pos - line_start
-       loc = SourceLocation('', line, col, 0)
-       raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)
-    yield Token('EOF', pyyacc.EOF)
+class AsmLexer(BaseLexer):
+    def __init__(self, kws):
+        tok_spec = [
+           ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+           ('HEXNUMBER', r'0x[\da-fA-F]+', self.handle_number),
+           ('NUMBER', r'\d+', self.handle_number),
+           ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+           ('SKIP', r'[ \t]', None),
+           ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{', lambda typ, val: (val, val)),
+           ('STRING', r"'.*?'", lambda typ, val: (typ, val[1:-1])),
+           ('COMMENT', r";.*", None)
+        ]
+        super().__init__(tok_spec)
+        self.kws = kws
 
-
-class Lexer:
-    def __init__(self, src, kws):
-        self.tokens = tokenize(src, kws)
-        self.curTok = self.tokens.__next__()
+    def handle_id(self, typ, val):
+        if val.lower() in self.kws:
+            typ = val.lower()
+        return (typ, val)
 
-    def next_token(self):
-        t = self.curTok
-        if t.typ != 'EOF':
-            self.curTok = self.tokens.__next__()
-        return t
+    def handle_number(self, typ, val):
+        if val.startswith('0x'):
+            val = int(val[2:], 16)
+        else:
+            val = int(val)
+        typ = bit_type(val)
+        return typ, val
 
 
 class Parser:
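
As a quick illustration of what the rewrite changes in practice, here is a hypothetical session with the new AsmLexer (the token representation follows the BaseLexer sketch above; the real baselex presumably yields Token objects rather than plain pairs):

lexer = AsmLexer(kws={'add', 'r0', 'r1'})
lexer.feed('add r0, r1, 0x2a   ; set up the accumulator')

# Keywords come back typed as themselves ('add', 'r0', ...), hex and decimal
# numbers are converted to int and retyped via bit_type(), punctuation gets
# its literal text as type, and SKIP/COMMENT matches are dropped entirely.
tok = lexer.next_token()
while tok[0] != 'EOF':
    print(tok)
    tok = lexer.next_token()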
@@ -142,6 +104,7 @@
 
     def make_parser(self):
         self.parser = Parser(self.target.asm_keywords, self.target.assembler_rules, self.emit)
+        self.lexer = AsmLexer(self.target.asm_keywords)
 
     def emit(self, *args):
         self.stream.emit(*args)
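
Building the AsmLexer once in make_parser, next to the parser, presumably lets the combined token regex be compiled a single time per target (assuming BaseLexer compiles it in its constructor, as in the sketch above); parse_line below then only needs to feed each source line into the already-built lexer.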
@@ -149,8 +112,8 @@
     # Top level interface:
     def parse_line(self, line):
         """ Parse line into assembly instructions """
-        tokens = Lexer(line, self.target.asm_keywords)
-        self.parser.parse(tokens)
+        self.lexer.feed(line)
+        self.parser.parse(self.lexer)
 
     def assemble(self, asmsrc, stream):
         """ Assemble this source snippet """