changeset 382:0c44e494ef58

Made lexer more generic
author Windel Bouwman
date Sun, 27 Apr 2014 12:24:21 +0200
parents 6df89163e114
children 173e20a47fda
files kernel/arch/qemu_vexpress/vexpressA9.mmap python/baselex.py python/ppci/assembler.py python/ppci/layout.py python/ppci/objectfile.py python/pyburg.py test/testasm.py
diffstat 7 files changed, 172 insertions(+), 125 deletions(-)
--- a/kernel/arch/qemu_vexpress/vexpressA9.mmap	Sat Apr 26 17:41:56 2014 +0200
+++ b/kernel/arch/qemu_vexpress/vexpressA9.mmap	Sun Apr 27 12:24:21 2014 +0200
@@ -1,7 +1,7 @@
 
 {
     "code": "0x10000",
-    "mem_tables": "0x11000",
+    "mem_tables": "0x60000",
     "data": "0x20000"
 }
 
--- a/python/baselex.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/baselex.py	Sun Apr 27 12:24:21 2014 +0200
@@ -1,24 +1,38 @@
 
 import re
 from ppci import Token, CompilerError
+from pyyacc import EOF
 
-def tokenize(tok_spec, txt):
-    tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    func_map = {pair[0]: pair[2] for pair in tok_spec}
+
+class BaseLexer:
+    """ Base class for a lexer """
+    def __init__(self, tok_spec):
+        tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
+        self.gettok = re.compile(tok_re).match
+        self.func_map = {pair[0]: pair[2] for pair in tok_spec}
+
+    def feed(self, txt):
+        """ Feeds the lexer with extra input """
+        self.tokens = self.tokenize(txt)
 
-    # Parse line:
-    line = txt
-    mo = gettok(line)
-    pos = 0
-    while mo:
-        typ = mo.lastgroup
-        val = mo.group(typ)
-        func = func_map[typ]
-        if func:
-            typ, val = func(typ, val)
-            yield Token(typ, val)
-        pos = mo.end()
-        mo = gettok(line, pos)
-    if len(line) != pos:
-        raise CompilerError('Lex fault at {}'.format(line[pos:]))
+    def tokenize(self, txt):
+        """ Generator that generates tokens from text """
+        mo = self.gettok(txt)
+        pos = 0
+        while mo:
+            typ = mo.lastgroup
+            val = mo.group(typ)
+            func = self.func_map[typ]
+            if func:
+                typ, val = func(typ, val)
+                yield Token(typ, val)
+            pos = mo.end()
+            mo = self.gettok(txt, pos)
+        if len(txt) != pos:
+            raise CompilerError('Lex fault at {}'.format(txt[pos:]))
+
+    def next_token(self):
+        try:
+            return self.tokens.__next__()
+        except StopIteration:
+            return Token(EOF, EOF)
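The new BaseLexer bundles the old module-level tokenize() helper with a feed()/next_token() interface, so a parser can pull tokens one at a time and receives Token(EOF, EOF) once the input is exhausted (the tests compare the type against the string 'EOF'). A minimal usage sketch, assuming the repository's python/ directory is on sys.path so that baselex, ppci and pyyacc are importable; the toy NUMBER/OP spec below is illustrative and not part of the changeset:

    from baselex import BaseLexer

    # Each spec entry is (name, regex, callback). A callback of None means the
    # match is dropped (see tokenize() above); otherwise the callback may
    # rewrite both the token type and the token value.
    spec = [
        ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
        ('OP', r'[+\-*/]', lambda typ, val: (val, val)),
        ('SKIP', r'[ \t]', None),
    ]

    lexer = BaseLexer(spec)
    lexer.feed('1 + 22 * 3')
    tok = lexer.next_token()
    while tok.typ != 'EOF':
        print(tok.typ)
        tok = lexer.next_token()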
--- a/python/ppci/assembler.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/assembler.py	Sun Apr 27 12:24:21 2014 +0200
@@ -1,6 +1,7 @@
 
 import re
 import pyyacc
+from baselex import BaseLexer
 from . import Token, CompilerError, SourceLocation
 from .target import Target, Label
 
@@ -14,73 +15,34 @@
             t = 'val{}'.format(n)
     return t
 
-def tokenize(s, kws):
-    """
-       Tokenizer, generates an iterator that
-       returns tokens!
 
-       This GREAT example was taken from python re doc page!
-    """
-    tok_spec = [
-       ('REAL', r'\d+\.\d+'),
-       ('HEXNUMBER', r'0x[\da-fA-F]+'),
-       ('NUMBER', r'\d+'),
-       ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-       ('SKIP', r'[ \t]'),
-       ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{'),
-       ('STRING', r"'.*?'"),
-       ('COMMENT', r";.*")
-    ]
-    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    line = 1
-    pos = line_start = 0
-    mo = gettok(s)
-    while mo is not None:
-       typ = mo.lastgroup
-       val = mo.group(typ)
-       if typ == 'NEWLINE':
-         line_start = pos
-         line += 1
-       elif typ != 'SKIP':
-         if typ == 'LEESTEKEN':
-           typ = val
-         elif typ == 'NUMBER':
-           val = int(val)
-         elif typ == 'HEXNUMBER':
-           val = int(val[2:], 16)
-           typ = 'NUMBER'
-         elif typ == 'REAL':
-           val = float(val)
-         elif typ == 'STRING':
-           val = val[1:-1]
-         elif typ == 'ID':
-            if val.lower() in kws: # ['r3', 'sp', 'add', 'yield', 'r4', 'r0', 'r1', 'sub', 'r5', 'r6', 'r2']:
-                typ = val.lower()
-         col = mo.start() - line_start
-         loc = SourceLocation('', line, col, 0)   # TODO retrieve length?
-         if typ == 'NUMBER':
-            typ = bit_type(val)
-         yield Token(typ, val, loc)
-       pos = mo.end()
-       mo = gettok(s, pos)
-    if pos != len(s):
-       col = pos - line_start
-       loc = SourceLocation('', line, col, 0)
-       raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)
-    yield Token('EOF', pyyacc.EOF)
+class AsmLexer(BaseLexer):
+    def __init__(self, kws):
+        tok_spec = [
+           ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+           ('HEXNUMBER', r'0x[\da-fA-F]+', self.handle_number),
+           ('NUMBER', r'\d+', self.handle_number),
+           ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+           ('SKIP', r'[ \t]', None),
+           ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{', lambda typ, val: (val, val)),
+           ('STRING', r"'.*?'", lambda typ, val: (typ, val[1:-1])),
+           ('COMMENT', r";.*", None)
+        ]
+        super().__init__(tok_spec)
+        self.kws = kws
 
-
-class Lexer:
-    def __init__(self, src, kws):
-        self.tokens = tokenize(src, kws)
-        self.curTok = self.tokens.__next__()
+    def handle_id(self, typ, val):
+        if val.lower() in self.kws:
+            typ = val.lower()
+        return (typ, val)
 
-    def next_token(self):
-        t = self.curTok
-        if t.typ != 'EOF':
-            self.curTok = self.tokens.__next__()
-        return t
+    def handle_number(self, typ, val):
+        if val.startswith('0x'):
+            val = int(val[2:], 16)
+        else:
+            val = int(val)
+        typ = bit_type(val)
+        return typ, val
 
 
 class Parser:
@@ -142,6 +104,7 @@
 
     def make_parser(self):
         self.parser = Parser(self.target.asm_keywords, self.target.assembler_rules, self.emit)
+        self.lexer = AsmLexer(self.target.asm_keywords)
 
     def emit(self, *args):
         self.stream.emit(*args)
@@ -149,8 +112,8 @@
     # Top level interface:
     def parse_line(self, line):
         """ Parse line into assembly instructions """
-        tokens = Lexer(line, self.target.asm_keywords)
-        self.parser.parse(tokens)
+        self.lexer.feed(line)
+        self.parser.parse(self.lexer)
 
     def assemble(self, asmsrc, stream):
         """ Assemble this source snippet """
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/ppci/layout.py	Sun Apr 27 12:24:21 2014 +0200
@@ -0,0 +1,58 @@
+
+class Layout:
+    def __init__(self):
+        self.mems = []
+
+    def __eq__(self, other):
+        return self.mems == other.mems
+
+
+class Memory:
+    def __init__(self, address=0x0):
+        self.inputs = []
+        self.address = address
+        self.size = 0x0
+
+    def add_input(self, inp):
+        assert isinstance(inp, Input)
+        self.inputs.append(inp)
+
+
+class Input:
+    pass
+
+
+class SectionInput(Input):
+    def __init__(self, section_name):
+        self.section_name = section_name
+
+
+def load_layout(f):
+    return deserialize(json.load(f))
+
+
+def make_int(txt):
+    if txt.startswith('0x'):
+        return int(txt[2:], 16)
+    else:
+        return int(txt)
+
+
+class LayoutParser:
+    def __init__(self):
+        toks = ['ID', '{', '}', 'MEMORY', 'ALIGN', '.', pyyacc.EPS, pyyacc.EOF]
+        g = pyyacc.Grammar(toks)
+        g.add_production('layout', ['MEMORY', '{', 'input_list', '}'])
+        g.add_production('input_list', ['MEMORY', '{', 'input_list', '}'])
+
+
+def deserialize(d):
+    layout = Layout()
+    for mem_node in d['memories']:
+        m = Memory()
+        m.address = make_int(mem_node['address'])
+        m.size = make_int(mem_node['size'])
+        for input_node in mem_node['inputs']:
+            pass
+    return layout
+
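layout.py is a fresh skeleton: LayoutParser only starts sketching a grammar, deserialize() builds Memory objects but does not yet attach them to the Layout, and load_layout()/LayoutParser reference json and pyyacc without importing them, so only deserialize() and make_int() are usable at this point. A sketch of the dictionary shape deserialize() reads, inferred from the keys it accesses; the addresses are illustrative:

    from ppci.layout import deserialize, make_int

    description = {
        'memories': [
            {'address': '0x10000', 'size': '0x10000', 'inputs': []},
        ],
    }
    layout = deserialize(description)
    # In this revision layout.mems stays empty; the parsed Memory objects
    # are not stored on the Layout yet.
    print(layout.mems)

    # make_int accepts either '0x'-prefixed hex or plain decimal text:
    assert make_int('0x20000') == 0x20000
    assert make_int('64') == 64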
--- a/python/ppci/objectfile.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/objectfile.py	Sun Apr 27 12:24:21 2014 +0200
@@ -139,16 +139,23 @@
     return res
 
 
+def make_int(txt):
+    if txt.startswith('0x'):
+        return int(txt[2:], 16)
+    else:
+        return int(txt)
+
+
 def deserialize(d):
     obj = ObjectFile()
     for section in d['sections']:
         so = obj.get_section(section['name'])
-        so.address = int(section['address'][2:], 16)
+        so.address = make_int(section['address'])
         so.data = bytearray(binascii.unhexlify(section['data'].encode('ascii')))
     for reloc in d['relocations']:
-        obj.add_relocation(reloc['symbol'], int(reloc['offset'][2:], 16),
+        obj.add_relocation(reloc['symbol'], make_int(reloc['offset']),
             reloc['type'], reloc['section'])
     for sym in d['symbols']:
-        obj.add_symbol(sym['name'], int(sym['value'][2:], 16), sym['section'])
+        obj.add_symbol(sym['name'], make_int(sym['value']), sym['section'])
     return obj
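The same make_int helper is duplicated in objectfile.py so that section addresses, relocation offsets and symbol values can be written either as hex or as plain decimal. Its behaviour in isolation, with the body copied from the hunk above:

    def make_int(txt):
        """ Parse '0x'-prefixed hex or plain decimal text into an int """
        if txt.startswith('0x'):
            return int(txt[2:], 16)
        else:
            return int(txt)

    assert make_int('0x10000') == 65536
    assert make_int('20') == 20
    # int(txt, 0) would accept both spellings in one call, at the cost of
    # also allowing 0o/0b prefixes.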
 
--- a/python/pyburg.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/python/pyburg.py	Sun Apr 27 12:24:21 2014 +0200
@@ -60,7 +60,7 @@
 import types
 import argparse
 from ppci import Token
-from pyyacc import ParserException, EOF
+from pyyacc import ParserException
 import yacc
 import baselex
 from tree import Tree
@@ -70,8 +70,8 @@
 burg_parser = yacc.load_as_module(spec_file)
 
 
-class BurgLexer:
-    def feed(self, txt):
+class BurgLexer(baselex.BaseLexer):
+    def __init__(self):
         tok_spec = [
            ('id', r'[A-Za-z][A-Za-z\d_]*', lambda typ, val: (typ, val)),
            ('kw', r'%[A-Za-z][A-Za-z\d_]*', lambda typ, val: (val, val)),
@@ -80,36 +80,28 @@
            ('OTHER', r'[:;\|\(\),]', lambda typ, val: (val, val)),
            ('SKIP', r'[ ]', None)
             ]
+        super().__init__(tok_spec)
 
+    def tokenize(self, txt):
         lines = txt.split('\n')
         header_lines = []
-
-        def tokenize():
-            section = 0
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue  # Skip empty lines
-                elif line == '%%':
-                    section += 1
-                    if section == 1:
-                        yield Token('header', header_lines)
-                    yield Token('%%', '%%')
+        section = 0
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue  # Skip empty lines
+            elif line == '%%':
+                section += 1
+                if section == 1:
+                    yield Token('header', header_lines)
+                yield Token('%%', '%%')
+            else:
+                if section == 0:
+                    header_lines.append(line)
                 else:
-                    if section == 0:
-                        header_lines.append(line)
-                    else:
-                        for tk in baselex.tokenize(tok_spec, line):
-                            yield tk
-            yield Token(EOF, EOF)
-        self.tokens = tokenize()
-        self.token = self.tokens.__next__()
-
-    def next_token(self):
-        t = self.token
-        if t.typ != EOF:
-            self.token = self.tokens.__next__()
-        return t
+                    # we could use yield from below, but python 3.2 does not work then:
+                    for tk in super().tokenize(line):
+                        yield tk
 
 
 class Rule:
@@ -317,6 +309,7 @@
         default=sys.stdout)
     return parser
 
+
 def load_as_module(filename):
     """ Load a parser spec file, generate LR tables and create module """
     ob = io.StringIO()
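BurgLexer now inherits the regex machinery from BaseLexer and only overrides tokenize() to add its line-oriented preprocessing (header collection and '%%' section switching); feed() and next_token() come from the base class. The same override pattern can be reused for other line-based front ends. A sketch under that assumption; the CommentStrippingLexer below is hypothetical and not part of the changeset:

    from baselex import BaseLexer


    class CommentStrippingLexer(BaseLexer):
        """ Hypothetical subclass: skip '#' comment lines, lex the rest """
        def __init__(self):
            spec = [
                ('ID', r'[A-Za-z_]\w*', lambda typ, val: (typ, val)),
                ('SKIP', r'[ \t]', None),
            ]
            super().__init__(spec)

        def tokenize(self, txt):
            for line in txt.split('\n'):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Hand each remaining line to BaseLexer's regex tokenizer
                # (yield from is avoided for the same python 3.2 reason
                # noted in BurgLexer).
                for tok in super().tokenize(line):
                    yield tok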
--- a/test/testasm.py	Sat Apr 26 17:41:56 2014 +0200
+++ b/test/testasm.py	Sun Apr 27 12:24:21 2014 +0200
@@ -2,7 +2,7 @@
 
 import unittest
 from ppci import CompilerError
-from ppci.assembler import tokenize
+from ppci.assembler import AsmLexer
 from ppci.objectfile import ObjectFile
 from ppci.outstream import BinaryOutputStream
 from ppci.target.basetarget import Label
@@ -12,26 +12,38 @@
 class AssemblerLexingCase(unittest.TestCase):
     """ Tests the assemblers lexer """
 
+    def setUp(self):
+        self.lexer = AsmLexer([])
+
+    def do(self, asmline, toks):
+        output = []
+        self.lexer.feed(asmline)
+        while 'EOF' not in output:
+            output.append(self.lexer.next_token().typ)
+        self.assertSequenceEqual(toks, output)
+
     def testLex0(self):
         """ Check if the lexer is OK """
-        asmline, toks = 'mov rax, rbx ', ['ID', 'ID', ',', 'ID', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        asmline = 'mov rax, rbx '
+        toks = ['ID', 'ID', ',', 'ID', 'EOF']
+        self.do(asmline, toks)
 
     def testLex1(self):
         """ Test if lexer correctly maps some tokens """
-        asmline, toks = 'lab1: mov rax, rbx ', ['ID', ':', 'ID', 'ID', ',', 'ID', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        asmline = 'lab1: mov rax, rbx '
+        toks = ['ID', ':', 'ID', 'ID', ',', 'ID', 'EOF']
+        self.do(asmline, toks)
 
     def testLex2(self):
         """ Test if lexer correctly maps some tokens """
         asmline, toks = 'mov 3.13 0xC 13', ['ID', 'REAL', 'val5', 'val5', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        self.do(asmline, toks)
 
     def testLex3(self):
         """ Test if lexer fails on a token that is invalid """
         asmline = '0z4: mov rax, rbx $ '
         with self.assertRaises(CompilerError):
-            list(tokenize(asmline, []))
+            self.do(asmline, [])
 
 
 class OustreamTestCase(unittest.TestCase):