lcfOS: changeset 382:0c44e494ef58
Made lexer more generic
| author | Windel Bouwman |
| --- | --- |
| date | Sun, 27 Apr 2014 12:24:21 +0200 |
| parents | 6df89163e114 |
| children | 173e20a47fda |
| files | kernel/arch/qemu_vexpress/vexpressA9.mmap python/baselex.py python/ppci/assembler.py python/ppci/layout.py python/ppci/objectfile.py python/pyburg.py test/testasm.py |
| diffstat | 7 files changed, 172 insertions(+), 125 deletions(-) |
--- a/kernel/arch/qemu_vexpress/vexpressA9.mmap  Sat Apr 26 17:41:56 2014 +0200
+++ b/kernel/arch/qemu_vexpress/vexpressA9.mmap  Sun Apr 27 12:24:21 2014 +0200
@@ -1,7 +1,7 @@
 {
   "code": "0x10000",
-  "mem_tables": "0x11000",
+  "mem_tables": "0x60000",
   "data": "0x20000"
 }
--- a/python/baselex.py  Sat Apr 26 17:41:56 2014 +0200
+++ b/python/baselex.py  Sun Apr 27 12:24:21 2014 +0200
@@ -1,24 +1,38 @@
 import re
 from ppci import Token, CompilerError
+from pyyacc import EOF
 
-def tokenize(tok_spec, txt):
-    tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    func_map = {pair[0]: pair[2] for pair in tok_spec}
+
+class BaseLexer:
+    """ Base class for a lexer """
+    def __init__(self, tok_spec):
+        tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
+        self.gettok = re.compile(tok_re).match
+        self.func_map = {pair[0]: pair[2] for pair in tok_spec}
+
+    def feed(self, txt):
+        """ Feeds the lexer with extra input """
+        self.tokens = self.tokenize(txt)
 
-    # Parse line:
-    line = txt
-    mo = gettok(line)
-    pos = 0
-    while mo:
-        typ = mo.lastgroup
-        val = mo.group(typ)
-        func = func_map[typ]
-        if func:
-            typ, val = func(typ, val)
-            yield Token(typ, val)
-        pos = mo.end()
-        mo = gettok(line, pos)
-    if len(line) != pos:
-        raise CompilerError('Lex fault at {}'.format(line[pos:]))
+    def tokenize(self, txt):
+        """ Generator that generates tokens from text """
+        mo = self.gettok(txt)
+        pos = 0
+        while mo:
+            typ = mo.lastgroup
+            val = mo.group(typ)
+            func = self.func_map[typ]
+            if func:
+                typ, val = func(typ, val)
+                yield Token(typ, val)
+            pos = mo.end()
+            mo = self.gettok(txt, pos)
+        if len(txt) != pos:
+            raise CompilerError('Lex fault at {}'.format(txt[pos:]))
+
+    def next_token(self):
+        try:
+            return self.tokens.__next__()
+        except StopIteration:
+            return Token(EOF, EOF)
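The new BaseLexer bundles the regex construction, feed() and next_token() into one reusable class. A minimal usage sketch follows; the token names and input are made up for illustration, Token comes from ppci, EOF from pyyacc, and spec entries whose handler is None are dropped by tokenize():

    from baselex import BaseLexer
    from pyyacc import EOF

    spec = [
        ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),  # convert matched text to int
        ('ID', r'[A-Za-z]\w*', lambda typ, val: (typ, val)),   # keep identifiers unchanged
        ('SKIP', r'[ \t]', None),                              # None handler: token is not emitted
    ]
    lexer = BaseLexer(spec)
    lexer.feed('answer 42')
    tok = lexer.next_token()
    while tok.typ != EOF:            # next_token() returns Token(EOF, EOF) at end of input
        print(tok.typ, tok.val)      # ID answer, then NUMBER 42
        tok = lexer.next_token()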
--- a/python/ppci/assembler.py  Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/assembler.py  Sun Apr 27 12:24:21 2014 +0200
@@ -1,6 +1,7 @@
 
 import re
 import pyyacc
+from baselex import BaseLexer
 from . import Token, CompilerError, SourceLocation
 from .target import Target, Label
 
@@ -14,73 +15,34 @@
             t = 'val{}'.format(n)
     return t
 
-def tokenize(s, kws):
-    """
-    Tokenizer, generates an iterator that
-    returns tokens!
-    This GREAT example was taken from python re doc page!
-    """
-    tok_spec = [
-        ('REAL', r'\d+\.\d+'),
-        ('HEXNUMBER', r'0x[\da-fA-F]+'),
-        ('NUMBER', r'\d+'),
-        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-        ('SKIP', r'[ \t]'),
-        ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{'),
-        ('STRING', r"'.*?'"),
-        ('COMMENT', r";.*")
-    ]
-    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-    gettok = re.compile(tok_re).match
-    line = 1
-    pos = line_start = 0
-    mo = gettok(s)
-    while mo is not None:
-        typ = mo.lastgroup
-        val = mo.group(typ)
-        if typ == 'NEWLINE':
-            line_start = pos
-            line += 1
-        elif typ != 'SKIP':
-            if typ == 'LEESTEKEN':
-                typ = val
-            elif typ == 'NUMBER':
-                val = int(val)
-            elif typ == 'HEXNUMBER':
-                val = int(val[2:], 16)
-                typ = 'NUMBER'
-            elif typ == 'REAL':
-                val = float(val)
-            elif typ == 'STRING':
-                val = val[1:-1]
-            elif typ == 'ID':
-                if val.lower() in kws:  # ['r3', 'sp', 'add', 'yield', 'r4', 'r0', 'r1', 'sub', 'r5', 'r6', 'r2']:
-                    typ = val.lower()
-            col = mo.start() - line_start
-            loc = SourceLocation('', line, col, 0)   # TODO retrieve length?
-            if typ == 'NUMBER':
-                typ = bit_type(val)
-            yield Token(typ, val, loc)
-        pos = mo.end()
-        mo = gettok(s, pos)
-    if pos != len(s):
-        col = pos - line_start
-        loc = SourceLocation('', line, col, 0)
-        raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)
-    yield Token('EOF', pyyacc.EOF)
+class AsmLexer(BaseLexer):
+    def __init__(self, kws):
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', self.handle_number),
+            ('NUMBER', r'\d+', self.handle_number),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('SKIP', r'[ \t]', None),
+            ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{', lambda typ, val: (val, val)),
+            ('STRING', r"'.*?'", lambda typ, val: (typ, val[1:-1])),
+            ('COMMENT', r";.*", None)
+        ]
+        super().__init__(tok_spec)
+        self.kws = kws
 
-
-class Lexer:
-    def __init__(self, src, kws):
-        self.tokens = tokenize(src, kws)
-        self.curTok = self.tokens.__next__()
+    def handle_id(self, typ, val):
+        if val.lower() in self.kws:
+            typ = val.lower()
+        return (typ, val)
 
-    def next_token(self):
-        t = self.curTok
-        if t.typ != 'EOF':
-            self.curTok = self.tokens.__next__()
-        return t
+    def handle_number(self, typ, val):
+        if val.startswith('0x'):
+            val = int(val[2:], 16)
+        else:
+            val = int(val)
+        typ = bit_type(val)
+        return typ, val
 
 
 class Parser:
@@ -142,6 +104,7 @@
     def make_parser(self):
         self.parser = Parser(self.target.asm_keywords, self.target.assembler_rules, self.emit)
+        self.lexer = AsmLexer(self.target.asm_keywords)
 
     def emit(self, *args):
         self.stream.emit(*args)
@@ -149,8 +112,8 @@
     # Top level interface:
     def parse_line(self, line):
         """ Parse line into assembly instructions """
-        tokens = Lexer(line, self.target.asm_keywords)
-        self.parser.parse(tokens)
+        self.lexer.feed(line)
+        self.parser.parse(self.lexer)
 
     def assemble(self, asmsrc, stream):
         """ Assemble this source snippet """
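In the rewritten assembler lexer the per-token post-processing moves into handler callbacks: every tok_spec entry carries a function that may rewrite (typ, val), so keyword detection and number conversion live in handle_id and handle_number instead of the old if/elif chain, and the parser now pulls tokens straight from the lexer object. A rough sketch of how the class is driven; the keyword list here is invented, the assembler itself passes target.asm_keywords:

    from ppci.assembler import AsmLexer

    lexer = AsmLexer(['mov', 'add'])     # hypothetical keyword set
    lexer.feed('mov r1, 0x10')
    # handle_id turns known keywords into their own token type ('mov'),
    # handle_number parses hex or decimal and picks a width class via bit_type().
    tok = lexer.next_token()
    while tok.typ != 'EOF':
        print(tok.typ, tok.val)
        tok = lexer.next_token()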
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/python/ppci/layout.py  Sun Apr 27 12:24:21 2014 +0200
@@ -0,0 +1,58 @@
+
+class Layout:
+    def __init__(self):
+        self.mems = []
+
+    def __eq__(self, other):
+        return self.mems == other.mems
+
+
+class Memory:
+    def __init__(self, address=0x0):
+        self.inputs = []
+        self.address = address
+        self.size = 0x0
+
+    def add_input(self, inp):
+        assert isinstance(inp, Input)
+        self.inputs.append(inp)
+
+
+class Input:
+    pass
+
+
+class SectionInput(Input):
+    def __init__(self, section_name):
+        self.section_name = section_name
+
+
+def load_layout(f):
+    return deserialize(json.load(f))
+
+
+def make_int(txt):
+    if txt.startswith('0x'):
+        return int(txt[2:], 16)
+    else:
+        return int(txt)
+
+
+class LayoutParser:
+    def __init__(self):
+        toks = ['ID', '{', '}', 'MEMORY', 'ALIGN', '.', pyyacc.EPS, pyyacc.EOF]
+        g = pyyacc.Grammar(toks)
+        g.add_production('layout', ['MEMORY', '{', 'input_list', '}'])
+        g.add_production('input_list', ['MEMORY', '{', 'input_list', '}'])
+
+
+def deserialize(d):
+    layout = Layout()
+    for mem_node in d['memories']:
+        m = Memory()
+        m.address = make_int(mem_node['address'])
+        m.size = make_int(mem_node['size'])
+        for input_node in mem_node['inputs']:
+            pass
+    return layout
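The new layout.py is still a skeleton: load_layout() relies on json and LayoutParser on pyyacc, neither of which the file imports yet, and deserialize() does not build Input objects. Judging from the fields deserialize() reads, the JSON it expects looks roughly like the sketch below; the field names "memories", "address", "size" and "inputs" come from the code, the concrete values are invented:

    import json

    # Hypothetical layout description accepted by deserialize()
    example = '{"memories": [{"address": "0x20000", "size": "0x800", "inputs": []}]}'
    d = json.loads(example)
    for mem_node in d['memories']:
        # deserialize() converts these strings with make_int(), accepting 0x-prefixed hex or plain decimal
        print(mem_node['address'], mem_node['size'])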
--- a/python/ppci/objectfile.py  Sat Apr 26 17:41:56 2014 +0200
+++ b/python/ppci/objectfile.py  Sun Apr 27 12:24:21 2014 +0200
@@ -139,16 +139,23 @@
     return res
 
 
+def make_int(txt):
+    if txt.startswith('0x'):
+        return int(txt[2:], 16)
+    else:
+        return int(txt)
+
+
 def deserialize(d):
     obj = ObjectFile()
     for section in d['sections']:
         so = obj.get_section(section['name'])
-        so.address = int(section['address'][2:], 16)
+        so.address = make_int(section['address'])
         so.data = bytearray(binascii.unhexlify(section['data'].encode('ascii')))
     for reloc in d['relocations']:
-        obj.add_relocation(reloc['symbol'], int(reloc['offset'][2:], 16),
+        obj.add_relocation(reloc['symbol'], make_int(reloc['offset']),
             reloc['type'], reloc['section'])
     for sym in d['symbols']:
-        obj.add_symbol(sym['name'], int(sym['value'][2:], 16), sym['section'])
+        obj.add_symbol(sym['name'], make_int(sym['value']), sym['section'])
     return obj
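With make_int() the serialized addresses, relocation offsets and symbol values may be written either as 0x-prefixed hex or as plain decimal, instead of always being assumed hex. A self-contained check of that behaviour; the helper is copied here so the snippet runs on its own:

    def make_int(txt):
        # same logic as the helper added above
        if txt.startswith('0x'):
            return int(txt[2:], 16)
        else:
            return int(txt)

    assert make_int('0x20') == 32   # hex string
    assert make_int('32') == 32     # decimal string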
--- a/python/pyburg.py  Sat Apr 26 17:41:56 2014 +0200
+++ b/python/pyburg.py  Sun Apr 27 12:24:21 2014 +0200
@@ -60,7 +60,7 @@
 import types
 import argparse
 from ppci import Token
-from pyyacc import ParserException, EOF
+from pyyacc import ParserException
 import yacc
 import baselex
 from tree import Tree
@@ -70,8 +70,8 @@
     burg_parser = yacc.load_as_module(spec_file)
 
 
-class BurgLexer:
-    def feed(self, txt):
+class BurgLexer(baselex.BaseLexer):
+    def __init__(self):
         tok_spec = [
             ('id', r'[A-Za-z][A-Za-z\d_]*', lambda typ, val: (typ, val)),
             ('kw', r'%[A-Za-z][A-Za-z\d_]*', lambda typ, val: (val, val)),
@@ -80,36 +80,28 @@
             ('OTHER', r'[:;\|\(\),]', lambda typ, val: (val, val)),
             ('SKIP', r'[ ]', None)
         ]
+        super().__init__(tok_spec)
 
+    def tokenize(self, txt):
         lines = txt.split('\n')
         header_lines = []
-
-        def tokenize():
-            section = 0
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue  # Skip empty lines
-                elif line == '%%':
-                    section += 1
-                    if section == 1:
-                        yield Token('header', header_lines)
-                    yield Token('%%', '%%')
+        section = 0
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue  # Skip empty lines
+            elif line == '%%':
+                section += 1
+                if section == 1:
+                    yield Token('header', header_lines)
+                yield Token('%%', '%%')
+            else:
+                if section == 0:
+                    header_lines.append(line)
                 else:
-                    if section == 0:
-                        header_lines.append(line)
-                    else:
-                        for tk in baselex.tokenize(tok_spec, line):
-                            yield tk
-            yield Token(EOF, EOF)
-        self.tokens = tokenize()
-        self.token = self.tokens.__next__()
-
-    def next_token(self):
-        t = self.token
-        if t.typ != EOF:
-            self.token = self.tokens.__next__()
-        return t
+                    # we could use yield from below, but python 3.2 does not work then:
+                    for tk in super().tokenize(line):
+                        yield tk
 
 
 class Rule:
@@ -317,6 +309,7 @@
         default=sys.stdout)
     return parser
 
+
 def load_as_module(filename):
     """ Load a parser spec file, generate LR tables and create module """
     ob = io.StringIO()
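BurgLexer now inherits feed() and next_token() from baselex.BaseLexer and only overrides tokenize() to handle the %% sections line by line, delegating single-line lexing back to the base class (the in-code comment notes that yield from would be the shorter spelling on Python 3.3 and later). The same override pattern in isolation, with a made-up token spec and a hypothetical subclass name:

    from baselex import BaseLexer

    class LineLexer(BaseLexer):          # hypothetical subclass, for illustration only
        def __init__(self):
            super().__init__([('word', r'\w+', lambda typ, val: (typ, val)),
                              ('SKIP', r'[ \t]', None)])

        def tokenize(self, txt):
            # re-tokenize per line, reusing the regex machinery of the base class
            for line in txt.split('\n'):
                if line:
                    for tk in super().tokenize(line):   # or: yield from super().tokenize(line)
                        yield tk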
--- a/test/testasm.py  Sat Apr 26 17:41:56 2014 +0200
+++ b/test/testasm.py  Sun Apr 27 12:24:21 2014 +0200
@@ -2,7 +2,7 @@
 import unittest
 
 from ppci import CompilerError
-from ppci.assembler import tokenize
+from ppci.assembler import AsmLexer
 from ppci.objectfile import ObjectFile
 from ppci.outstream import BinaryOutputStream
 from ppci.target.basetarget import Label
@@ -12,26 +12,38 @@
 
 class AssemblerLexingCase(unittest.TestCase):
     """ Tests the assemblers lexer """
 
+    def setUp(self):
+        self.lexer = AsmLexer([])
+
+    def do(self, asmline, toks):
+        output = []
+        self.lexer.feed(asmline)
+        while 'EOF' not in output:
+            output.append(self.lexer.next_token().typ)
+        self.assertSequenceEqual(toks, output)
+
     def testLex0(self):
         """ Check if the lexer is OK """
-        asmline, toks = 'mov rax, rbx ', ['ID', 'ID', ',', 'ID', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        asmline = 'mov rax, rbx '
+        toks = ['ID', 'ID', ',', 'ID', 'EOF']
+        self.do(asmline, toks)
 
     def testLex1(self):
         """ Test if lexer correctly maps some tokens """
-        asmline, toks = 'lab1: mov rax, rbx ', ['ID', ':', 'ID', 'ID', ',', 'ID', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        asmline = 'lab1: mov rax, rbx '
+        toks = ['ID', ':', 'ID', 'ID', ',', 'ID', 'EOF']
+        self.do(asmline, toks)
 
     def testLex2(self):
         """ Test if lexer correctly maps some tokens """
         asmline, toks = 'mov 3.13 0xC 13', ['ID', 'REAL', 'val5', 'val5', 'EOF']
-        self.assertSequenceEqual([tok.typ for tok in tokenize(asmline, [])], toks)
+        self.do(asmline, toks)
 
     def testLex3(self):
         """ Test if lexer fails on a token that is invalid """
         asmline = '0z4: mov rax, rbx $ '
         with self.assertRaises(CompilerError):
-            list(tokenize(asmline, []))
+            self.do(asmline, [])
 
 
 class OustreamTestCase(unittest.TestCase):