view python/ppci/assembler.py @ 347:742588fb8cd6 devel

Merge into devel branch
author Windel Bouwman
date Fri, 07 Mar 2014 17:10:21 +0100
parents 3bb7dcfe5529
children 19eacf4f7270
line wrap: on
line source


import re
import pyyacc
from . import Token, CompilerError, SourceLocation
from .target import Target, Label


def bit_type(value):
    """
       Give the token type name of the narrowest bit field that can
       hold value.

       The widths tried are 3, 5, 8, 12 and 16 bits, anything larger is
       reported as 'val32'. The value must lie in the range 0 .. 2**32 - 1,
       this is checked with assertions.
    """
    assert value < (2**32)
    assert value >= 0
    # Walk the widths from narrow to wide, the first one that fits wins:
    for width in (3, 5, 8, 12, 16):
        if value < (2**width):
            return 'val{}'.format(width)
    return 'val32'

def tokenize(s, kws):
    """
       Generator that cuts the text s into Token objects.

       Blanks and tabs are dropped, every other match is yielded:
       punctuation uses its own text as token type, an identifier whose
       lower case form is in kws uses that lower case form as token type,
       and (hex) numbers are converted to int and typed by bit_type.
       Reals become float and strings lose their quotes.
       The last token yielded is always the EOF token.

       A CompilerError is raised when the text contains a character that
       no pattern accepts.

       The scanning loop follows the tokenizer example of the python re
       documentation.
    """
    tok_spec = [
        ('REAL', r'\d+\.\d+'),
        ('HEXNUMBER', r'0x[\da-fA-F]+'),
        ('NUMBER', r'\d+'),
        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
        ('SKIP', r'[ \t]'),
        ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<|}|{'),
        ('STRING', r"'.*?'"),
        ('COMMENT', r";.*")
    ]
    # One big alternation with a named group per token class:
    tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
    scan = re.compile(tok_re).match
    line = 1
    line_start = 0
    pos = 0
    match = scan(s)
    while match is not None:
        kind = match.lastgroup
        value = match.group(kind)
        if kind == 'NEWLINE':
            # tok_spec holds no NEWLINE pattern, so this line bookkeeping
            # stays idle unless such a pattern is added.
            line_start = pos
            line += 1
        elif kind != 'SKIP':
            if kind == 'LEESTEKEN':
                # Punctuation: the text itself is the token type
                kind = value
            elif kind == 'NUMBER':
                value = int(value)
            elif kind == 'HEXNUMBER':
                # Drop the 0x prefix, from here on it is a plain number
                value = int(value[2:], 16)
                kind = 'NUMBER'
            elif kind == 'REAL':
                value = float(value)
            elif kind == 'STRING':
                # Strip the surrounding quotes
                value = value[1:-1]
            elif kind == 'ID':
                lowered = value.lower()
                if lowered in kws:
                    # Keywords are matched case insensitive
                    kind = lowered
            col = match.start() - line_start
            loc = SourceLocation('', line, col, 0)   # TODO retrieve length?
            if kind == 'NUMBER':
                # Numbers are typed by the size of field they fit in
                kind = bit_type(value)
            yield Token(kind, value, loc)
        pos = match.end()
        match = scan(s, pos)
    if pos != len(s):
        # The scanner got stuck before the end of the text
        col = pos - line_start
        loc = SourceLocation('', line, col, 0)
        raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)
    yield Token('EOF', pyyacc.EOF)


class Lexer:
    """ Token supplier that keeps one token of lookahead in curTok. """
    def __init__(self, src, kws):
        self.tokens = tokenize(src, kws)
        # Fetch the first token right away to fill the lookahead:
        self.curTok = next(self.tokens)

    def next_token(self):
        """ Hand out the current token and advance to the next one.

            When the current token is the EOF token no advance is made,
            so EOF is returned again on every later call.
        """
        current = self.curTok
        if current.typ == 'EOF':
            return current
        self.curTok = next(self.tokens)
        return current


class Parser:
    """ Parser for one line of assembly, built from a pyyacc grammar.

        The grammar consists of a fixed line structure, the rules given
        for the target and a small expression sub grammar. Labels and
        instructions that get recognized are passed to the emit callable.
    """
    def add_rule(self, prod, rhs, f):
        """ Add a target rule to the grammar.

            f is called with one argument: the tuple of values of the right
            hand side. For an 'instruction' production the result of f is
            handed to emit, for any other production it is returned as the
            value of the production.
        """
        if prod == 'instruction':
            def handler(*args):
                # Instructions are emitted, the production has no value
                self.emit(f(args))
        else:
            def handler(*args):
                return f(args)
        self.g.add_production(prod, rhs, handler)

    def __init__(self, kws, instruction_rules, emit):
        self.kws = kws
        self.emit = emit
        # The terminals of the grammar, target keywords are appended:
        terminals = ['ID', 'NUMBER', ',', '[', ']', ':', '+', '-', '*',
                     pyyacc.EPS, 'COMMENT', '{', '}',
                     pyyacc.EOF, 'val32', 'val16', 'val12', 'val8', 'val5', 'val3']
        terminals.extend(kws)
        grammar = pyyacc.Grammar(terminals)
        self.g = grammar

        # Global structure of assembly line, these have no action:
        line_structure = [
            ('asmline', ['asmline2']),
            ('asmline', ['asmline2', 'COMMENT']),
            ('asmline2', ['label', 'instruction']),
            ('asmline2', ['instruction']),
            ('asmline2', ['label']),
            ('asmline2', []),
        ]
        for lhs, symbols in line_structure:
            grammar.add_production(lhs, symbols)
        grammar.add_production('label', ['ID', ':'], self.p_label)

        # Add instruction rules for the target in question:
        for prod, rhs, f in instruction_rules:
            self.add_rule(prod, rhs, f)

        # Expressions made of symbols and numbers:
        grammar.add_production('expression', ['term'], lambda x: x)
        grammar.add_production(
            'expression', ['expression', 'addop', 'term'], self.p_binop)
        grammar.add_production('addop', ['-'], lambda x: x.val)
        grammar.add_production('addop', ['+'], lambda x: x.val)
        grammar.add_production('mulop', ['*'], lambda x: x.val)
        grammar.add_production('term', ['factor'], lambda x: x)
        grammar.add_production(
            'term', ['term', 'mulop', 'factor'], self.p_binop)
        grammar.add_production(
            'factor', ['ID'], lambda name: ASymbol(name.val))
        grammar.add_production(
            'factor', ['NUMBER'], lambda num: ANumber(int(num.val)))
        grammar.start_symbol = 'asmline'
        self.p = grammar.generate_parser()

    # Parser handlers:
    def p_ins_1(self, opc, ops):
        """ Emit an AInstruction made of opc and the operands ops. """
        self.emit(AInstruction(opc, ops))

    def p_ins_2(self, opc):
        """ Emit an instruction that has an empty operand list. """
        self.p_ins_1(opc, [])

    def p_operands_1(self, op1):
        """ Start an operand list with its first operand. """
        return [op1]

    def p_operands_2(self, ops, comma, op2):
        """ Extend the operand list ops in place with op2 and return it. """
        assert type(ops) is list
        ops.append(op2)
        return ops

    def p_listitems_1(self, li1):
        """ Start an item list with its first item. """
        return [li1]

    def p_listitems_2(self, lis, comma, li2):
        """ Extend the item list lis in place with li2 and return it. """
        assert type(lis) is list
        lis.append(li2)
        return lis

    def p_list_op(self, brace_open, lst, brace_close):
        """ Wrap lst in an AUnop with operator '{}'. """
        return AUnop('{}', lst)

    def p_mem_op(self, brace_open, exp, brace_close):
        """ Wrap exp in an AUnop with operator '[]'. """
        return AUnop('[]', exp)

    def p_label(self, lname, cn):
        """ Emit a Label named after the value of the token lname. """
        self.emit(Label(lname.val))

    def p_binop(self, exp1, op, exp2):
        """ Combine two expressions and an operator into an ABinop. """
        return ABinop(op, exp1, exp2)

    def parse(self, lexer):
        """ Run the generated parser on the tokens of lexer. """
        self.p.parse(lexer)


class Assembler:
    """ Assembler front end for one target.

        Source text is parsed line by line with a Parser built from the
        keywords and rules of the target. Everything the parser recognizes
        is forwarded to the output stream given to assemble().
    """
    def __init__(self, target):
        self.target = target
        assert isinstance(target, Target)
        # The output stream is only attached for the duration of assemble():
        self.stream = None
        self.parser = Parser(target.asm_keywords, target.assembler_rules, self.emit)

    def emit(self, *args):
        """ Pass args on to the emit method of the current output stream. """
        self.stream.emit(*args)

    # Top level interface:
    def parse_line(self, line):
        """ Parse line into assembly instructions """
        tokens = Lexer(line, self.target.asm_keywords)
        self.parser.parse(tokens)

    def assemble(self, asmsrc, stream):
        """ Assemble this source snippet

            asmsrc is either a string or an object with a read method. In
            the latter case it is read completely and closed. The text is
            split on newline characters and every line is parsed on its own.
            stream receives the output through its emit method.

            The stream is detached again when this method ends, also when
            parsing a line raises an exception. The exception itself is
            passed on to the caller.
        """
        if hasattr(asmsrc, 'read'):
            src_file = asmsrc
            try:
                asmsrc = src_file.read()
            finally:
                # Close the file even when reading it failed
                src_file.close()
        # TODO: use generic newline??
        # TODO: the bothersome newline ...
        self.stream = stream
        try:
            for line in asmsrc.split('\n'):
                self.parse_line(line)
        finally:
            # Never keep a stale stream around after an error
            self.stream = None