view python/libasm.py @ 195:37ac6c016e0f

Expanded asm subsystem
author Windel Bouwman
date Fri, 31 May 2013 21:06:44 +0200
parents b01429a5d695
children ec2b423cdbea
line wrap: on
line source

import re
import pyyacc
from ppci import Token, CompilerError, SourceLocation

# Different instruction sets:
class InstructionSet:
    """ Common base class for the different instruction sets. """

class X86(InstructionSet):
    """ InstructionSet subclass named X86; it adds no members of its own yet. """

# Generic assembler:
# Reserved words: tokenize() reports an identifier found in this list with
# the word itself as token type instead of the generic 'ID' type.
keywords = ['global', 'db']

def tokenize(s):
    """
      Split the source text s into tokens.

      This is a generator. Blanks and tabs are dropped, every other
      lexical element is yielded as Token(type, value, location):

      - identifiers listed in keywords and all punctuation use their own
        text as token type,
      - decimal and hexadecimal literals become NUMBER tokens carrying an
        int value,
      - REAL tokens carry a float value,
      - STRING tokens carry the text between the quotes.

      A CompilerError is raised when no pattern matches at the current
      position.

      Modelled after the tokenizer example on the python re doc page.
    """
    # Order matters: the first alternative that matches wins, so REAL and
    # HEXNUMBER must be tried before NUMBER.
    tok_spec = [
        ('REAL', r'\d+\.\d+'),
        ('HEXNUMBER', r'0x[\da-fA-F]+'),
        ('NUMBER', r'\d+'),
        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
        ('SKIP', r'[ \t]'),
        ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<'),
        ('STRING', r"'.*?'")
    ]
    master_pattern = '|'.join(
        '(?P<{0}>{1})'.format(name, pattern) for name, pattern in tok_spec)
    match_at = re.compile(master_pattern).match
    line = 1
    pos = line_start = 0
    while True:
        mo = match_at(s, pos)
        if mo is None:
            break
        kind = mo.lastgroup
        text = mo.group(kind)
        # Note: tok_spec currently defines no NEWLINE or COMMENTS pattern,
        # so those two kinds never show up here.
        if kind == 'NEWLINE':
            line_start = pos
            line += 1
        elif kind not in ('COMMENTS', 'SKIP'):
            typ, val = kind, text
            if kind == 'ID' and text in keywords:
                typ = text
            elif kind == 'LEESTEKEN':
                typ = text
            elif kind == 'NUMBER':
                val = int(text)
            elif kind == 'HEXNUMBER':
                # Drop the '0x' prefix and report as an ordinary number
                typ, val = 'NUMBER', int(text[2:], 16)
            elif kind == 'REAL':
                val = float(text)
            elif kind == 'STRING':
                # Strip the surrounding quotes
                val = text[1:-1]
            column = mo.start() - line_start
            loc = SourceLocation(line, column, 0)   # TODO retrieve length?
            yield Token(typ, val, loc)
        pos = mo.end()
    # Matching stopped before the end of the text: report the offender
    if pos != len(s):
        column = pos - line_start
        loc = SourceLocation(line, column, 0)
        raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)

class Lexer:
    """
      One token look-ahead wrapper around tokenize().

      curTok holds the token that eat() will return next. Once the token
      stream is exhausted curTok becomes None, so the last token can still
      be consumed and an empty source does not make the constructor fail.
    """
    def __init__(self, src):
        self.tokens = tokenize(src)
        # None marks "no more tokens" (also for an empty source)
        self.curTok = next(self.tokens, None)

    def eat(self):
        """ Return the current token and advance to the next one.

            Returns None when all tokens have been consumed.
        """
        t = self.curTok
        # Fetching the look-ahead must not raise at the end of the stream,
        # otherwise the final token could never be returned.
        self.curTok = next(self.tokens, None)
        return t

    @property
    def Peak(self):
        """ The current token without consuming it (None at the end). """
        return self.curTok

class ANode:
    """ Base class of the assembly AST nodes.

        Two nodes compare equal when their textual representations match.
    """
    def __eq__(self, other):
        # Use the repr() builtin instead of calling other.__repr__()
        # directly: the direct call raises TypeError when other is a
        # class object, the builtin works for any object.
        return repr(self) == repr(other)

class ALabel(ANode):
    """ AST node for a label; holds the label name. """
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # The name followed by a colon
        return '{}:'.format(self.name)

class AInstruction(ANode):
    """ AST node for an instruction: an opcode plus a list of operands. """
    def __init__(self, opcode, operands):
        self.opcode = opcode
        self.operands = operands

    def __repr__(self):
        # Opcode, a space, then the operands separated by ', '
        operand_text = ', '.join(str(operand) for operand in self.operands)
        return '{} {}'.format(self.opcode, operand_text)

class AExpression(ANode):
    """ Base class for expression nodes.

        The + and * operators build ABinop nodes instead of computing a
        value.
    """
    def __add__(self, other):
        return ABinop(op='+', arg1=self, arg2=other)

    def __mul__(self, other):
        return ABinop(op='*', arg1=self, arg2=other)

class ABinop(AExpression):
    """ Expression node for a binary operation on two arguments. """
    def __init__(self, op, arg1, arg2):
        self.op = op
        self.arg1 = arg1
        self.arg2 = arg2

    def __repr__(self):
        # Prefix form: the operator first, then both arguments
        return '{op} {lhs} {rhs}'.format(
            op=self.op, lhs=self.arg1, rhs=self.arg2)

class AUnop(AExpression):
    """ Expression node for a unary operation on one argument. """
    def __init__(self, op, arg):
        self.op = op
        self.arg = arg

    def __repr__(self):
        # The operator followed by its argument
        return '{op} {arg}'.format(op=self.op, arg=self.arg)

class ASymbol(AExpression):
    """ Expression node for a symbol, identified by its name. """
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # A symbol is represented by its bare name
        symbol_name = self.name
        return symbol_name

class ANumber(AExpression):
    """ Expression node for a numeric constant. """
    def __init__(self, n):
        self.n = n

    def __repr__(self):
        # Default formatting of the stored value
        return format(self.n)

class Assembler:
    """
        Line oriented assembler front end.

        The constructor builds a pyyacc grammar for a single assembly line
        and generates a parser from it. The p_* methods are the handlers
        attached to the grammar productions; labels and instructions that
        they build are appended to self.output through emit().
    """
    def __init__(self):
        self.output = []
        # Construct a parser given a grammar:
        ident = lambda x: x   # Identity helper function
        g = pyyacc.Grammar(['ID', 'NUMBER', ',', '[', ']', ':', '+', '-', '*', pyyacc.EPS])
        g.add_production('asmline', ['label', 'instruction'])
        g.add_production('asmline', ['instruction'])
        g.add_production('asmline', ['label'])
        g.add_production('label', ['ID', ':'], self.p_label)
        g.add_production('instruction', ['opcode', 'operands'], self.p_ins_1)
        g.add_production('instruction', ['opcode'], self.p_ins_2)
        g.add_production('opcode', ['ID'], ident)
        g.add_production('operands', ['operand'], self.p_operands_1)
        g.add_production('operands', ['operands', ',', 'operand'], self.p_operands_2)
        g.add_production('operand', ['expression'], ident)
        g.add_production('operand', ['[', 'expression', ']'], self.p_mem_op)
        g.add_production('expression', ['term'], ident)
        g.add_production('expression', ['expression', 'addop', 'term'], self.p_binop)
        g.add_production('addop', ['-'], ident)
        g.add_production('addop', ['+'], ident)
        g.add_production('mulop', ['*'], ident)
        g.add_production('term', ['factor'], ident)
        g.add_production('term', ['term', 'mulop', 'factor'], self.p_binop)
        g.add_production('factor', ['ID'], self.p_symbol)
        g.add_production('factor', ['NUMBER'], self.p_number)
        g.start_symbol = 'asmline'
        self.p = g.genParser()

    # Parser handlers:
    def p_ins_1(self, opc, ops):
        """ Instruction with operands: emit an AInstruction. """
        ins = AInstruction(opc, ops)
        self.emit(ins)

    def p_ins_2(self, opc):
        """ Instruction without operands: emit it with an empty list. """
        self.p_ins_1(opc, [])

    def p_operands_1(self, op1):
        """ First operand: start a new operand list. """
        return [op1]

    def p_operands_2(self, ops, comma, op2):
        """ Further operand: append it to the list built so far. """
        assert isinstance(ops, list)
        ops.append(op2)
        return ops

    def p_mem_op(self, brace_open, exp, brace_close):
        """ Bracketed expression: wrap it in an AUnop with operator '[]'. """
        return AUnop('[]', exp)

    def handle_ins(self, id0, operands):
        """ Emit an instruction with opcode id0 and the given operands. """
        # AInstruction requires the operand list as second argument
        ins = AInstruction(id0, operands)
        self.emit(ins)

    def p_label(self, lname, cn):
        """ Label definition: emit an ALabel (cn, the colon, is unused). """
        lab = ALabel(lname)
        self.emit(lab)

    def p_binop(self, exp1, op, exp2):
        """ Binary expression: combine both sides into an ABinop. """
        return ABinop(op, exp1, exp2)

    def p_symbol(self, name):
        """ Identifier used inside an expression: build an ASymbol. """
        return ASymbol(name)

    def p_number(self, n):
        """ Numeric literal: build an ANumber holding an int. """
        n = int(n)
        return ANumber(n)

    # Top level:
    def emit(self, a):
        """ Append a to the output list. """
        self.output.append(a)

    def parse_line(self, line):
        """ Parse line into asm AST

            The nodes produced while parsing are collected in self.output
            by the handlers; the return value is still a placeholder.
        """
        tokens = tokenize(line)
        self.p.parse(tokens)
        aast = 1 # TODO
        return aast

    def assemble(self, asmsrc):
        """ Assemble the source text asmsrc and return the result of the
            parser.
        """
        # NOTE(review): Parser is neither defined nor imported in the part
        # of this module visible here -- confirm where it comes from.
        lxr = Lexer(asmsrc)
        prsr = Parser(lxr)
        instructions = prsr.parse()
        return instructions

    def assembleLine(self, line):
        """ 
            Assemble a single source line. 
            Do not take newlines into account 
        """
        # The parsing method is called parse_line (there is no parseLine)
        aast = self.parse_line(line)
        self.assemble_aast(aast)

    def assemble_aast(self, at):
        """ Assemble a parsed asm line """
        pass