view python/libasm.py @ 193:f091e7d70996

Added even more checks
author Windel Bouwman
date Sun, 26 May 2013 23:58:59 +0200
parents 6b2bec5653f1
children b01429a5d695
line wrap: on
line source

import re
import pyyacc
from ppci import Token, CompilerError, SourceLocation

# Different instruction sets:
class InstructionSet:
   pass

class X86(InstructionSet):
   pass

# Generic assembler:
keywords = ['global', 'db']

def tokenize(s):
     """
       Tokenizer, generates an iterator that
       returns tokens!

       This GREAT example was taken from python re doc page!
     """
     tok_spec = [
       ('REAL', r'\d+\.\d+'),
       ('HEXNUMBER', r'0x[\da-fA-F]+'),
       ('NUMBER', r'\d+'),
       ('ID', r'[A-Za-z][A-Za-z\d_]*'),
       ('SKIP', r'[ \t]'),
       ('LEESTEKEN', r':=|[\.,=:\-+*\[\]/\(\)]|>=|<=|<>|>|<'),
       ('STRING', r"'.*?'")
     ]
     tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
     gettok = re.compile(tok_re).match
     line = 1
     pos = line_start = 0
     mo = gettok(s)
     while mo is not None:
       typ = mo.lastgroup
       val = mo.group(typ)
       if typ == 'NEWLINE':
         line_start = pos
         line += 1
       elif typ == 'COMMENTS':
         pass
       elif typ != 'SKIP':
         if typ == 'ID':
           if val in keywords:
             typ = val
         elif typ == 'LEESTEKEN':
           typ = val
         elif typ == 'NUMBER':
           val = int(val)
         elif typ == 'HEXNUMBER':
           val = int(val[2:], 16)
           typ = 'NUMBER'
         elif typ == 'REAL':
           val = float(val)
         elif typ == 'STRING':
           val = val[1:-1]
         col = mo.start() - line_start
         loc = SourceLocation(line, col, 0)   # TODO retrieve length?
         yield Token(typ, val, loc)
       pos = mo.end()
       mo = gettok(s, pos)
     if pos != len(s):
       col = pos - line_start
       loc = SourceLocation(line, col, 0)
       raise CompilerError('Unexpected character {0}'.format(s[pos]), loc)

class Lexer:
   def __init__(self, src):
      self.tokens = tokenize(src)
      self.curTok = self.tokens.__next__()
   def eat(self):
      t = self.curTok
      self.curTok = self.tokens.__next__()
      return t
   @property
   def Peak(self):
      return self.curTok

class Assembler:
    def __init__(self):
        # Construct a parser given a grammar:
        g = pyyacc.Grammar(['ID', 'NUMBER', ',', '[', ']', ':', '+', '-', pyyacc.EPS])

        g.add_production('asmline', ['label', 'instruction', 'operands'])
        g.add_production('label', ['ID', ':'])
        g.add_production('label', [pyyacc.EPS])   # label is optional
        g.add_production('instruction', ['ID'])
        g.add_production('operands', ['operand'])
        g.add_production('operands', ['operands', ',', 'operand'])
        g.add_production('operand', ['expression'])
        g.add_production('operand', ['[', 'expression', ']'])
        g.add_production('expression', ['term'])
        g.add_production('expression', ['expression', 'addop', 'term'])
        g.add_production('addop', ['-'])
        g.add_production('addop', ['+'])
        g.add_production('term', ['factor'])
        g.add_production('factor', ['ID'])
        g.add_production('factor', ['NUMBER'])
        # TODO: expand grammar
        g.start_symbol = 'asmline'

        self.p = g.genParser()

    def assemble(self, asmsrc):
      lxr = Lexer(asmsrc)
      prsr = Parser(lxr)
      instructions = prsr.parse()
      return instructions

    def assembleLine(self, line):
        """ 
            Assemble a single source line. 
            Do not take newlines into account 
        """
        tokens = tokenize(line)
        self.p.parse(tokens)

    def assembleAst(self, at):
        """ Assemble a parsed asm line """
        pass