view python/ppci/c3/lexer.py @ 300:158068af716c

yafm
author Windel Bouwman
date Tue, 03 Dec 2013 18:00:22 +0100
parents python/c3/lexer.py@6aa721e7b10b
children 0615b5308710
line wrap: on
line source

import collections
import re

from ppci import CompilerError, SourceLocation, Token

"""
 Lexical analyzer part. Splits the input character stream into tokens.
"""

# Reserved words of the language. An identifier whose text appears in this
# list is emitted by the lexer with the word itself as its token type.
keywords = [
    # boolean operators and literals
    'and', 'or', 'not', 'true', 'false',
    # control flow
    'else', 'if', 'while', 'return',
    # declarations
    'function', 'var', 'type', 'const',
    # types and conversions
    'struct', 'cast',
    # module system
    'import', 'module',
]


class Lexer:
    """Lexical analyzer: splits source text into a stream of tokens.

    The source text of every tokenized file is registered with the
    diagnostics object passed to the constructor (via ``addSource``).
    """

    # Token classes and their regular expressions. Order matters: the
    # alternatives are tried left to right, so REAL must precede NUMBER,
    # HEXNUMBER must precede NUMBER, and the comment openers must precede
    # the single character '/' and '*' punctuation in LEESTEKEN.
    _TOK_SPEC = [
        ('REAL', r'\d+\.\d+'),
        ('HEXNUMBER', r'0x[\da-fA-F]+'),
        ('NUMBER', r'\d+'),
        ('ID', r'[A-Za-z][A-Za-z\d_]*'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
        ('COMMENTS', r'//.*'),
        ('LONGCOMMENTBEGIN', r'\/\*'),
        ('LONGCOMMENTEND', r'\*\/'),
        ('LEESTEKEN', r'==|->|<<|>>|!=|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
        ('STRING', r"'.*?'"),
    ]
    # Compiled once for the class instead of on every tokenize() call.
    _TOK_RE = re.compile(
        '|'.join('(?P<%s>%s)' % pair for pair in _TOK_SPEC))

    def __init__(self, diag):
        # Diagnostics sink; tokenize() calls diag.addSource(filename, text).
        self.diag = diag

    def tokenize(self, input_file):
        """
           Tokenizer, generates an iterator that returns tokens.

           Based on the tokenizer example from the python re doc page.

           input_file is a file like object. It is read completely and
           then closed (also when reading fails). Its ``name`` attribute,
           when present, is used as the filename in source locations;
           otherwise the filename is the empty string.

           Yields Token(typ, val, loc) objects:
             - keywords and punctuation use their own text as typ,
             - decimal and hexadecimal literals yield typ 'NUMBER' with an
               int value,
             - 'REAL' tokens carry a float value,
             - 'STRING' tokens carry the text without the enclosing quotes,
             - other identifiers yield typ 'ID'.
           A final Token('END', '', loc) is yielded after the last token.

           Comments (``// ...`` up to end of line and ``/* ... */``) are
           skipped. The body of a block comment is not tokenized, so it
           may contain any character. A block comment that is never
           closed extends to the end of the input.

           Raises CompilerError when a character outside of a comment
           cannot start any token.
        """
        filename = input_file.name if hasattr(input_file, 'name') else ''
        try:
            s = input_file.read()
        finally:
            # Release the file even if read() raises.
            input_file.close()
        self.diag.addSource(filename, s)

        gettok = self._TOK_RE.match
        line = 1
        # line_start is the index of the newline that ended the previous
        # line (0 for the first line).
        # NOTE(review): as a result columns are 0-based on the first line
        # and 1-based on later lines; kept as is because the consumers of
        # SourceLocation are not visible here -- confirm before changing.
        pos = line_start = 0
        mo = gettok(s)
        while mo is not None:
            typ = mo.lastgroup
            val = mo.group(typ)
            if typ == 'NEWLINE':
                line_start = pos
                line += 1
            elif typ == 'LONGCOMMENTBEGIN':
                # Jump straight to the closing '*/' instead of tokenizing
                # the comment body: otherwise a '//' or a quoted string
                # inside the comment could swallow the terminator, and
                # characters that match no token would raise an error.
                body_start = mo.end()
                close = s.find('*/', body_start)
                comment_end = len(s) if close < 0 else close + 2
                last_newline = s.rfind('\n', body_start, comment_end)
                if last_newline >= 0:
                    # Keep line/column bookkeeping in sync with the
                    # newlines that were skipped.
                    line += s.count('\n', body_start, comment_end)
                    line_start = last_newline
                pos = comment_end
                mo = gettok(s, pos)
                continue
            elif typ in ('COMMENTS', 'SKIP', 'LONGCOMMENTEND'):
                # Line comments, blanks and a stray '*/' outside of a
                # comment produce no token.
                pass
            else:
                if typ == 'ID':
                    if val in keywords:
                        typ = val
                elif typ == 'LEESTEKEN':
                    # Punctuation: the token type is the symbol itself.
                    typ = val
                elif typ == 'NUMBER':
                    val = int(val)
                elif typ == 'HEXNUMBER':
                    # Drop the '0x' prefix; report as an ordinary NUMBER.
                    val = int(val[2:], 16)
                    typ = 'NUMBER'
                elif typ == 'REAL':
                    val = float(val)
                elif typ == 'STRING':
                    # Strip the enclosing single quotes.
                    val = val[1:-1]
                loc = SourceLocation(filename, line, mo.start() - line_start,
                        mo.end() - mo.start())
                yield Token(typ, val, loc)
            pos = mo.end()
            mo = gettok(s, pos)
        if pos != len(s):
            # The scan stopped before the end of input: s[pos] cannot
            # start any token.
            col = pos - line_start
            loc = SourceLocation(filename, line, col, 1)
            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
        loc = SourceLocation(filename, line, 0, 0)
        yield Token('END', '', loc)