diff python/ppci/c3/lexer.py @ 300:158068af716c

yafm
author Windel Bouwman
date Tue, 03 Dec 2013 18:00:22 +0100
parents python/c3/lexer.py@6aa721e7b10b
children 0615b5308710
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/ppci/c3/lexer.py	Tue Dec 03 18:00:22 2013 +0100
@@ -0,0 +1,94 @@
+import collections
+import re
+
+from ppci import CompilerError, SourceLocation, Token
+
+"""
+ Lexical analyzer part. Splits the input character stream into tokens.
+"""
+
+# Reserved words of the c3 language. When the ID rule matches one of these,
+# the token type becomes the keyword itself (see Lexer.tokenize below).
+keywords = ['and', 'or', 'not', 'true', 'false',
+        'else', 'if', 'while', 'return',
+        'function', 'var', 'type', 'const',
+        'struct', 'cast',
+        'import', 'module']
+
+
+class Lexer:
+    # Converts c3 source text into a stream of Token objects.
+    def __init__(self, diag):
+        # diag: diagnostics object; receives the source text via addSource
+        # so later errors can be reported with surrounding context.
+        self.diag = diag
+
+    def tokenize(self, input_file):
+        """
+           Tokenizer, generates an iterator that
+           returns tokens!
+
+           Input is a file like object.
+
+           This GREAT example was taken from python re doc page!
+        """
+        # Read the entire source at once; the tokenizer below scans it with
+        # repeated anchored regex matches. The input file is closed here.
+        filename = input_file.name if hasattr(input_file, 'name') else ''
+        s = input_file.read()
+        input_file.close()
+        self.diag.addSource(filename, s)
+        # Token specification as (name, regex) pairs. Order matters in the
+        # alternation: REAL and HEXNUMBER must precede NUMBER so that the
+        # longer numeric forms ('1.5', '0xff') are not split apart.
+        tok_spec = [
+           ('REAL', r'\d+\.\d+'),
+           ('HEXNUMBER', r'0x[\da-fA-F]+'),
+           ('NUMBER', r'\d+'),
+           ('ID', r'[A-Za-z][A-Za-z\d_]*'),
+           ('NEWLINE', r'\n'),
+           ('SKIP', r'[ \t]'),
+           ('COMMENTS', r'//.*'),
+           ('LONGCOMMENTBEGIN', r'\/\*'),
+           ('LONGCOMMENTEND', r'\*\/'),
+           ('LEESTEKEN', r'==|->|<<|>>|!=|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
+           ('STRING', r"'.*?'")
+            ]
+        # Join all rules into one alternation of named groups; the name of
+        # the group that matched (mo.lastgroup) identifies the token type.
+        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
+        gettok = re.compile(tok_re).match
+        line = 1
+        pos = line_start = 0
+        mo = gettok(s)
+        incomment = False
+        while mo is not None:
+            typ = mo.lastgroup
+            val = mo.group(typ)
+            if typ == 'NEWLINE':
+                # Remember where the line starts so columns can be derived.
+                line_start = pos
+                line += 1
+            elif typ == 'COMMENTS':
+                # Single-line '//' comment: discard.
+                pass
+            elif typ == 'LONGCOMMENTBEGIN':
+                # NOTE(review): /* */ comments do not nest here — incomment
+                # is a plain boolean, so the first '*/' always ends it.
+                incomment = True
+            elif typ == 'LONGCOMMENTEND':
+                incomment = False
+            elif typ == 'SKIP':
+                pass
+            elif incomment:
+                pass # Wait until we are not in a comment section
+            else:
+                if typ == 'ID':
+                    # Keywords are emitted with the keyword itself as type.
+                    if val in keywords:
+                        typ = val
+                elif typ == 'LEESTEKEN':
+                    # Punctuation ('leesteken' is Dutch): the token type is
+                    # the matched operator text itself.
+                    typ = val
+                elif typ == 'NUMBER':
+                    val = int(val)
+                elif typ == 'HEXNUMBER':
+                    # Hex literals are normalized to plain NUMBER tokens.
+                    val = int(val[2:], 16)
+                    typ = 'NUMBER'
+                elif typ == 'REAL':
+                    val = float(val)
+                elif typ == 'STRING':
+                    # Strip the surrounding single quotes.
+                    val = val[1:-1]
+                loc = SourceLocation(filename, line, mo.start() - line_start,
+                        mo.end() - mo.start())
+                yield Token(typ, val, loc)
+            pos = mo.end()
+            mo = gettok(s, pos)
+        # If matching stopped before end-of-input, the next character fits
+        # no rule: report a lexical error at that position.
+        if pos != len(s):
+            col = pos - line_start
+            loc = SourceLocation(filename, line, col, 1)
+            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
+        # Always terminate the token stream with an explicit END marker.
+        loc = SourceLocation(filename, line, 0, 0)
+        yield Token('END', '', loc)