diff python/ppci/c3/lexer.py @ 396:fb3c1f029b30

Added baselexer into c3 lexer
author Windel Bouwman
date Tue, 27 May 2014 22:19:32 +0200
parents 6ae782a085e0
children 5d03c10fe19d
--- a/python/ppci/c3/lexer.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/lexer.py	Tue May 27 22:19:32 2014 +0200
@@ -1,5 +1,5 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer
 
 """
@@ -14,85 +14,51 @@
             'import', 'module']
 
 
-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
-
-    def lex(self, source):
-        return self.tokenize(source)
+        tok_spec = [
+           ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+           ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+           ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+           ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+           ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+           ('SKIP', r'[ \t]', None),
+           ('COMMENTS', r'//.*', None),
+           ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+           ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+           ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+           ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+            ]
+        super().__init__(tok_spec)
 
-    def tokenize(self, input_file):
-        """
-           Tokenizer, generates an iterator that
-           returns tokens!
-
-           Input is a file like object.
-
-           This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-           ('REAL', r'\d+\.\d+'),
-           ('HEXNUMBER', r'0x[\da-fA-F]+'),
-           ('NUMBER', r'\d+'),
-           ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-           ('NEWLINE', r'\n'),
-           ('SKIP', r'[ \t]'),
-           ('COMMENTS', r'//.*'),
-           ('LONGCOMMENTBEGIN', r'\/\*'),
-           ('LONGCOMMENTEND', r'\*\/'),
-           ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-           ('STRING', r'".*?"')
-            ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
                 pass    # Wait until we are not in a comment section
             else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+        yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
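
The changeset replaces the hand-written re.match loop with a declarative token table handed to baselex.BaseLexer: each entry is a (name, regex, handler) triple, where a handler of None discards the lexeme outright and a handler that returns None drops the token (as the NEWLINE and comment callbacks do). BaseLexer itself is not part of this diff, so the following is only a minimal sketch of the technique as inferred from how the subclass uses it; SketchLexer and its Token tuple are illustrative stand-ins, not the real baselex or ppci API.

    import re
    from collections import namedtuple

    # Illustrative stand-in; the real baselex.BaseLexer and ppci Token
    # are not shown in this changeset.
    Token = namedtuple('Token', ['typ', 'val', 'row', 'col'])


    class SketchLexer:
        """ Minimal table-driven lexer in the style the diff adopts """
        def __init__(self, tok_spec):
            # tok_spec: (name, regex, handler) triples. A handler of None
            # skips the lexeme entirely; otherwise it maps (typ, val) to a
            # new (typ, val) pair, and returning None drops the token.
            self.handlers = {name: h for name, _, h in tok_spec}
            tok_re = '|'.join('(?P<%s>%s)' % (name, pat)
                              for name, pat, _ in tok_spec)
            self.gettok = re.compile(tok_re).match

        def newline(self):
            # Called by the NEWLINE handler; returns None, so no token
            # is emitted for the newline itself.
            self.line += 1
            self.line_start = self.pos

        def tokenize(self, text):
            self.line, self.pos, self.line_start = 1, 0, 0
            mo = self.gettok(text)
            while mo is not None:
                typ, val = mo.lastgroup, mo.group(mo.lastgroup)
                col = mo.start() - self.line_start
                self.pos = mo.end()
                handler = self.handlers[typ]
                if handler is not None:
                    res = handler(typ, val)
                    if res is not None:
                        typ, val = res
                        yield Token(typ, val, self.line, col)
                mo = self.gettok(text, self.pos)
            if self.pos != len(text):
                raise ValueError('Unexpected {!r} on line {}'.format(
                    text[self.pos], self.line))


    # The lambdas close over the lexer the same way the c3 handlers
    # close over self.
    lexer = SketchLexer([
        ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
        ('ID', r'[A-Za-z_]\w*', lambda typ, val: (typ, val)),
        ('NEWLINE', r'\n', lambda typ, val: lexer.newline()),
        ('SKIP', r'[ \t]+', None),
    ])
    for tok in lexer.tokenize('x 42\ny 7'):
        print(tok)

On top of this machinery, the new c3 Lexer overrides tokenize to layer stateful filtering: the LONGCOMMENTBEGIN/LONGCOMMENTEND handlers flip an incomment flag, and the override silently swallows every token produced while the flag is set.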