changeset 396:fb3c1f029b30

Added baselexer to the c3 lexer
author Windel Bouwman
date Tue, 27 May 2014 22:19:32 +0200
parents 3b0c495e3008
children 5d03c10fe19d
files python/baselex.py python/ppci/c3/builder.py python/ppci/c3/lexer.py python/ppci/c3/parser.py python/ppci/common.py python/pyburg.py python/yacc.py test/testc3.py test/testpyy.py
diffstat 9 files changed, 110 insertions(+), 111 deletions(-)
--- a/python/baselex.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/baselex.py	Tue May 27 22:19:32 2014 +0200
@@ -1,38 +1,57 @@
 
 import re
-from ppci import Token, CompilerError
+from ppci import Token, CompilerError, SourceLocation
 from pyyacc import EOF
 
 
 class BaseLexer:
-    """ Base class for a lexer """
+    """ Base class for a lexer. This class can be overridden to create a
+        lexer. This class handles the regular expression generation and
+        source position accounting.
+    """
     def __init__(self, tok_spec):
         tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
         self.gettok = re.compile(tok_re).match
         self.func_map = {pair[0]: pair[2] for pair in tok_spec}
+        self.filename = None
 
     def feed(self, txt):
         """ Feeds the lexer with extra input """
         self.tokens = self.tokenize(txt)
 
     def tokenize(self, txt):
-        """ Generator that generates tokens from text """
+        """ Generator that generates tokens from text
+            It does not yield the EOF token.
+        """
+        self.line = 1
+        self.line_start = 0
+        self.pos = 0
         mo = self.gettok(txt)
-        pos = 0
         while mo:
             typ = mo.lastgroup
             val = mo.group(typ)
+            column = mo.start() - self.line_start
+            length = mo.end() - mo.start()
+            loc = SourceLocation(self.filename, self.line, column, length)
             func = self.func_map[typ]
             if func:
-                typ, val = func(typ, val)
-                yield Token(typ, val)
-            pos = mo.end()
-            mo = self.gettok(txt, pos)
-        if len(txt) != pos:
-            raise CompilerError('Lex fault at {}'.format(txt[pos:]))
+                res = func(typ, val)
+                if res:
+                    typ, val = res
+                    yield Token(typ, val, loc)
+            self.pos = mo.end()
+            mo = self.gettok(txt, self.pos)
+        if len(txt) != self.pos:
+            raise CompilerError('Lex fault at {}'.format(txt[self.pos:]))
+
+    def newline(self):
+        """ Enters a new line """
+        self.line_start = self.pos
+        self.line = self.line + 1
 
     def next_token(self):
         try:
             return self.tokens.__next__()
         except StopIteration:
-            return Token(EOF, EOF)
+            loc = SourceLocation(self.filename, self.line, 0, 0)
+            return Token(EOF, EOF, loc)
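
To illustrate the reworked BaseLexer API above: a minimal sketch of a subclass. The WordLexer class and its token specification are hypothetical, not part of this changeset; they only exercise the (name, regex, handler) triples, the newline() helper and the per-token SourceLocation shown in the diff.

    from baselex import BaseLexer


    class WordLexer(BaseLexer):
        """ Tiny example lexer: numbers, words, spaces and newlines. """
        def __init__(self):
            tok_spec = [
                ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
                ('WORD', r'[A-Za-z]+', lambda typ, val: (typ, val)),
                ('NEWLINE', r'\n', lambda typ, val: self.newline()),  # handler returns None -> token dropped
                ('SKIP', r'[ \t]', None),  # None handler -> token dropped as well
            ]
            super().__init__(tok_spec)


    lexer = WordLexer()
    for token in lexer.tokenize('answer 42\nmore words\n'):
        print(token.typ, token.val)   # each token also carries a SourceLocation in token.loc
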
--- a/python/ppci/c3/builder.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/builder.py	Tue May 27 22:19:32 2014 +0200
@@ -95,23 +95,27 @@
 
     def build(self, srcs, imps=[]):
         """ Create IR-code from sources """
-        self.logger.debug('Building {} source files'.format(len(srcs)))
+        self.logger.debug('Building {} source files'.format(len(srcs + imps)))
         iter(srcs)  # Check if srcs are iterable
         iter(imps)
         self.ok = True
         self.pkgs = {}
 
-        # Parsing stage (phase 1)
+        # Lexing and parsing stage (phase 1)
         def doParse(src):
             tokens = self.lexer.lex(src)
-            return self.parser.parseSource(tokens)
+            pkg = self.parser.parseSource(tokens)
+            return pkg
         s_pkgs = list(map(doParse, srcs))
         i_pkgs = list(map(doParse, imps))
         all_pkgs = s_pkgs + i_pkgs
         if not all(all_pkgs):
             self.ok = False
+            self.logger.debug('Parsing failed')
             return
 
+        self.logger.debug('Parsed {} packages'.format(len(all_pkgs)))
+
         # Fix scopes and package refs (phase 1.5)
         packages = {pkg.name: pkg for pkg in all_pkgs}
         self.pkgs = packages
@@ -122,6 +126,7 @@
             scopeFiller.addScope(pkg)
         if not all(pkg.ok for pkg in all_pkgs):
             self.ok = False
+            self.logger.debug('Scope filling failed')
             return
 
         # Generate intermediate code (phase 2)
@@ -129,4 +134,6 @@
         for pkg in s_pkgs:
             yield self.cg.gencode(pkg)
         if not all(pkg.ok for pkg in all_pkgs):
+            self.logger.debug('Code generation failed')
             self.ok = False
+        self.logger.debug('C3 build complete!')
--- a/python/ppci/c3/lexer.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/lexer.py	Tue May 27 22:19:32 2014 +0200
@@ -1,5 +1,5 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer
 
 """
@@ -14,85 +14,51 @@
             'import', 'module']
 
 
-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
-
-    def lex(self, source):
-        return self.tokenize(source)
+        tok_spec = [
+           ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+           ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+           ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+           ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+           ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+           ('SKIP', r'[ \t]', None),
+           ('COMMENTS', r'//.*', None),
+           ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+           ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+           ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+           ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+            ]
+        super().__init__(tok_spec)
 
-    def tokenize(self, input_file):
-        """
-           Tokenizer, generates an iterator that
-           returns tokens!
-
-           Input is a file like object.
-
-           This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-           ('REAL', r'\d+\.\d+'),
-           ('HEXNUMBER', r'0x[\da-fA-F]+'),
-           ('NUMBER', r'\d+'),
-           ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-           ('NEWLINE', r'\n'),
-           ('SKIP', r'[ \t]'),
-           ('COMMENTS', r'//.*'),
-           ('LONGCOMMENTBEGIN', r'\/\*'),
-           ('LONGCOMMENTEND', r'\*\/'),
-           ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-           ('STRING', r'".*?"')
-            ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
                 pass    # Wait until we are not in a comment section
             else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+        yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
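
For the c3 front end, a hedged usage sketch of the subclassed Lexer. The DiagnosticsEngine import path is an assumption based on the diagnostics class touched in common.py further down; the expected token types mirror the updated tests at the end of this changeset.

    import io
    from ppci.common import DiagnosticsEngine   # assumed location of the diagnostics class
    from ppci.c3.lexer import Lexer

    diag = DiagnosticsEngine()
    lexer = Lexer(diag)
    tokens = list(lexer.lex(io.StringIO('var int x = 0; // trailing comment\n')))
    print([tok.typ for tok in tokens])
    # Expected, per the updated tests: ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'EOF']
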
--- a/python/ppci/c3/parser.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/parser.py	Tue May 27 22:19:32 2014 +0200
@@ -24,6 +24,7 @@
         self.token = self.tokens.__next__()
         try:
             self.parse_package()
+            self.logger.debug('Parsing complete')
             self.mod.ok = True  # Valid until proven wrong :)
             return self.mod
         except CompilerError as e:
@@ -55,7 +56,7 @@
 
     def NextToken(self):
         t = self.token
-        if t.typ != 'END':
+        if t.typ != 'EOF':
             self.token = self.tokens.__next__()
         return t
 
@@ -73,11 +74,12 @@
         self.Consume('module')
         name = self.Consume('ID')
         self.Consume(';')
+        self.logger.debug('Parsing package {}'.format(name.val))
         self.mod = Package(name.val, name.loc)
         self.currentPart = self.mod
-        while self.Peak != 'END':
+        while self.Peak != 'EOF':
             self.parse_top_level()
-        self.Consume('END')
+        self.Consume('EOF')
 
     def parse_top_level(self):
         """ Parse toplevel declaration """
@@ -161,6 +163,7 @@
         self.Consume('type')
         newtype = self.parse_type_spec()
         typename = self.Consume('ID')
+        self.logger.debug('Parsing type {}'.format(typename.val))
         self.Consume(';')
         df = DefinedType(typename.val, newtype, typename.loc)
         self.addDeclaration(df)
@@ -194,6 +197,7 @@
         loc = self.Consume('function').loc
         returntype = self.parse_type_spec()
         fname = self.Consume('ID').val
+        self.logger.debug('Parsing function {}'.format(fname))
         f = Function(fname, loc)
         self.addDeclaration(f)
         savePart = self.currentPart
--- a/python/ppci/common.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/common.py	Tue May 27 22:19:32 2014 +0200
@@ -7,13 +7,14 @@
    Source location structures
 """
 
-# Token is used in the lexical analyzer:
 class Token:
-    def __init__(self, typ, val, loc=None):
+    """
+        Token is used in the lexical analyzer. The lexical analyzer splits
+        source text into tokens.
+    """
+    def __init__(self, typ, val, loc):
         self.typ = typ
         self.val = val
-        if loc is None:
-            loc = SourceLocation('', 0, 0, 0)
         assert type(loc) is SourceLocation
         self.loc = loc
 
@@ -58,7 +59,7 @@
         self.logger = logging.getLogger('diagnostics')
 
     def addSource(self, name, src):
-        self.logger.debug('Adding source {}'.format(name))
+        self.logger.debug('Adding source, filename="{}"'.format(name))
         self.sources[name] = src
 
     def addDiag(self, d):
@@ -80,7 +81,7 @@
 
     def printError(self, e):
         def printLine(row, txt):
-            print(str(row)+':'+txt)
+            print(str(row) + ':' + txt)
         print('==============')
         if not e.loc:
             print('Error: {0}'.format(e))
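
With the default location removed from Token, every construction site now has to pass a SourceLocation explicitly, which is what the remaining hunks in pyburg.py, yacc.py and testpyy.py do with a dummy row-0 location. A minimal sketch of the pattern (the 'ID'/'main' values are placeholders):

    from ppci import Token, SourceLocation

    loc = SourceLocation('', 0, 0, 0)   # filename, line, column, length
    tok = Token('ID', 'main', loc)      # anything other than a SourceLocation trips the assert in Token.__init__
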
--- a/python/pyburg.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/pyburg.py	Tue May 27 22:19:32 2014 +0200
@@ -59,7 +59,7 @@
 import io
 import types
 import argparse
-from ppci import Token
+from ppci import Token, SourceLocation
 from pyyacc import ParserException
 import yacc
 import baselex
@@ -87,14 +87,15 @@
         header_lines = []
         section = 0
         for line in lines:
+            loc = SourceLocation(self.filename, 0, 0, 0)
             line = line.strip()
             if not line:
                 continue  # Skip empty lines
             elif line == '%%':
                 section += 1
                 if section == 1:
-                    yield Token('header', header_lines)
-                yield Token('%%', '%%')
+                    yield Token('header', header_lines, loc)
+                yield Token('%%', '%%', loc)
             else:
                 if section == 0:
                     header_lines.append(line)
--- a/python/yacc.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/yacc.py	Tue May 27 22:19:32 2014 +0200
@@ -44,7 +44,7 @@
 import logging
 from pyyacc import Grammar
 from baselex import BaseLexer
-from ppci import Token
+from ppci import Token, SourceLocation
 
 
 class XaccLexer(BaseLexer):
@@ -63,19 +63,20 @@
         section = 0
         for line in lines:
             line = line.strip()
+            loc = SourceLocation(self.filename, 0, 0, 0)
             if not line:
                 continue  # Skip empty lines
             if line == '%%':
                 section += 1
-                yield Token('%%', '%%')
+                yield Token('%%', '%%', loc)
                 continue
             if section == 0:
                 if line.startswith('%tokens'):
-                    yield Token('%tokens', '%tokens')
+                    yield Token('%tokens', '%tokens', loc)
                     for tk in super().tokenize(line[7:]):
                         yield tk
                 else:
-                    yield Token('HEADER', line)
+                    yield Token('HEADER', line, loc)
             elif section == 1:
                 for tk in super().tokenize(line):
                     yield tk
--- a/test/testc3.py	Fri May 23 14:28:03 2014 +0200
+++ b/test/testc3.py	Tue May 27 22:19:32 2014 +0200
@@ -14,10 +14,10 @@
     def testUnexpectedCharacter(self):
         snippet = io.StringIO(""" var s \u6c34 """)
         with self.assertRaises(ppci.CompilerError):
-            list(self.l.tokenize(snippet))
+            list(self.l.lex(snippet))
 
     def check(self, snippet, toks):
-        toks2 = list(tok.typ for tok in self.l.tokenize(io.StringIO(snippet)))
+        toks2 = list(tok.typ for tok in self.l.lex(io.StringIO(snippet)))
         self.assertSequenceEqual(toks, toks2)
 
     def testBlockComment(self):
@@ -25,7 +25,7 @@
           /* Demo */
           var int x = 0;
         """
-        toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'END']
+        toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'EOF']
         self.check(snippet, toks)
 
     def testBlockCommentMultiLine(self):
@@ -36,7 +36,7 @@
           */
           var int x = 0;
         """
-        toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'END']
+        toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'EOF']
         self.check(snippet, toks)
 
 
@@ -69,7 +69,6 @@
         if rows != actualErrors:
             self.diag.printErrors()
         self.assertSequenceEqual(rows, actualErrors)
-        # self.assertFalse(all(ircode))
 
     def expectOK(self, snippet):
         """ Expect a snippet to be OK """
--- a/test/testpyy.py	Fri May 23 14:28:03 2014 +0200
+++ b/test/testpyy.py	Tue May 27 22:19:32 2014 +0200
@@ -1,16 +1,17 @@
 import unittest
 from pyyacc import Grammar, Item, ParserGenerationException, ParserException
 from pyyacc import EPS, EOF, calculate_first_sets
-from ppci import Token
+from ppci import Token, SourceLocation
 
 
 class genTokens:
     def __init__(self, lst):
         def tokGen():
+            loc = SourceLocation('', 0, 0, 0)
             for t in lst:
-                yield Token(t, t)
+                yield Token(t, t, loc)
             while True:
-                yield Token(EOF, EOF)
+                yield Token(EOF, EOF, loc)
         self.tokens = tokGen()
         self.token = self.tokens.__next__()