# HG changeset patch
# User Windel Bouwman
# Date 1401221972 -7200
# Node ID fb3c1f029b3021f71013211a568101aa469da796
# Parent  3b0c495e3008ef60de57e00e75ad781d7aea644d
Added baselexer into c3 lexer

diff -r 3b0c495e3008 -r fb3c1f029b30 python/baselex.py
--- a/python/baselex.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/baselex.py	Tue May 27 22:19:32 2014 +0200
@@ -1,38 +1,57 @@
 import re
-from ppci import Token, CompilerError
+from ppci import Token, CompilerError, SourceLocation
 from pyyacc import EOF


 class BaseLexer:
-    """ Base class for a lexer """
+    """ Base class for a lexer. This class can be overridden to create a
+        lexer. This class handles the regular expression generation and
+        source position accounting.
+    """
     def __init__(self, tok_spec):
         tok_re = '|'.join('(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
         self.gettok = re.compile(tok_re).match
         self.func_map = {pair[0]: pair[2] for pair in tok_spec}
+        self.filename = None

     def feed(self, txt):
         """ Feeds the lexer with extra input """
         self.tokens = self.tokenize(txt)

     def tokenize(self, txt):
-        """ Generator that generates tokens from text """
+        """ Generator that generates tokens from text
+            It does not yield the EOF token.
+        """
+        self.line = 1
+        self.line_start = 0
+        self.pos = 0
         mo = self.gettok(txt)
-        pos = 0
         while mo:
             typ = mo.lastgroup
             val = mo.group(typ)
+            column = mo.start() - self.line_start
+            length = mo.end() - mo.start()
+            loc = SourceLocation(self.filename, self.line, column, length)
             func = self.func_map[typ]
             if func:
-                typ, val = func(typ, val)
-            yield Token(typ, val)
-            pos = mo.end()
-            mo = self.gettok(txt, pos)
-        if len(txt) != pos:
-            raise CompilerError('Lex fault at {}'.format(txt[pos:]))
+                res = func(typ, val)
+                if res:
+                    typ, val = res
+                    yield Token(typ, val, loc)
+            self.pos = mo.end()
+            mo = self.gettok(txt, self.pos)
+        if len(txt) != self.pos:
+            raise CompilerError('Lex fault at {}'.format(txt[self.pos:]))
+
+    def newline(self):
+        """ Enters a new line """
+        self.line_start = self.pos
+        self.line = self.line + 1

     def next_token(self):
         try:
             return self.tokens.__next__()
         except StopIteration:
-            return Token(EOF, EOF)
+            loc = SourceLocation(self.filename, self.line, 0, 0)
+            return Token(EOF, EOF, loc)

diff -r 3b0c495e3008 -r fb3c1f029b30 python/ppci/c3/builder.py
--- a/python/ppci/c3/builder.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/builder.py	Tue May 27 22:19:32 2014 +0200
@@ -95,23 +95,27 @@
     def build(self, srcs, imps=[]):
         """ Create IR-code from sources """
-        self.logger.debug('Building {} source files'.format(len(srcs)))
+        self.logger.debug('Building {} source files'.format(len(srcs + imps)))
         iter(srcs)  # Check if srcs are iterable
         iter(imps)
         self.ok = True
         self.pkgs = {}

-        # Parsing stage (phase 1)
+        # Lexing and parsing stage (phase 1)
         def doParse(src):
             tokens = self.lexer.lex(src)
-            return self.parser.parseSource(tokens)
+            pkg = self.parser.parseSource(tokens)
+            return pkg

         s_pkgs = list(map(doParse, srcs))
         i_pkgs = list(map(doParse, imps))
         all_pkgs = s_pkgs + i_pkgs
         if not all(all_pkgs):
             self.ok = False
+            self.logger.debug('Parsing failed')
             return
+        self.logger.debug('Parsed {} packages'.format(len(all_pkgs)))
+
         # Fix scopes and package refs (phase 1.5)
         packages = {pkg.name: pkg for pkg in all_pkgs}
         self.pkgs = packages

@@ -122,6 +126,7 @@
             scopeFiller.addScope(pkg)
         if not all(pkg.ok for pkg in all_pkgs):
             self.ok = False
+            self.logger.debug('Scope filling failed')
             return

         # Generate intermediate code (phase 2)
@@ -129,4 +134,6 @@
         for pkg in s_pkgs:
             yield self.cg.gencode(pkg)
         if not all(pkg.ok for pkg in all_pkgs):
+            self.logger.debug('Code generation failed')
             self.ok = False
+        self.logger.debug('C3 build complete!')
diff -r 3b0c495e3008 -r fb3c1f029b30 python/ppci/c3/lexer.py
--- a/python/ppci/c3/lexer.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/lexer.py	Tue May 27 22:19:32 2014 +0200
@@ -1,5 +1,5 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer

 """
@@ -14,85 +14,51 @@
             'import', 'module']


-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
-
-    def lex(self, source):
-        return self.tokenize(source)
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+            ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+            ('SKIP', r'[ \t]', None),
+            ('COMMENTS', r'//.*', None),
+            ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+            ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+            ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+        ]
+        super().__init__(tok_spec)

-    def tokenize(self, input_file):
-        """
-           Tokenizer, generates an iterator that
-           returns tokens!
-
-           Input is a file like object.
-
-           This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-            ('REAL', r'\d+\.\d+'),
-            ('HEXNUMBER', r'0x[\da-fA-F]+'),
-            ('NUMBER', r'\d+'),
-            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-            ('NEWLINE', r'\n'),
-            ('SKIP', r'[ \t]'),
-            ('COMMENTS', r'//.*'),
-            ('LONGCOMMENTBEGIN', r'\/\*'),
-            ('LONGCOMMENTEND', r'\*\/'),
-            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-            ('STRING', r'".*?"')
-        ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
                 pass    # Wait until we are not in a comment section
             else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+        yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
diff -r 3b0c495e3008 -r fb3c1f029b30 python/ppci/c3/parser.py
--- a/python/ppci/c3/parser.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/parser.py	Tue May 27 22:19:32 2014 +0200
@@ -24,6 +24,7 @@
         self.token = self.tokens.__next__()
         try:
             self.parse_package()
+            self.logger.debug('Parsing complete')
             self.mod.ok = True  # Valid until proven wrong :)
             return self.mod
         except CompilerError as e:
@@ -55,7 +56,7 @@

     def NextToken(self):
         t = self.token
-        if t.typ != 'END':
+        if t.typ != 'EOF':
             self.token = self.tokens.__next__()
         return t

@@ -73,11 +74,12 @@
         self.Consume('module')
         name = self.Consume('ID')
         self.Consume(';')
+        self.logger.debug('Parsing package {}'.format(name.val))
         self.mod = Package(name.val, name.loc)
         self.currentPart = self.mod
-        while self.Peak != 'END':
+        while self.Peak != 'EOF':
             self.parse_top_level()
-        self.Consume('END')
+        self.Consume('EOF')

     def parse_top_level(self):
         """ Parse toplevel declaration """
@@ -161,6 +163,7 @@
         self.Consume('type')
         newtype = self.parse_type_spec()
         typename = self.Consume('ID')
+        self.logger.debug('Parsing type {}'.format(typename.val))
         self.Consume(';')
         df = DefinedType(typename.val, newtype, typename.loc)
         self.addDeclaration(df)
@@ -194,6 +197,7 @@
         loc = self.Consume('function').loc
         returntype = self.parse_type_spec()
         fname = self.Consume('ID').val
+        self.logger.debug('Parsing function {}'.format(fname))
         f = Function(fname, loc)
         self.addDeclaration(f)
         savePart = self.currentPart
diff -r 3b0c495e3008 -r fb3c1f029b30 python/ppci/common.py
--- a/python/ppci/common.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/common.py	Tue May 27 22:19:32 2014 +0200
@@ -7,13 +7,14 @@
     Source location structures
 """


-# Token is used in the lexical analyzer:
 class Token:
-    def __init__(self, typ, val, loc=None):
+    """
+        Token is used in the lexical analyzer. The lexical analyzer takes
+        a text and splits it into tokens.
+    """
+    def __init__(self, typ, val, loc):
         self.typ = typ
         self.val = val
-        if loc is None:
-            loc = SourceLocation('', 0, 0, 0)
         assert type(loc) is SourceLocation
         self.loc = loc
+ """ + def __init__(self, typ, val, loc): self.typ = typ self.val = val - if loc is None: - loc = SourceLocation('', 0, 0, 0) assert type(loc) is SourceLocation self.loc = loc @@ -58,7 +59,7 @@ self.logger = logging.getLogger('diagnostics') def addSource(self, name, src): - self.logger.debug('Adding source {}'.format(name)) + self.logger.debug('Adding source, filename="{}"'.format(name)) self.sources[name] = src def addDiag(self, d): @@ -80,7 +81,7 @@ def printError(self, e): def printLine(row, txt): - print(str(row)+':'+txt) + print(str(row) + ':' + txt) print('==============') if not e.loc: print('Error: {0}'.format(e)) diff -r 3b0c495e3008 -r fb3c1f029b30 python/pyburg.py --- a/python/pyburg.py Fri May 23 14:28:03 2014 +0200 +++ b/python/pyburg.py Tue May 27 22:19:32 2014 +0200 @@ -59,7 +59,7 @@ import io import types import argparse -from ppci import Token +from ppci import Token, SourceLocation from pyyacc import ParserException import yacc import baselex @@ -87,14 +87,15 @@ header_lines = [] section = 0 for line in lines: + loc = SourceLocation(self.filename, 0, 0, 0) line = line.strip() if not line: continue # Skip empty lines elif line == '%%': section += 1 if section == 1: - yield Token('header', header_lines) - yield Token('%%', '%%') + yield Token('header', header_lines, loc) + yield Token('%%', '%%', loc) else: if section == 0: header_lines.append(line) diff -r 3b0c495e3008 -r fb3c1f029b30 python/yacc.py --- a/python/yacc.py Fri May 23 14:28:03 2014 +0200 +++ b/python/yacc.py Tue May 27 22:19:32 2014 +0200 @@ -44,7 +44,7 @@ import logging from pyyacc import Grammar from baselex import BaseLexer -from ppci import Token +from ppci import Token, SourceLocation class XaccLexer(BaseLexer): @@ -63,19 +63,20 @@ section = 0 for line in lines: line = line.strip() + loc = SourceLocation(self.filename, 0, 0, 0) if not line: continue # Skip empty lines if line == '%%': section += 1 - yield Token('%%', '%%') + yield Token('%%', '%%', loc) continue if section == 0: if line.startswith('%tokens'): - yield Token('%tokens', '%tokens') + yield Token('%tokens', '%tokens', loc) for tk in super().tokenize(line[7:]): yield tk else: - yield Token('HEADER', line) + yield Token('HEADER', line, loc) elif section == 1: for tk in super().tokenize(line): yield tk diff -r 3b0c495e3008 -r fb3c1f029b30 test/testc3.py --- a/test/testc3.py Fri May 23 14:28:03 2014 +0200 +++ b/test/testc3.py Tue May 27 22:19:32 2014 +0200 @@ -14,10 +14,10 @@ def testUnexpectedCharacter(self): snippet = io.StringIO(""" var s \u6c34 """) with self.assertRaises(ppci.CompilerError): - list(self.l.tokenize(snippet)) + list(self.l.lex(snippet)) def check(self, snippet, toks): - toks2 = list(tok.typ for tok in self.l.tokenize(io.StringIO(snippet))) + toks2 = list(tok.typ for tok in self.l.lex(io.StringIO(snippet))) self.assertSequenceEqual(toks, toks2) def testBlockComment(self): @@ -25,7 +25,7 @@ /* Demo */ var int x = 0; """ - toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'END'] + toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'EOF'] self.check(snippet, toks) def testBlockCommentMultiLine(self): @@ -36,7 +36,7 @@ */ var int x = 0; """ - toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'END'] + toks = ['var', 'ID', 'ID', '=', 'NUMBER', ';', 'EOF'] self.check(snippet, toks) @@ -69,7 +69,6 @@ if rows != actualErrors: self.diag.printErrors() self.assertSequenceEqual(rows, actualErrors) - # self.assertFalse(all(ircode)) def expectOK(self, snippet): """ Expect a snippet to be OK """ diff -r 3b0c495e3008 -r fb3c1f029b30 test/testpyy.py --- 
--- a/test/testpyy.py	Fri May 23 14:28:03 2014 +0200
+++ b/test/testpyy.py	Tue May 27 22:19:32 2014 +0200
@@ -1,16 +1,17 @@
 import unittest
 from pyyacc import Grammar, Item, ParserGenerationException, ParserException
 from pyyacc import EPS, EOF, calculate_first_sets
-from ppci import Token
+from ppci import Token, SourceLocation


 class genTokens:
     def __init__(self, lst):
         def tokGen():
+            loc = SourceLocation('', 0, 0, 0)
             for t in lst:
-                yield Token(t, t)
+                yield Token(t, t, loc)
             while True:
-                yield Token(EOF, EOF)
+                yield Token(EOF, EOF, loc)
         self.tokens = tokGen()
         self.token = self.tokens.__next__()
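
Usage note (not part of the changeset): a minimal sketch of how a lexer can be built on the new BaseLexer. The CalcLexer class, its token spec and the sample input are invented for illustration; only BaseLexer, its feed/next_token/newline methods, Token and the EOF marker come from the patched code, and the imports assume the python/ directory is on sys.path.

from baselex import BaseLexer
from pyyacc import EOF


class CalcLexer(BaseLexer):
    """ Hypothetical toy lexer demonstrating the subclassing pattern. """
    def __init__(self):
        tok_spec = [
            # (name, regex, handler); the handler may change typ and val.
            ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
            ('ID', r'[A-Za-z_][A-Za-z_\d]*', lambda typ, val: (typ, val)),
            ('LEESTEKEN', r'[+\-*/()=]', lambda typ, val: (val, val)),
            # newline() updates the line/column bookkeeping; returning None
            # suppresses the token itself.
            ('NEWLINE', r'\n', lambda typ, val: self.newline()),
            # A None handler means the match is dropped entirely.
            ('SKIP', r'[ \t]+', None),
        ]
        super().__init__(tok_spec)


lexer = CalcLexer()
lexer.feed('answer = 6 * 7\n')
tok = lexer.next_token()
while tok.typ != EOF:
    print(tok.typ, tok.val, tok.loc)
    tok = lexer.next_token()

Note that, with this patch, a token is only yielded when its handler returns a (typ, val) pair, and every yielded Token now carries a SourceLocation, so the EOF sentinel returned by next_token() gets one as well.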