diff python/ppci/c3/lexer.py @ 396:fb3c1f029b30
Added baselexer into c3 lexer
author | Windel Bouwman
---|---
date | Tue, 27 May 2014 22:19:32 +0200
parents | 6ae782a085e0
children | 5d03c10fe19d
--- a/python/ppci/c3/lexer.py	Fri May 23 14:28:03 2014 +0200
+++ b/python/ppci/c3/lexer.py	Tue May 27 22:19:32 2014 +0200
@@ -1,5 +1,5 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer
 
 """
@@ -14,85 +14,51 @@
     'import', 'module']
 
 
-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
-
-    def lex(self, source):
-        return self.tokenize(source)
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+            ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+            ('SKIP', r'[ \t]', None),
+            ('COMMENTS', r'//.*', None),
+            ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+            ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+            ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+        ]
+        super().__init__(tok_spec)
 
-    def tokenize(self, input_file):
-        """
-           Tokenizer, generates an iterator that
-           returns tokens!
-
-           Input is a file like object.
-
-           This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-            ('REAL', r'\d+\.\d+'),
-            ('HEXNUMBER', r'0x[\da-fA-F]+'),
-            ('NUMBER', r'\d+'),
-            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-            ('NEWLINE', r'\n'),
-            ('SKIP', r'[ \t]'),
-            ('COMMENTS', r'//.*'),
-            ('LONGCOMMENTBEGIN', r'\/\*'),
-            ('LONGCOMMENTEND', r'\*\/'),
-            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-            ('STRING', r'".*?"')
-        ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
                 pass    # Wait until we are not in a comment section
             else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+        yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
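For context: the `baselex.BaseLexer` class that the new `Lexer` subclasses is not part of this diff. The sketch below is only a reading of the contract the changed code appears to rely on: the constructor takes `(name, regex, handler)` triples, `tokenize()` drives one combined regex and yields tokens, `newline()` maintains the line counter, and a handler may be `None` (drop the match) or a callable returning either `None` (also drop it) or a `(typ, val)` pair to emit. The `Token` and `SourceLocation` stand-ins and all method bodies here are assumptions, not the actual `baselex.py`.

```python
# Hedged sketch of the BaseLexer contract assumed by the diff above.
# The real Token, SourceLocation and CompilerError live in the ppci
# package; the namedtuples below are illustrative stand-ins only.
import re
from collections import namedtuple

Token = namedtuple('Token', ['typ', 'val', 'loc'])                     # stand-in
SourceLocation = namedtuple('SourceLocation',
                            ['filename', 'row', 'col', 'length'])      # stand-in


class BaseLexer:
    """ Drives a single combined regex built from a token specification.

    Each spec entry is (name, regex, handler). A handler of None means
    the match is dropped (whitespace, // comments). A callable handler
    receives (typ, val) and returns either None (drop the match) or a
    (typ, val) pair that becomes the emitted token.
    """
    def __init__(self, tok_spec):
        tok_re = '|'.join('(?P<{}>{})'.format(name, regex)
                          for name, regex, _ in tok_spec)
        self.gettok = re.compile(tok_re).match
        self.func_map = {name: handler for name, _, handler in tok_spec}
        self.filename = None

    def newline(self):
        """ Handler helper: bump the line number on a NEWLINE match. """
        self.line += 1
        self.line_start = self.pos

    def tokenize(self, text):
        """ Yield tokens by repeatedly matching at the current position. """
        self.line = 1
        self.pos = self.line_start = 0
        mo = self.gettok(text)
        while mo is not None:
            typ = mo.lastgroup
            val = mo.group(typ)
            func = self.func_map[typ]
            if func is not None:
                res = func(typ, val)
                if res is not None:        # handler may swallow the match
                    typ, val = res
                    loc = SourceLocation(self.filename, self.line,
                                         mo.start() - self.line_start,
                                         mo.end() - mo.start())
                    yield Token(typ, val, loc)
            self.pos = mo.end()
            mo = self.gettok(text, self.pos)
        if self.pos != len(text):
            # The real base class presumably raises CompilerError here.
            raise ValueError('Unexpected: {!r}'.format(text[self.pos]))
```

Two observations on the spec itself. Python's regex alternation picks the leftmost alternative that matches, so the ordering in `tok_spec` is load-bearing: 'REAL' must precede 'NUMBER' or `3.14` would lex as `3` followed by `.` and `14`, and 'HEXNUMBER' must precede 'NUMBER' or `0x10` would split after the `0`. And moving the per-token conversions into handler callables is what lets this commit delete the long `if`/`elif` chain: the keyword check becomes `handle_id`, and the stateful long-comment tracking becomes a pair of handlers plus the small `tokenize` override in the subclass.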