comparison python/ppci/c3/lexer.py @ 396:fb3c1f029b30

Added baselexer into c3 lexer
author Windel Bouwman
date Tue, 27 May 2014 22:19:32 +0200
parents 6ae782a085e0
children 5d03c10fe19d
comparing 395:3b0c495e3008 with 396:fb3c1f029b30
--- a/python/ppci/c3/lexer.py
+++ b/python/ppci/c3/lexer.py
@@ -1,7 +1,7 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer
 
 """
 Lexical analyzer part. Splits the input character stream into tokens.
 """
@@ -12,87 +12,53 @@
     'function', 'var', 'type', 'const',
     'struct', 'cast', 'sizeof',
     'import', 'module']
 
 
-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+            ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+            ('SKIP', r'[ \t]', None),
+            ('COMMENTS', r'//.*', None),
+            ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+            ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+            ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+        ]
+        super().__init__(tok_spec)
 
-    def lex(self, source):
-        return self.tokenize(source)
-
-    def tokenize(self, input_file):
-        """
-        Tokenizer, generates an iterator that
-        returns tokens!
-
-        Input is a file like object.
-
-        This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-            ('REAL', r'\d+\.\d+'),
-            ('HEXNUMBER', r'0x[\da-fA-F]+'),
-            ('NUMBER', r'\d+'),
-            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-            ('NEWLINE', r'\n'),
-            ('SKIP', r'[ \t]'),
-            ('COMMENTS', r'//.*'),
-            ('LONGCOMMENTBEGIN', r'\/\*'),
-            ('LONGCOMMENTEND', r'\*\/'),
-            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-            ('STRING', r'".*?"')
-        ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
-                pass  # Wait until we are not in a comment section
-            else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
+                pass  # Wait until we are not in a comment section
+            else:
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+        yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
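
The baselex module that the new code inherits from is not part of this changeset, so its interface has to be read off the call sites above: the constructor receives (name, regex, callback) triples, a callback may rewrite the matched text into a (type, value) pair or return None to drop the match, tokenize() is a generator that tracks line numbers, and newline() does the line bookkeeping. A minimal sketch of a BaseLexer that would satisfy those call sites, reusing the position tracking of the old inline tokenizer (the real baselex may well differ):

    import re
    from ppci import CompilerError, SourceLocation, Token


    class BaseLexer:
        """ Table-driven tokenizer (sketch of the assumed interface).
        Subclasses pass a list of (name, regex, callback) triples; a
        callback may return a (type, value) pair for the token to emit,
        or None to swallow the match (whitespace, comments, newlines). """
        def __init__(self, tok_spec):
            tok_re = '|'.join('(?P<{}>{})'.format(name, regex)
                              for name, regex, _ in tok_spec)
            self.gettok = re.compile(tok_re).match
            self.func_map = {name: func for name, _, func in tok_spec}
            self.filename = None

        def tokenize(self, txt):
            """ Yield Token objects; complain about unmatchable input """
            self.line = 1
            self.line_start = 0
            self.pos = 0
            mo = self.gettok(txt)
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                func = self.func_map[typ]
                if func:
                    res = func(typ, val)
                    if res:
                        typ, val = res
                        loc = SourceLocation(self.filename, self.line,
                                             mo.start() - self.line_start,
                                             mo.end() - mo.start())
                        yield Token(typ, val, loc)
                self.pos = mo.end()
                mo = self.gettok(txt, self.pos)
            if self.pos != len(txt):
                loc = SourceLocation(self.filename, self.line,
                                     self.pos - self.line_start, 1)
                raise CompilerError('Unexpected: "{0}"'.format(txt[self.pos]), loc)

        def newline(self):
            """ Called by the NEWLINE rule; returns None, so no token
            is emitted for the newline itself """
            self.line_start = self.pos
            self.line += 1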
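
The other new import, make_num, takes over the hex-literal handling that the old tokenize did inline with int(val[2:], 16). The helper itself lives in ppci and is not shown in this diff; judging by the code it replaces, it presumably does something along these lines (an assumption, not the actual ppci source):

    def make_num(txt):
        """ Presumed behaviour of ppci.make_num: parse a numeric
        literal, hexadecimal or plain decimal, into an int """
        if txt.startswith('0x'):
            return int(txt[2:], 16)
        return int(txt)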
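
From the caller's side the reworked lexer is driven as before: lex() takes a file-like object and returns a token generator. One visible difference is the end marker, which changes from an 'END' token with an empty value to an 'EOF' token. A small smoke test, using a hypothetical stand-in for the diag object (only addSource is needed here; the real ppci diagnostics object does more):

    import io


    class FakeDiag:
        """ Hypothetical stand-in: Lexer.lex only calls addSource """
        def addSource(self, filename, src):
            pass


    lexer = Lexer(FakeDiag())
    for token in lexer.lex(io.StringIO('var int x; // a comment\n')):
        print(token)
    # Keywords come back typed as themselves ('var'), punctuation as
    # its own text (';'), '//' comments are dropped, and the stream
    # now ends with the new 'EOF' token rather than the old 'END'.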