lcfOS: comparison of python/ppci/c3/lexer.py @ 396:fb3c1f029b30
Added baselexer into c3 lexer
author | Windel Bouwman
---|---
date | Tue, 27 May 2014 22:19:32 +0200
parents | 6ae782a085e0
children | 5d03c10fe19d
Changes from 395:3b0c495e3008 to 396:fb3c1f029b30:
```diff
diff -r 3b0c495e3008 -r fb3c1f029b30 python/ppci/c3/lexer.py
--- a/python/ppci/c3/lexer.py
+++ b/python/ppci/c3/lexer.py
@@ -1,7 +1,7 @@
 import re
-from ppci import CompilerError, SourceLocation, Token
+from ppci import CompilerError, SourceLocation, Token, make_num
 from baselex import BaseLexer
 
 """
 Lexical analyzer part. Splits the input character stream into tokens.
 """
@@ -12,87 +12,53 @@
             'function', 'var', 'type', 'const',
             'struct', 'cast', 'sizeof',
             'import', 'module']
 
 
-class Lexer:
+class Lexer(BaseLexer):
     """ Generates a sequence of token from an input stream """
     def __init__(self, diag):
         self.diag = diag
+        tok_spec = [
+            ('REAL', r'\d+\.\d+', lambda typ, val: (typ, float(val))),
+            ('HEXNUMBER', r'0x[\da-fA-F]+', lambda typ, val: ('NUMBER', make_num(val))),
+            ('NUMBER', r'\d+', lambda typ, val: (typ, int(val))),
+            ('ID', r'[A-Za-z][A-Za-z\d_]*', self.handle_id),
+            ('NEWLINE', r'\n', lambda typ, val: self.newline()),
+            ('SKIP', r'[ \t]', None),
+            ('COMMENTS', r'//.*', None),
+            ('LONGCOMMENTBEGIN', r'\/\*', self.handle_comment_start),
+            ('LONGCOMMENTEND', r'\*\/', self.handle_comment_stop),
+            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|', lambda typ, val: (val, val)),
+            ('STRING', r'".*?"', lambda typ, val: (typ, val[1:-1]))
+        ]
+        super().__init__(tok_spec)
 
-    def lex(self, source):
-        return self.tokenize(source)
-
-    def tokenize(self, input_file):
-        """
-        Tokenizer, generates an iterator that
-        returns tokens!
-
-        Input is a file like object.
-
-        This GREAT example was taken from python re doc page!
-        """
+    def lex(self, input_file):
         filename = input_file.name if hasattr(input_file, 'name') else ''
         s = input_file.read()
         input_file.close()
         self.diag.addSource(filename, s)
-        tok_spec = [
-            ('REAL', r'\d+\.\d+'),
-            ('HEXNUMBER', r'0x[\da-fA-F]+'),
-            ('NUMBER', r'\d+'),
-            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
-            ('NEWLINE', r'\n'),
-            ('SKIP', r'[ \t]'),
-            ('COMMENTS', r'//.*'),
-            ('LONGCOMMENTBEGIN', r'\/\*'),
-            ('LONGCOMMENTEND', r'\*\/'),
-            ('LEESTEKEN', r'==|->|<<|>>|!=|\+\+|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
-            ('STRING', r'".*?"')
-        ]
-        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
-        gettok = re.compile(tok_re).match
-        line = 1
-        pos = line_start = 0
-        mo = gettok(s)
-        incomment = False
-        while mo is not None:
-            typ = mo.lastgroup
-            val = mo.group(typ)
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ == 'COMMENTS':
-                pass
-            elif typ == 'LONGCOMMENTBEGIN':
-                incomment = True
-            elif typ == 'LONGCOMMENTEND':
-                incomment = False
-            elif typ == 'SKIP':
-                pass
-            elif incomment:
+        self.filename = filename
+        return self.tokenize(s)
+
+    def handle_comment_start(self, typ, val):
+        self.incomment = True
+
+    def handle_comment_stop(self, typ, val):
+        self.incomment = False
+
+    def tokenize(self, text):
+        """ Keeps track of the long comments """
+        self.incomment = False
+        for token in super().tokenize(text):
+            if self.incomment:
                 pass # Wait until we are not in a comment section
             else:
-                if typ == 'ID':
-                    if val in keywords:
-                        typ = val
-                elif typ == 'LEESTEKEN':
-                    typ = val
-                elif typ == 'NUMBER':
-                    val = int(val)
-                elif typ == 'HEXNUMBER':
-                    val = int(val[2:], 16)
-                    typ = 'NUMBER'
-                elif typ == 'REAL':
-                    val = float(val)
-                elif typ == 'STRING':
-                    val = val[1:-1]
-                loc = SourceLocation(filename, line, mo.start() - line_start,
-                                     mo.end() - mo.start())
-                yield Token(typ, val, loc)
-            pos = mo.end()
-            mo = gettok(s, pos)
-        if pos != len(s):
-            col = pos - line_start
-            loc = SourceLocation(filename, line, col, 1)
-            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
-        loc = SourceLocation(filename, line, 0, 0)
-        yield Token('END', '', loc)
+                yield token
+        loc = SourceLocation(self.filename, self.line, 0, 0)
+yield Token('EOF', 'EOF', loc)
+
+    def handle_id(self, typ, val):
+        if val in keywords:
+            typ = val
+        return typ, val
```