comparison python/ppci/c3/lexer.py @ 300:158068af716c

yafm
author Windel Bouwman
date Tue, 03 Dec 2013 18:00:22 +0100
parents python/c3/lexer.py@6aa721e7b10b
children 0615b5308710
import collections
import re

from ppci import CompilerError, SourceLocation, Token

"""
Lexical analyzer part. Splits the input character stream into tokens.
"""

keywords = ['and', 'or', 'not', 'true', 'false',
            'else', 'if', 'while', 'return',
            'function', 'var', 'type', 'const',
            'struct', 'cast',
            'import', 'module']


class Lexer:
    def __init__(self, diag):
        self.diag = diag

    def tokenize(self, input_file):
        """
        Tokenizer: generates an iterator that yields tokens.

        Input is a file-like object.

        This approach is based on the tokenizer example in the
        Python re documentation.
        """
        filename = input_file.name if hasattr(input_file, 'name') else ''
        s = input_file.read()
        input_file.close()
        self.diag.addSource(filename, s)
        # Token specification: (name, regex) pairs, tried in this order.
        # 'LEESTEKEN' is Dutch for punctuation mark; it matches operators
        # and separators.
        tok_spec = [
            ('REAL', r'\d+\.\d+'),
            ('HEXNUMBER', r'0x[\da-fA-F]+'),
            ('NUMBER', r'\d+'),
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('NEWLINE', r'\n'),
            ('SKIP', r'[ \t]'),
            ('COMMENTS', r'//.*'),
            ('LONGCOMMENTBEGIN', r'\/\*'),
            ('LONGCOMMENTEND', r'\*\/'),
            ('LEESTEKEN', r'==|->|<<|>>|!=|[\.,=:;\-+*\[\]/\(\)]|>=|<=|<>|>|<|{|}|&|\^|\|'),
            ('STRING', r"'.*?'")
        ]
        # Combine all patterns into one regex with named groups and use its
        # bound match method to scan the input piecewise.
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match
        line = 1
        pos = line_start = 0
        mo = gettok(s)
        incomment = False
        while mo is not None:
            typ = mo.lastgroup
            val = mo.group(typ)
            if typ == 'NEWLINE':
                line_start = pos
                line += 1
            elif typ == 'COMMENTS':
                pass
            elif typ == 'LONGCOMMENTBEGIN':
                incomment = True
            elif typ == 'LONGCOMMENTEND':
                incomment = False
            elif typ == 'SKIP':
                pass
            elif incomment:
                pass  # Wait until we are not in a comment section
            else:
                # Keywords and punctuation use their literal text as token type.
                if typ == 'ID':
                    if val in keywords:
                        typ = val
                elif typ == 'LEESTEKEN':
                    typ = val
                elif typ == 'NUMBER':
                    val = int(val)
                elif typ == 'HEXNUMBER':
                    val = int(val[2:], 16)
                    typ = 'NUMBER'
                elif typ == 'REAL':
                    val = float(val)
                elif typ == 'STRING':
                    val = val[1:-1]
                loc = SourceLocation(filename, line, mo.start() - line_start,
                                     mo.end() - mo.start())
                yield Token(typ, val, loc)
            pos = mo.end()
            mo = gettok(s, pos)
        if pos != len(s):
            col = pos - line_start
            loc = SourceLocation(filename, line, col, 1)
            raise CompilerError('Unexpected: "{0}"'.format(s[pos]), loc)
        loc = SourceLocation(filename, line, 0, 0)
        yield Token('END', '', loc)
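
A minimal usage sketch follows. It assumes only what the listing above shows:
a Lexer is constructed with a diagnostics object providing addSource(), and
tokenize() takes a file-like object and yields Token objects, ending with an
'END' token. The module path ppci.c3.lexer is inferred from the file path in
the header, and NullDiag is a hypothetical stand-in for the real ppci
diagnostics object, used here only for illustration.

    import io

    from ppci.c3.lexer import Lexer


    class NullDiag:
        """Hypothetical stub; only addSource() is needed to run tokenize()."""
        def addSource(self, filename, source):
            pass


    source = io.StringIO("var x = 0x1F; // a comment\n")
    for token in Lexer(NullDiag()).tokenize(source):
        print(token)

Here the hexadecimal literal comes out as a NUMBER token holding the converted
integer value, the // comment and whitespace are skipped, and the keyword
'var' is yielded with its own text as token type, matching the branches in the
loop above.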