Mercurial > lcfOS
comparison python/yacc.py @ 318:e84047f29c78
Add initial attempts at burg and yacc
author | Windel Bouwman |
---|---|
date | Tue, 31 Dec 2013 12:38:15 +0100 |
parents | |
children | 8d07a4254f04 |
comparison
equal
deleted
inserted
replaced
317:e30a77ae359b | 318:e84047f29c78 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 """ Parser generator utility """ | |
4 | |
5 import argparse | |
6 import re | |
7 import sys | |
8 import datetime | |
9 from pyyacc import Grammar, print_grammar | |
10 | |
11 | |
class XaccLexer:
    """ Lexer for xacc grammar description files.

    The input has two sections separated by a '%%' line: a header
    section (free-form header lines plus '%tokens' declarations) and a
    rules section. feed() turns the whole text into a stream of
    (type, value) tuples, terminated by an ('eof', 'eof') token.
    """
    def __init__(self):
        pass

    def feed(self, txt):
        """ Tokenize txt and prime self.token with the first token """
        # Create a regular expression for the lexing part:
        tok_spec = [
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('STRING', r"'[^']*'"),
            ('BRACEDCODE', r"\{[^\}]*\}"),  # NOTE: nested braces are not supported
            ('OTHER', r'[:;\|]'),
            ('SKIP', r'[ \t]')  # was r'[ ]'; also skip tabs so they do not lex-fault
            ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Quoted literals act as plain IDs; strip the quotes.
                    typ = 'ID'
                    yield (typ, val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation uses the character itself as token type.
                    typ = val
                    yield (typ, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            # Anything the regex could not consume is a lexing error:
            if len(line) != pos:
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            """ Generator over all tokens of all lines, ending with 'eof' """
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    # Section separator between header and rules.
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        yield from tokenize_line(line[len('%tokens'):])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = self.tokens.__next__()

    def next_token(self):
        """ Return the current token and advance; sticks at 'eof' """
        t = self.token
        if t[0] != 'eof':
            self.token = self.tokens.__next__()
        return t
84 | |
85 | |
class ParseError(Exception):
    """ Raised on a lexing fault or an unexpected token while parsing """
    pass
88 | |
89 | |
class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
    We could have made an generated parser, but that would yield a chicken
    egg issue.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peak to the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak != typ:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))
        return self.next_token()

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        matched = self.Peak == typ
        if matched:
            self.consume(typ)
        return matched

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser """
        headers = []
        terminals = []
        # Header section: free-form header lines and %tokens declarations.
        while self.Peak in ('HEADER', '%tokens'):
            if self.Peak == '%tokens':
                self.consume('%tokens')
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.grammar = Grammar(terminals)
        # Rules section: one production rule per statement until eof.
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        """ Parse one grammar symbol (an ID token) """
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse a rule's right hand side: symbols plus optional action """
        symbols = []
        while self.Peak not in (';', 'BRACEDCODE', '|'):
            symbols.append(self.parse_symbol())
        action = None
        if self.Peak == 'BRACEDCODE':
            # Strip the surrounding braces and whitespace from the action.
            action = self.consume('BRACEDCODE')[1]
            action = action[1:-1].strip()
        return symbols, action

    def parse_rule(self):
        """ Parse one rule: name ':' rhs ('|' rhs)* ';' """
        lhs = self.parse_symbol()
        self.consume(':')
        symbols, action = self.parse_rhs()
        self.grammar.add_production(lhs, symbols, action)
        while self.has_consumed('|'):
            symbols, action = self.parse_rhs()
            self.grammar.add_production(lhs, symbols, action)
        self.consume(';')
162 | |
163 | |
class XaccGenerator:
    """ Generator that writes generated parser to file """
    def __init__(self):
        pass

    def generate(self, grammar, output_file):
        """ Compute the parse tables for grammar and write a python
            parser module to output_file.
        """
        print_grammar(grammar)
        self.grammar = grammar
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, f):
        """ Generate python script with the parser table """
        print('#!/usr/bin/python', file=f)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=f)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=f)
        print('from ppci import Token', file=f)
        print(file=f)
        print('class Parser(LRParser):', file=f)
        print('    def __init__(self):', file=f)
        # Generate rules:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=f)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=f)
        for rule_number, rule in enumerate(self.grammar.productions):
            # Each production gets a uniquely named action method:
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=f)
        # Fill action table:
        print('        self.action_table = {}', file=f)
        for state in self.action_table:
            action = self.action_table[state]
            print('        self.action_table[{}] = {}'.format(state, action), file=f)
        print('', file=f)

        # Fill goto table:
        print('        self.goto_table = {}', file=f)
        for gt in self.goto_table:
            to = self.goto_table[gt]
            print('        self.goto_table[{}] = {}'.format(gt, to), file=f)
        print('', file=f)

        # Generate a function for each action:
        for rule in self.grammar.productions:
            M = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(M))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=f)
            # Use 'is None' (identity), not '== None', per PEP 8.
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            # Substitute $1..$M placeholders with the argument names:
            for n in range(M):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=f)
            print('', file=f)
220 | |
221 | |
def main():
    """ Command line entry: read a spec, generate a parser module """
    # Parse arguments:
    arg_parser = argparse.ArgumentParser(description='xacc compiler compiler')
    arg_parser.add_argument('source', type=argparse.FileType('r'),
                            help='the parser specification')
    arg_parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                            default=sys.stdout)
    args = arg_parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    grammar_parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = grammar_parser.parse_grammar()
    generator.generate(grammar, args.output)
    args.output.close()
243 | |
244 | |
# Script entry point.
if __name__ == '__main__':
    main()