Mercurial > lcfOS
comparison python/pyyacc.py @ 318:e84047f29c78
Add burg and yacc initial attempts
author | Windel Bouwman |
---|---|
date | Tue, 31 Dec 2013 12:38:15 +0100 |
parents | 6259856841a0 |
children | 8d07a4254f04 |
comparison
equal
deleted
inserted
replaced
317:e30a77ae359b | 318:e84047f29c78 |
---|---|
1 """ | 1 """ |
2 Parser generator script | 2 Parser generator script |
3 """ | 3 """ |
4 | 4 |
5 import hashlib | |
6 from ppci import Token | 5 from ppci import Token |
7 | 6 |
8 EPS = 'EPS' | 7 EPS = 'EPS' |
9 EOF = 'EOF' | 8 EOF = 'EOF' |
10 SHIFT = 1 | 9 |
11 REDUCE = 2 | |
12 ACCEPT = 3 | |
13 | 10 |
14 class ParserGenerationException(Exception): | 11 class ParserGenerationException(Exception): |
15 """ Raised when something goes wrong during parser generation """ | 12 """ Raised when something goes wrong during parser generation """ |
16 pass | 13 pass |
17 | 14 |
18 | 15 |
19 class ParserException(Exception): | 16 class ParserException(Exception): |
20 """ Raised during a failure in the parsing process """ | 17 """ Raised during a failure in the parsing process """ |
21 pass | 18 pass |
22 | 19 |
20 | |
21 class Action: | |
22 def __repr__(self): | |
23 return 'Action' | |
24 | |
25 def __eq__(self, other): | |
26 return str(self) == str(other) | |
27 | |
28 | |
29 class Shift(Action): | |
30 def __init__(self, to_state): | |
31 self.to_state = to_state | |
32 | |
33 def __repr__(self): | |
34 return 'Shift({})'.format(self.to_state) | |
35 | |
36 | |
37 class Reduce(Action): | |
38 def __init__(self, rule): | |
39 self.rule = rule | |
40 | |
41 def __repr__(self): | |
42 return 'Reduce({})'.format(self.rule) | |
43 | |
44 | |
45 class Accept(Reduce): | |
46 def __repr__(self): | |
47 return 'Accept({})'.format(self.rule) | |
48 | |
49 | |
50 def print_grammar(g): | |
51 """ Pretty print a grammar """ | |
52 print(g) | |
53 for production in g.productions: | |
54 print(production) | |
23 | 55 |
24 class Grammar: | 56 class Grammar: |
25 """ Defines a grammar of a language """ | 57 """ Defines a grammar of a language """ |
26 def __init__(self, terminals): | 58 def __init__(self, terminals): |
27 self.terminals = terminals | 59 self.terminals = terminals |
28 self.nonterminals = [] | 60 self.nonterminals = [] |
29 self.productions = [] | 61 self.productions = [] |
30 self._first = None # Cached first set | 62 self._first = None # Cached first set |
63 self.start_symbol = None | |
64 | |
65 def __repr__(self): | |
66 return 'Grammar with {} rules'.format(len(self.productions)) | |
31 | 67 |
32 def add_production(self, name, symbols, f=None): | 68 def add_production(self, name, symbols, f=None): |
33 """ Add a production rule to the grammar """ | 69 """ Add a production rule to the grammar """ |
34 production = Production(name, symbols, f) | 70 production = Production(name, symbols, f) |
35 self.productions.append(production) | 71 self.productions.append(production) |
48 """ Get all the symbols defined by this grammar """ | 84 """ Get all the symbols defined by this grammar """ |
49 return self.nonterminals + self.terminals | 85 return self.nonterminals + self.terminals |
50 | 86 |
51 @property | 87 @property |
52 def first(self): | 88 def first(self): |
53 """ | 89 """ |
54 The first set is a mapping from a grammar symbol to a set of | 90 The first set is a mapping from a grammar symbol to a set of |
55 set of all terminal symbols that can be the first terminal when | 91 set of all terminal symbols that can be the first terminal when |
56 looking for the grammar symbol | 92 looking for the grammar symbol |
57 """ | 93 """ |
58 if not self._first: | 94 if not self._first: |
59 self._first = self.calcFirstSets() | 95 self._first = self.calcFirstSets() |
60 return self._first | 96 return self._first |
61 | 97 |
123 for p in self.productionsForName(self.start_symbol): | 159 for p in self.productionsForName(self.start_symbol): |
124 iis.add(Item(p, 0, EOF)) | 160 iis.add(Item(p, 0, EOF)) |
125 return self.closure(iis) | 161 return self.closure(iis) |
126 | 162 |
127 def nextItemSet(self, itemset, symbol): | 163 def nextItemSet(self, itemset, symbol): |
128 """ | 164 """ |
129 Determines the next itemset for the current set and a symbol | 165 Determines the next itemset for the current set and a symbol |
130 This is the goto procedure | 166 This is the goto procedure |
131 """ | 167 """ |
132 next_set = set() | 168 next_set = set() |
133 for item in itemset: | 169 for item in itemset: |
134 if item.can_shift_over(symbol): | 170 if item.can_shift_over(symbol): |
135 next_set.add(item.shifted()) | 171 next_set.add(item.shifted()) |
136 return self.closure(next_set) | 172 return self.closure(next_set) |
137 | 173 |
138 def genCanonicalSet(self, iis): | 174 def genCanonicalSet(self, iis): |
139 states = [] | 175 states = [] |
140 worklist = [] | 176 worklist = [] |
141 transitions = {} | 177 transitions = {} |
142 def addSt(s): | 178 def addSt(s): |
151 if not nis: | 187 if not nis: |
152 continue | 188 continue |
153 addSt(nis) | 189 addSt(nis) |
154 transitions[(states.index(itemset), symbol)] = states.index(nis) | 190 transitions[(states.index(itemset), symbol)] = states.index(nis) |
155 return states, transitions | 191 return states, transitions |
156 | 192 |
157 def checkSymbols(self): | 193 def checkSymbols(self): |
158 """ Checks no symbols are undefined """ | 194 """ Checks no symbols are undefined """ |
159 for production in self.productions: | 195 for production in self.productions: |
160 for symbol in production.symbols: | 196 for symbol in production.symbols: |
161 if symbol not in self.Symbols + [EPS]: | 197 if symbol not in self.Symbols + [EPS]: |
162 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) | 198 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) |
163 | 199 |
164 def getSignature(self): | |
165 m = hashlib.md5() | |
166 m.update((str(self.productions) + str(self.start_symbol)).encode('ascii')) | |
167 signature = m.hexdigest() | |
168 | |
169 def genParser(self): | 200 def genParser(self): |
170 """ Generates a parser from the grammar (using a caching algorithm) """ | 201 """ Generates a parser from the grammar (using a caching algorithm) """ |
171 signature = self.getSignature() | |
172 action_table, goto_table = self.doGenerate() | 202 action_table, goto_table = self.doGenerate() |
173 p = LRParser(action_table, goto_table, self.start_symbol) | 203 p = LRParser(action_table, goto_table, self.start_symbol) |
174 p.grammar = self | 204 p.grammar = self |
175 return p | 205 return p |
176 | 206 |
177 def doGenerate(self): | 207 def doGenerate(self): |
208 if not self.start_symbol: | |
209 self.start_symbol = self.productions[0].name | |
178 self.checkSymbols() | 210 self.checkSymbols() |
179 action_table = {} | 211 action_table = {} |
180 goto_table = {} | 212 goto_table = {} |
181 iis = self.initialItemSet() | 213 iis = self.initialItemSet() |
182 | 214 |
183 # First generate all item sets by using the nextItemset function: | 215 # First generate all item sets by using the nextItemset function: |
184 states, transitions = self.genCanonicalSet(iis) | 216 states, transitions = self.genCanonicalSet(iis) |
185 | 217 |
186 def action_str(act): | |
187 a, p = act | |
188 if a is SHIFT: | |
189 return 'Shift {0}'.format(0) | |
190 elif a is REDUCE: | |
191 return 'Reduce {0}'.format(p) | |
192 return 'Other' | |
193 | |
194 def setAction(state, t, action): | 218 def setAction(state, t, action): |
219 assert isinstance(action, Action) | |
195 key = (state, t) | 220 key = (state, t) |
196 assert type(state) is int | 221 assert type(state) is int |
197 assert type(t) is str | 222 assert type(t) is str |
198 if key in action_table: | 223 if key in action_table: |
199 action2 = action_table[key] | 224 action2 = action_table[key] |
200 if action != action2: | 225 if action != action2: |
201 if (action2[0] == REDUCE) and (action[0] == SHIFT): | 226 if (type(action2) is Reduce) and (type(action) is Shift): |
202 # Automatically resolve and do the shift action! | 227 # Automatically resolve and do the shift action! |
203 # Simple, but almost always what you want!! | 228 # Simple, but almost always what you want!! |
204 action_table[key] = action | 229 action_table[key] = action |
230 elif isinstance(action2, Shift) and isinstance(action, Reduce): | |
231 pass | |
205 else: | 232 else: |
206 if (action2[0] == SHIFT) and (action[0] == REDUCE): | 233 a1 = str(action) |
207 pass | 234 a2 = str(action2) |
208 else: | 235 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) |
209 a1 = action_str(action) | |
210 a2 = action_str(action2) | |
211 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) | |
212 else: | 236 else: |
213 action_table[key] = action | 237 action_table[key] = action |
214 | 238 |
215 # Fill action table: | 239 # Fill action table: |
216 for state in states: | 240 for state in states: |
217 # Detect conflicts: | 241 # Detect conflicts: |
218 for item in state: | 242 for item in state: |
219 if item.IsShift and item.Next in self.terminals: | 243 if item.IsShift and item.Next in self.terminals: |
220 # Rule 1, a shift item: | 244 # Rule 1, a shift item: |
221 nextstate = transitions[(states.index(state), item.Next)] | 245 nextstate = transitions[(states.index(state), item.Next)] |
222 setAction(states.index(state), item.Next, (SHIFT, nextstate)) | 246 setAction(states.index(state), item.Next, Shift(nextstate)) |
223 if item.IsReduce: | 247 if item.IsReduce: |
224 if item.production.name == self.start_symbol and item.look_ahead == EOF: | 248 if item.production.name == self.start_symbol and item.look_ahead == EOF: |
225 # Rule 3: accept: | 249 # Rule 3: accept: |
226 setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production))) | 250 act = Accept(self.productions.index(item.production)) |
227 else: | 251 else: |
228 # Rule 2, reduce item: | 252 # Rule 2, reduce item: |
229 setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production))) | 253 act = Reduce(self.productions.index(item.production)) |
254 setAction(states.index(state), item.look_ahead, act) | |
230 for nt in self.nonterminals: | 255 for nt in self.nonterminals: |
231 key = (states.index(state), nt) | 256 key = (states.index(state), nt) |
232 if key in transitions: | 257 if key in transitions: |
233 goto_table[key] = transitions[key] | 258 goto_table[key] = transitions[key] |
234 return action_table, goto_table | 259 return action_table, goto_table |
240 self.name = name | 265 self.name = name |
241 self.symbols = symbols | 266 self.symbols = symbols |
242 self.f = f | 267 self.f = f |
243 | 268 |
244 def __repr__(self): | 269 def __repr__(self): |
245 return '{0} -> {1}'.format(self.name, self.symbols) | 270 action = ' ' + str(self.f) if self.f else '' |
271 return '{0} -> {1}'.format(self.name, self.symbols) + action | |
246 | 272 |
247 | 273 |
248 class Item: | 274 class Item: |
249 """ | 275 """ |
250 Represents a partially parsed item | 276 Represents a partially parsed item |
251 It has a production it is looking for, a position | 277 It has a production it is looking for, a position |
252 in this production called the 'dot' and a look ahead | 278 in this production called the 'dot' and a look ahead |
253 symbol that must follow this item. | 279 symbol that must follow this item. |
254 """ | 280 """ |
255 def __init__(self, production, dotpos, look_ahead): | 281 def __init__(self, production, dotpos, look_ahead): |
309 args = (name, predot, postdot, self.look_ahead) | 335 args = (name, predot, postdot, self.look_ahead) |
310 return '[{0} -> {1} . {2} -> {3}]'.format(*args) | 336 return '[{0} -> {1} . {2} -> {3}]'.format(*args) |
311 | 337 |
312 | 338 |
313 class LRParser: | 339 class LRParser: |
314 """ LR parser """ | 340 """ LR parser automata """ |
315 def __init__(self, action_table, goto_table, start_symbol): | 341 def __init__(self, action_table, goto_table, start_symbol): |
316 self.action_table = action_table | 342 self.action_table = action_table |
317 self.goto_table = goto_table | 343 self.goto_table = goto_table |
318 self.start_symbol = start_symbol | 344 self.start_symbol = start_symbol |
319 | 345 |
320 def parse(self, toks): | 346 def parse(self, lexer): |
321 """ Parse an iterable with tokens """ | 347 """ Parse an iterable with tokens """ |
322 assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks)) | 348 assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer)) |
323 stack = [0] | 349 stack = [0] |
324 r_data_stack = [] | 350 r_data_stack = [] |
325 try: | 351 look_ahead = lexer.next_token() |
326 look_ahead = toks.__next__() | |
327 except StopIteration: | |
328 look_ahead = Token(EOF, EOF) | |
329 assert type(look_ahead) is Token | 352 assert type(look_ahead) is Token |
330 # TODO: exit on this condition: | 353 # TODO: exit on this condition: |
331 while stack != [0, self.start_symbol, 2222]: | 354 while stack != [0, self.start_symbol, 2222]: |
332 #print(stack) | |
333 state = stack[-1] # top of stack | 355 state = stack[-1] # top of stack |
334 key = (state, look_ahead.typ) | 356 key = (state, look_ahead.typ) |
335 if not key in self.action_table: | 357 if not key in self.action_table: |
336 raise ParserException('Error parsing at character {0}'.format(look_ahead)) | 358 raise ParserException('Error parsing at character {0}'.format(look_ahead)) |
337 action, param = self.action_table[key] | 359 action = self.action_table[key] |
338 if action == REDUCE: | 360 if type(action) is Reduce: |
339 f_args = [] | 361 f_args = [] |
340 param = self.grammar.productions[param] | 362 prod = self.grammar.productions[action.rule] |
341 for s in param.symbols: | 363 for s in prod.symbols: |
342 stack.pop() | 364 stack.pop() |
343 stack.pop() | 365 stack.pop() |
344 f_args.append(r_data_stack.pop()) | 366 f_args.append(r_data_stack.pop()) |
345 f_args.reverse() | 367 f_args.reverse() |
346 r_data = None | 368 r_data = None |
347 if param.f: | 369 if prod.f: |
348 r_data = param.f(*f_args) | 370 r_data = prod.f(*f_args) |
349 state = stack[-1] | 371 state = stack[-1] |
350 stack.append(param.name) | 372 stack.append(prod.name) |
351 stack.append(self.goto_table[(state, param.name)]) | 373 stack.append(self.goto_table[(state, prod.name)]) |
352 r_data_stack.append(r_data) | 374 r_data_stack.append(r_data) |
353 elif action == SHIFT: | 375 elif type(action) is Shift: |
354 stack.append(look_ahead.typ) | 376 stack.append(look_ahead.typ) |
355 stack.append(param) | 377 stack.append(action.to_state) |
356 r_data_stack.append(look_ahead.val) | 378 r_data_stack.append(look_ahead) |
357 try: | 379 look_ahead = lexer.next_token() |
358 look_ahead = toks.__next__() | |
359 except StopIteration: | |
360 look_ahead = Token(EOF, EOF) | |
361 assert type(look_ahead) is Token | 380 assert type(look_ahead) is Token |
362 elif action == ACCEPT: | 381 elif type(action) is Accept: |
363 # Pop last rule data off the stack: | 382 # Pop last rule data off the stack: |
364 f_args = [] | 383 f_args = [] |
365 param = self.grammar.productions[param] | 384 param = self.grammar.productions[action.rule] |
366 for s in param.symbols: | 385 for s in param.symbols: |
367 stack.pop() | 386 stack.pop() |
368 stack.pop() | 387 stack.pop() |
369 f_args.append(r_data_stack.pop()) | 388 f_args.append(r_data_stack.pop()) |
370 f_args.reverse() | 389 f_args.reverse() |
371 if param.f: | 390 if param.f: |
372 param.f(*f_args) | 391 ret_val = param.f(*f_args) |
392 else: | |
393 ret_val = None | |
373 # Break out! | 394 # Break out! |
374 break | 395 break |
375 # At exit, the stack must be 1 long | 396 # At exit, the stack must be 1 long |
376 # TODO: fix that this holds: | 397 # TODO: fix that this holds: |
377 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) | 398 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) |
378 | 399 return ret_val |