Mercurial > lcfOS
annotate python/pyyacc.py @ 399:a7c444404df9
Fix hexwrite
author | Windel Bouwman |
---|---|
date | Fri, 20 Jun 2014 16:36:49 +0200 |
parents | 3b0c495e3008 |
children |
rev | line source |
---|---|
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
1 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
2 Parser generator script |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
3 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
4 |
191 | 5 from ppci import Token |
6 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
7 EPS = 'EPS' |
184 | 8 EOF = 'EOF' |
318 | 9 |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
10 |
185 | 11 class ParserGenerationException(Exception): |
186 | 12 """ Raised when something goes wrong during parser generation """ |
185 | 13 pass |
14 | |
15 | |
16 class ParserException(Exception): | |
186 | 17 """ Raised during a failure in the parsing process """ |
185 | 18 pass |
19 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
20 |
318 | 21 class Action: |
22 def __repr__(self): | |
23 return 'Action' | |
24 | |
25 def __eq__(self, other): | |
26 return str(self) == str(other) | |
27 | |
28 | |
29 class Shift(Action): | |
30 def __init__(self, to_state): | |
31 self.to_state = to_state | |
32 | |
33 def __repr__(self): | |
34 return 'Shift({})'.format(self.to_state) | |
35 | |
36 | |
37 class Reduce(Action): | |
38 def __init__(self, rule): | |
39 self.rule = rule | |
40 | |
41 def __repr__(self): | |
42 return 'Reduce({})'.format(self.rule) | |
43 | |
44 | |
45 class Accept(Reduce): | |
46 def __repr__(self): | |
47 return 'Accept({})'.format(self.rule) | |
48 | |
49 | |
50 def print_grammar(g): | |
51 """ Pretty print a grammar """ | |
52 print(g) | |
53 for production in g.productions: | |
54 print(production) | |
55 | |
319 | 56 |
57 def calculate_first_sets(grammar): | |
58 """ | |
59 Calculate first sets for each grammar symbol | |
60 This is a dictionary which maps each grammar symbol | |
61 to a set of terminals that can be encountered first | |
62 when looking for the symbol. | |
63 """ | |
64 first = {} | |
65 nullable = {} | |
66 for terminal in grammar.terminals | {EOF, EPS}: | |
67 first[terminal] = set([terminal]) | |
68 nullable[terminal] = False | |
69 for nt in grammar.nonterminals: | |
70 first[nt] = set() | |
71 nullable[nt] = False | |
72 while True: | |
73 some_change = False | |
74 for rule in grammar.productions: | |
75 # Check for null-ability: | |
76 if all(nullable[beta] for beta in rule.symbols): | |
77 if not nullable[rule.name]: | |
78 nullable[rule.name] = True | |
79 some_change = True | |
80 # Update first sets: | |
81 for beta in rule.symbols: | |
82 if not nullable[beta]: | |
83 if first[beta] - first[rule.name]: | |
84 first[rule.name] |= first[beta] | |
85 some_change = True | |
86 break | |
87 if not some_change: | |
88 break | |
89 return first | |
90 | |
91 | |
178 | 92 class Grammar: |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
93 """ Defines a grammar of a language """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
94 def __init__(self, terminals): |
319 | 95 self.terminals = set(terminals) |
96 self.nonterminals = set() | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
97 self.productions = [] |
185 | 98 self._first = None # Cached first set |
318 | 99 self.start_symbol = None |
100 | |
101 def __repr__(self): | |
102 return 'Grammar with {} rules'.format(len(self.productions)) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
103 |
194 | 104 def add_production(self, name, symbols, f=None): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
105 """ Add a production rule to the grammar """ |
194 | 106 production = Production(name, symbols, f) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
107 self.productions.append(production) |
192 | 108 if name in self.terminals: |
109 raise ParserGenerationException("Cannot redefine terminal {0}".format(name)) | |
319 | 110 self.nonterminals.add(name) |
194 | 111 self._first = None # Invalidate cached version |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
112 |
383 | 113 def add_one_or_more(self, element_nonterm, list_nonterm): |
114 """ Helper to add the rule | |
115 lst: elem | |
116 lst: lst elem | |
117 """ | |
118 def a(el): | |
119 return [el] | |
120 def b(ls, el): | |
121 ls.append(el) | |
122 return ls | |
123 self.add_production(list_nonterm, [element_nonterm], a) | |
124 self.add_production(list_nonterm, [list_nonterm, element_nonterm], b) | |
125 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
126 def productionsForName(self, name): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
127 """ Retrieve all productions for a non terminal """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
128 return [p for p in self.productions if p.name == name] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
129 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
130 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
131 def Symbols(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
132 """ Get all the symbols defined by this grammar """ |
319 | 133 return self.nonterminals | self.terminals |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
134 |
185 | 135 @property |
136 def first(self): | |
318 | 137 """ |
186 | 138 The first set is a mapping from a grammar symbol to a set of |
139 set of all terminal symbols that can be the first terminal when | |
318 | 140 looking for the grammar symbol |
186 | 141 """ |
185 | 142 if not self._first: |
319 | 143 self._first = calculate_first_sets(self) |
185 | 144 return self._first |
145 | |
184 | 146 def closure(self, itemset): |
147 """ Expand itemset by using epsilon moves """ | |
148 worklist = list(itemset) | |
149 def addIt(itm): | |
150 if not itm in itemset: | |
151 itemset.add(itm) | |
152 worklist.append(itm) | |
186 | 153 def first2(itm): |
184 | 154 # When using the first sets, create a copy: |
186 | 155 f = set(self.first[itm.NextNext]) |
184 | 156 if EPS in f: |
157 f.discard(EPS) | |
186 | 158 f.add(itm.look_ahead) |
184 | 159 return f |
319 | 160 # Start of algorithm: |
184 | 161 while worklist: |
162 item = worklist.pop(0) | |
163 if not item.IsShift: | |
164 continue | |
165 if not (item.Next in self.nonterminals): | |
166 continue | |
167 C = item.Next | |
168 for add_p in self.productionsForName(C): | |
186 | 169 for b in first2(item): |
184 | 170 addIt(Item(add_p, 0, b)) |
171 return frozenset(itemset) | |
172 | |
173 def initialItemSet(self): | |
174 """ Calculates the initial item set """ | |
175 iis = set() | |
176 for p in self.productionsForName(self.start_symbol): | |
177 iis.add(Item(p, 0, EOF)) | |
178 return self.closure(iis) | |
179 | |
180 def nextItemSet(self, itemset, symbol): | |
318 | 181 """ |
319 | 182 Determines the next itemset for the current set and a symbol |
184 | 183 This is the goto procedure |
184 """ | |
185 next_set = set() | |
186 for item in itemset: | |
187 if item.can_shift_over(symbol): | |
188 next_set.add(item.shifted()) | |
189 return self.closure(next_set) | |
318 | 190 |
184 | 191 def genCanonicalSet(self, iis): |
185 | 192 states = [] |
184 | 193 worklist = [] |
185 | 194 transitions = {} |
184 | 195 def addSt(s): |
196 if not (s in states): | |
197 worklist.append(s) | |
185 | 198 states.append(s) |
184 | 199 addSt(iis) |
200 while len(worklist) > 0: | |
201 itemset = worklist.pop(0) | |
202 for symbol in self.Symbols: | |
203 nis = self.nextItemSet(itemset, symbol) | |
204 if not nis: | |
205 continue | |
206 addSt(nis) | |
185 | 207 transitions[(states.index(itemset), symbol)] = states.index(nis) |
208 return states, transitions | |
318 | 209 |
192 | 210 def checkSymbols(self): |
211 """ Checks no symbols are undefined """ | |
212 for production in self.productions: | |
213 for symbol in production.symbols: | |
319 | 214 if symbol not in self.Symbols: |
192 | 215 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) |
216 | |
340 | 217 def generate_parser(self): |
218 """ Generates a parser from the grammar """ | |
219 action_table, goto_table = self.generate_tables() | |
218 | 220 p = LRParser(action_table, goto_table, self.start_symbol) |
221 p.grammar = self | |
222 return p | |
223 | |
340 | 224 def generate_tables(self): |
225 """ Generate parsing tables """ | |
318 | 226 if not self.start_symbol: |
227 self.start_symbol = self.productions[0].name | |
192 | 228 self.checkSymbols() |
184 | 229 action_table = {} |
185 | 230 goto_table = {} |
184 | 231 iis = self.initialItemSet() |
232 | |
233 # First generate all item sets by using the nextItemset function: | |
185 | 234 states, transitions = self.genCanonicalSet(iis) |
194 | 235 |
185 | 236 def setAction(state, t, action): |
318 | 237 assert isinstance(action, Action) |
185 | 238 key = (state, t) |
218 | 239 assert type(state) is int |
240 assert type(t) is str | |
185 | 241 if key in action_table: |
242 action2 = action_table[key] | |
243 if action != action2: | |
318 | 244 if (type(action2) is Reduce) and (type(action) is Shift): |
194 | 245 # Automatically resolve and do the shift action! |
246 # Simple, but almost always what you want!! | |
247 action_table[key] = action | |
318 | 248 elif isinstance(action2, Shift) and isinstance(action, Reduce): |
249 pass | |
194 | 250 else: |
318 | 251 a1 = str(action) |
252 a2 = str(action2) | |
253 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) | |
185 | 254 else: |
255 action_table[key] = action | |
184 | 256 |
257 # Fill action table: | |
258 for state in states: | |
185 | 259 # Detect conflicts: |
184 | 260 for item in state: |
261 if item.IsShift and item.Next in self.terminals: | |
185 | 262 # Rule 1, a shift item: |
263 nextstate = transitions[(states.index(state), item.Next)] | |
318 | 264 setAction(states.index(state), item.Next, Shift(nextstate)) |
185 | 265 if item.IsReduce: |
266 if item.production.name == self.start_symbol and item.look_ahead == EOF: | |
267 # Rule 3: accept: | |
318 | 268 act = Accept(self.productions.index(item.production)) |
184 | 269 else: |
185 | 270 # Rule 2, reduce item: |
318 | 271 act = Reduce(self.productions.index(item.production)) |
272 setAction(states.index(state), item.look_ahead, act) | |
185 | 273 for nt in self.nonterminals: |
274 key = (states.index(state), nt) | |
275 if key in transitions: | |
276 goto_table[key] = transitions[key] | |
218 | 277 return action_table, goto_table |
185 | 278 |
178 | 279 |
280 class Production: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
281 """ Production rule for a grammar """ |
341 | 282 def __init__(self, name, symbols, f): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
283 self.name = name |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
284 self.symbols = symbols |
194 | 285 self.f = f |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
286 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
287 def __repr__(self): |
318 | 288 action = ' ' + str(self.f) if self.f else '' |
289 return '{0} -> {1}'.format(self.name, self.symbols) + action | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
290 |
179 | 291 |
178 | 292 class Item: |
318 | 293 """ |
294 Represents a partially parsed item | |
186 | 295 It has a production it is looking for, a position |
296 in this production called the 'dot' and a look ahead | |
297 symbol that must follow this item. | |
298 """ | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
299 def __init__(self, production, dotpos, look_ahead): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
300 self.production = production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
301 self.dotpos = dotpos |
184 | 302 assert self.dotpos <= len(self.production.symbols) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
303 self.look_ahead = look_ahead |
346 | 304 self._is_shift = self.dotpos < len(self.production.symbols) |
395 | 305 self.IsShift = self._is_shift |
306 if self.IsShift: | |
307 self.Next = self.production.symbols[self.dotpos] | |
308 self._data = (self.production, self.dotpos, self.look_ahead) | |
309 self._hash = self._data.__hash__() | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
310 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
311 def __eq__(self, other): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
312 if type(other) is type(self): |
395 | 313 return self._data == other._data |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
314 return False |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
315 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
316 def __hash__(self): |
395 | 317 return self._hash |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
318 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
319 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
320 def IsReduce(self): |
186 | 321 """ Check if this item has the dot at the end """ |
346 | 322 return not self._is_shift |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
323 |
184 | 324 def can_shift_over(self, symbol): |
325 """ Determines if this item can shift over the given symbol """ | |
395 | 326 return self._is_shift and self.Next == symbol |
184 | 327 |
328 def shifted(self): | |
329 """ Creates a new item that is shifted one position """ | |
330 return Item(self.production, self.dotpos + 1, self.look_ahead) | |
331 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
332 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
333 def NextNext(self): |
186 | 334 """ Gets the symbol after the next symbol, or EPS if at the end """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
335 if self.dotpos + 1 >= len(self.production.symbols): |
184 | 336 return EPS |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
337 else: |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
338 return self.production.symbols[self.dotpos + 1] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
339 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
340 def __repr__(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
341 prod = self.production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
342 predot = ' '.join(prod.symbols[0:self.dotpos]) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
343 postdot = ' '.join(prod.symbols[self.dotpos:]) |
186 | 344 name = prod.name |
345 args = (name, predot, postdot, self.look_ahead) | |
185 | 346 return '[{0} -> {1} . {2} -> {3}]'.format(*args) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
347 |
179 | 348 |
349 class LRParser: | |
395 | 350 """ LR parser automata. This class takes goto and action table |
351 and can then process a sequence of tokens. | |
352 """ | |
195 | 353 def __init__(self, action_table, goto_table, start_symbol): |
184 | 354 self.action_table = action_table |
185 | 355 self.goto_table = goto_table |
195 | 356 self.start_symbol = start_symbol |
184 | 357 |
318 | 358 def parse(self, lexer): |
191 | 359 """ Parse an iterable with tokens """ |
318 | 360 assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer)) |
185 | 361 stack = [0] |
195 | 362 r_data_stack = [] |
318 | 363 look_ahead = lexer.next_token() |
191 | 364 assert type(look_ahead) is Token |
195 | 365 # TODO: exit on this condition: |
377 | 366 while stack != [0, self.start_symbol, 0]: |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
367 state = stack[-1] # top of stack |
191 | 368 key = (state, look_ahead.typ) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
369 if not key in self.action_table: |
194 | 370 raise ParserException('Error parsing at character {0}'.format(look_ahead)) |
318 | 371 action = self.action_table[key] |
372 if type(action) is Reduce: | |
194 | 373 f_args = [] |
318 | 374 prod = self.grammar.productions[action.rule] |
375 for s in prod.symbols: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
376 stack.pop() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
377 stack.pop() |
195 | 378 f_args.append(r_data_stack.pop()) |
379 f_args.reverse() | |
380 r_data = None | |
318 | 381 if prod.f: |
382 r_data = prod.f(*f_args) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
383 state = stack[-1] |
318 | 384 stack.append(prod.name) |
385 stack.append(self.goto_table[(state, prod.name)]) | |
195 | 386 r_data_stack.append(r_data) |
318 | 387 elif type(action) is Shift: |
191 | 388 stack.append(look_ahead.typ) |
318 | 389 stack.append(action.to_state) |
390 r_data_stack.append(look_ahead) | |
391 look_ahead = lexer.next_token() | |
191 | 392 assert type(look_ahead) is Token |
318 | 393 elif type(action) is Accept: |
195 | 394 # Pop last rule data off the stack: |
395 f_args = [] | |
318 | 396 param = self.grammar.productions[action.rule] |
195 | 397 for s in param.symbols: |
398 stack.pop() | |
399 stack.pop() | |
400 f_args.append(r_data_stack.pop()) | |
401 f_args.reverse() | |
402 if param.f: | |
318 | 403 ret_val = param.f(*f_args) |
404 else: | |
405 ret_val = None | |
195 | 406 # Break out! |
377 | 407 stack.append(param.name) |
408 stack.append(0) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
409 break |
195 | 410 # At exit, the stack must be 1 long |
411 # TODO: fix that this holds: | |
377 | 412 #assert stack == [0, self.start_symbol, 0] |
318 | 413 return ret_val |
377 | 414 |