Mercurial > lcfOS
annotate python/pyyacc.py @ 334:6f4753202b9a
Added more recipes
author | Windel Bouwman |
---|---|
date | Thu, 13 Feb 2014 22:02:08 +0100 |
parents | 8d07a4254f04 |
children | c7cc54c0dfdf |
rev | line source |
---|---|
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
1 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
2 Parser generator script |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
3 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
4 |
191 | 5 from ppci import Token |
6 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
7 EPS = 'EPS' |
184 | 8 EOF = 'EOF' |
318 | 9 |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
10 |
185 | 11 class ParserGenerationException(Exception): |
186 | 12 """ Raised when something goes wrong during parser generation """ |
185 | 13 pass |
14 | |
15 | |
16 class ParserException(Exception): | |
186 | 17 """ Raised during a failure in the parsing process """ |
185 | 18 pass |
19 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
20 |
318 | 21 class Action: |
22 def __repr__(self): | |
23 return 'Action' | |
24 | |
25 def __eq__(self, other): | |
26 return str(self) == str(other) | |
27 | |
28 | |
29 class Shift(Action): | |
30 def __init__(self, to_state): | |
31 self.to_state = to_state | |
32 | |
33 def __repr__(self): | |
34 return 'Shift({})'.format(self.to_state) | |
35 | |
36 | |
37 class Reduce(Action): | |
38 def __init__(self, rule): | |
39 self.rule = rule | |
40 | |
41 def __repr__(self): | |
42 return 'Reduce({})'.format(self.rule) | |
43 | |
44 | |
45 class Accept(Reduce): | |
46 def __repr__(self): | |
47 return 'Accept({})'.format(self.rule) | |
48 | |
49 | |
50 def print_grammar(g): | |
51 """ Pretty print a grammar """ | |
52 print(g) | |
53 for production in g.productions: | |
54 print(production) | |
55 | |
319 | 56 |
57 def calculate_first_sets(grammar): | |
58 """ | |
59 Calculate first sets for each grammar symbol | |
60 This is a dictionary which maps each grammar symbol | |
61 to a set of terminals that can be encountered first | |
62 when looking for the symbol. | |
63 """ | |
64 first = {} | |
65 nullable = {} | |
66 for terminal in grammar.terminals | {EOF, EPS}: | |
67 first[terminal] = set([terminal]) | |
68 nullable[terminal] = False | |
69 for nt in grammar.nonterminals: | |
70 first[nt] = set() | |
71 nullable[nt] = False | |
72 while True: | |
73 some_change = False | |
74 for rule in grammar.productions: | |
75 # Check for null-ability: | |
76 if all(nullable[beta] for beta in rule.symbols): | |
77 if not nullable[rule.name]: | |
78 nullable[rule.name] = True | |
79 some_change = True | |
80 # Update first sets: | |
81 for beta in rule.symbols: | |
82 if not nullable[beta]: | |
83 if first[beta] - first[rule.name]: | |
84 first[rule.name] |= first[beta] | |
85 some_change = True | |
86 break | |
87 if not some_change: | |
88 break | |
89 return first | |
90 | |
91 | |
178 | 92 class Grammar: |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
93 """ Defines a grammar of a language """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
94 def __init__(self, terminals): |
319 | 95 self.terminals = set(terminals) |
96 self.nonterminals = set() | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
97 self.productions = [] |
185 | 98 self._first = None # Cached first set |
318 | 99 self.start_symbol = None |
100 | |
101 def __repr__(self): | |
102 return 'Grammar with {} rules'.format(len(self.productions)) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
103 |
194 | 104 def add_production(self, name, symbols, f=None): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
105 """ Add a production rule to the grammar """ |
194 | 106 production = Production(name, symbols, f) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
107 self.productions.append(production) |
192 | 108 if name in self.terminals: |
109 raise ParserGenerationException("Cannot redefine terminal {0}".format(name)) | |
319 | 110 self.nonterminals.add(name) |
194 | 111 self._first = None # Invalidate cached version |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
112 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
113 def productionsForName(self, name): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
114 """ Retrieve all productions for a non terminal """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
115 return [p for p in self.productions if p.name == name] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
116 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
117 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
118 def Symbols(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
119 """ Get all the symbols defined by this grammar """ |
319 | 120 return self.nonterminals | self.terminals |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
121 |
185 | 122 @property |
123 def first(self): | |
318 | 124 """ |
186 | 125 The first set is a mapping from a grammar symbol to a set of |
126 set of all terminal symbols that can be the first terminal when | |
318 | 127 looking for the grammar symbol |
186 | 128 """ |
185 | 129 if not self._first: |
319 | 130 self._first = calculate_first_sets(self) |
185 | 131 return self._first |
132 | |
184 | 133 def closure(self, itemset): |
134 """ Expand itemset by using epsilon moves """ | |
135 worklist = list(itemset) | |
136 def addIt(itm): | |
137 if not itm in itemset: | |
138 itemset.add(itm) | |
139 worklist.append(itm) | |
186 | 140 def first2(itm): |
184 | 141 # When using the first sets, create a copy: |
186 | 142 f = set(self.first[itm.NextNext]) |
184 | 143 if EPS in f: |
144 f.discard(EPS) | |
186 | 145 f.add(itm.look_ahead) |
184 | 146 return f |
319 | 147 # Start of algorithm: |
184 | 148 while worklist: |
149 item = worklist.pop(0) | |
150 if not item.IsShift: | |
151 continue | |
152 if not (item.Next in self.nonterminals): | |
153 continue | |
154 C = item.Next | |
155 for add_p in self.productionsForName(C): | |
186 | 156 for b in first2(item): |
184 | 157 addIt(Item(add_p, 0, b)) |
158 return frozenset(itemset) | |
159 | |
160 def initialItemSet(self): | |
161 """ Calculates the initial item set """ | |
162 iis = set() | |
163 for p in self.productionsForName(self.start_symbol): | |
164 iis.add(Item(p, 0, EOF)) | |
165 return self.closure(iis) | |
166 | |
167 def nextItemSet(self, itemset, symbol): | |
318 | 168 """ |
319 | 169 Determines the next itemset for the current set and a symbol |
184 | 170 This is the goto procedure |
171 """ | |
172 next_set = set() | |
173 for item in itemset: | |
174 if item.can_shift_over(symbol): | |
175 next_set.add(item.shifted()) | |
176 return self.closure(next_set) | |
318 | 177 |
184 | 178 def genCanonicalSet(self, iis): |
185 | 179 states = [] |
184 | 180 worklist = [] |
185 | 181 transitions = {} |
184 | 182 def addSt(s): |
183 if not (s in states): | |
184 worklist.append(s) | |
185 | 185 states.append(s) |
184 | 186 addSt(iis) |
187 while len(worklist) > 0: | |
188 itemset = worklist.pop(0) | |
189 for symbol in self.Symbols: | |
190 nis = self.nextItemSet(itemset, symbol) | |
191 if not nis: | |
192 continue | |
193 addSt(nis) | |
185 | 194 transitions[(states.index(itemset), symbol)] = states.index(nis) |
195 return states, transitions | |
318 | 196 |
192 | 197 def checkSymbols(self): |
198 """ Checks no symbols are undefined """ | |
199 for production in self.productions: | |
200 for symbol in production.symbols: | |
319 | 201 if symbol not in self.Symbols: |
192 | 202 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) |
203 | |
184 | 204 def genParser(self): |
218 | 205 """ Generates a parser from the grammar (using a caching algorithm) """ |
240 | 206 action_table, goto_table = self.doGenerate() |
218 | 207 p = LRParser(action_table, goto_table, self.start_symbol) |
208 p.grammar = self | |
209 return p | |
210 | |
211 def doGenerate(self): | |
318 | 212 if not self.start_symbol: |
213 self.start_symbol = self.productions[0].name | |
192 | 214 self.checkSymbols() |
184 | 215 action_table = {} |
185 | 216 goto_table = {} |
184 | 217 iis = self.initialItemSet() |
218 | |
219 # First generate all item sets by using the nextItemset function: | |
185 | 220 states, transitions = self.genCanonicalSet(iis) |
194 | 221 |
185 | 222 def setAction(state, t, action): |
318 | 223 assert isinstance(action, Action) |
185 | 224 key = (state, t) |
218 | 225 assert type(state) is int |
226 assert type(t) is str | |
185 | 227 if key in action_table: |
228 action2 = action_table[key] | |
229 if action != action2: | |
318 | 230 if (type(action2) is Reduce) and (type(action) is Shift): |
194 | 231 # Automatically resolve and do the shift action! |
232 # Simple, but almost always what you want!! | |
233 action_table[key] = action | |
318 | 234 elif isinstance(action2, Shift) and isinstance(action, Reduce): |
235 pass | |
194 | 236 else: |
318 | 237 a1 = str(action) |
238 a2 = str(action2) | |
239 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) | |
185 | 240 else: |
241 action_table[key] = action | |
184 | 242 |
243 # Fill action table: | |
244 for state in states: | |
185 | 245 # Detect conflicts: |
184 | 246 for item in state: |
247 if item.IsShift and item.Next in self.terminals: | |
185 | 248 # Rule 1, a shift item: |
249 nextstate = transitions[(states.index(state), item.Next)] | |
318 | 250 setAction(states.index(state), item.Next, Shift(nextstate)) |
185 | 251 if item.IsReduce: |
252 if item.production.name == self.start_symbol and item.look_ahead == EOF: | |
253 # Rule 3: accept: | |
318 | 254 act = Accept(self.productions.index(item.production)) |
184 | 255 else: |
185 | 256 # Rule 2, reduce item: |
318 | 257 act = Reduce(self.productions.index(item.production)) |
258 setAction(states.index(state), item.look_ahead, act) | |
185 | 259 for nt in self.nonterminals: |
260 key = (states.index(state), nt) | |
261 if key in transitions: | |
262 goto_table[key] = transitions[key] | |
218 | 263 return action_table, goto_table |
185 | 264 |
178 | 265 |
266 class Production: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
267 """ Production rule for a grammar """ |
194 | 268 def __init__(self, name, symbols, f=None): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
269 self.name = name |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
270 self.symbols = symbols |
194 | 271 self.f = f |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
272 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
273 def __repr__(self): |
318 | 274 action = ' ' + str(self.f) if self.f else '' |
275 return '{0} -> {1}'.format(self.name, self.symbols) + action | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
276 |
179 | 277 |
178 | 278 class Item: |
318 | 279 """ |
280 Represents a partially parsed item | |
186 | 281 It has a production it is looking for, a position |
282 in this production called the 'dot' and a look ahead | |
283 symbol that must follow this item. | |
284 """ | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
285 def __init__(self, production, dotpos, look_ahead): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
286 self.production = production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
287 self.dotpos = dotpos |
184 | 288 assert self.dotpos <= len(self.production.symbols) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
289 self.look_ahead = look_ahead |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
290 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
291 def getdata(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
292 """ Gets the members as a tuple """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
293 return (self.production, self.dotpos, self.look_ahead) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
294 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
295 def __eq__(self, other): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
296 if type(other) is type(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
297 return self.getdata() == other.getdata() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
298 return False |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
299 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
300 def __hash__(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
301 return self.getdata().__hash__() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
302 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
303 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
304 def IsReduce(self): |
186 | 305 """ Check if this item has the dot at the end """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
306 return self.dotpos == len(self.production.symbols) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
307 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
308 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
309 def IsShift(self): |
186 | 310 """ Check if this item is a shift item, i.e. the dot can proceed """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
311 return not self.IsReduce |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
312 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
313 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
314 def Next(self): |
186 | 315 """ Returns the symbol after the dot """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
316 return self.production.symbols[self.dotpos] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
317 |
184 | 318 def can_shift_over(self, symbol): |
319 """ Determines if this item can shift over the given symbol """ | |
320 return self.IsShift and self.Next == symbol | |
321 | |
322 def shifted(self): | |
323 """ Creates a new item that is shifted one position """ | |
324 return Item(self.production, self.dotpos + 1, self.look_ahead) | |
325 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
326 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
327 def NextNext(self): |
186 | 328 """ Gets the symbol after the next symbol, or EPS if at the end """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
329 if self.dotpos + 1 >= len(self.production.symbols): |
184 | 330 return EPS |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
331 else: |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
332 return self.production.symbols[self.dotpos + 1] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
333 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
334 def __repr__(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
335 prod = self.production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
336 predot = ' '.join(prod.symbols[0:self.dotpos]) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
337 postdot = ' '.join(prod.symbols[self.dotpos:]) |
186 | 338 name = prod.name |
339 args = (name, predot, postdot, self.look_ahead) | |
185 | 340 return '[{0} -> {1} . {2} -> {3}]'.format(*args) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
341 |
179 | 342 |
343 class LRParser: | |
318 | 344 """ LR parser automata """ |
195 | 345 def __init__(self, action_table, goto_table, start_symbol): |
184 | 346 self.action_table = action_table |
185 | 347 self.goto_table = goto_table |
195 | 348 self.start_symbol = start_symbol |
184 | 349 |
318 | 350 def parse(self, lexer): |
191 | 351 """ Parse an iterable with tokens """ |
318 | 352 assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer)) |
185 | 353 stack = [0] |
195 | 354 r_data_stack = [] |
318 | 355 look_ahead = lexer.next_token() |
191 | 356 assert type(look_ahead) is Token |
195 | 357 # TODO: exit on this condition: |
358 while stack != [0, self.start_symbol, 2222]: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
359 state = stack[-1] # top of stack |
191 | 360 key = (state, look_ahead.typ) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
361 if not key in self.action_table: |
194 | 362 raise ParserException('Error parsing at character {0}'.format(look_ahead)) |
318 | 363 action = self.action_table[key] |
364 if type(action) is Reduce: | |
194 | 365 f_args = [] |
318 | 366 prod = self.grammar.productions[action.rule] |
367 for s in prod.symbols: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
368 stack.pop() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
369 stack.pop() |
195 | 370 f_args.append(r_data_stack.pop()) |
371 f_args.reverse() | |
372 r_data = None | |
318 | 373 if prod.f: |
374 r_data = prod.f(*f_args) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
375 state = stack[-1] |
318 | 376 stack.append(prod.name) |
377 stack.append(self.goto_table[(state, prod.name)]) | |
195 | 378 r_data_stack.append(r_data) |
318 | 379 elif type(action) is Shift: |
191 | 380 stack.append(look_ahead.typ) |
318 | 381 stack.append(action.to_state) |
382 r_data_stack.append(look_ahead) | |
383 look_ahead = lexer.next_token() | |
191 | 384 assert type(look_ahead) is Token |
318 | 385 elif type(action) is Accept: |
195 | 386 # Pop last rule data off the stack: |
387 f_args = [] | |
318 | 388 param = self.grammar.productions[action.rule] |
195 | 389 for s in param.symbols: |
390 stack.pop() | |
391 stack.pop() | |
392 f_args.append(r_data_stack.pop()) | |
393 f_args.reverse() | |
394 if param.f: | |
318 | 395 ret_val = param.f(*f_args) |
396 else: | |
397 ret_val = None | |
195 | 398 # Break out! |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
399 break |
195 | 400 # At exit, the stack must be 1 long |
401 # TODO: fix that this holds: | |
319 | 402 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) |
318 | 403 return ret_val |