Mercurial > lcfOS
annotate python/pyyacc.py @ 347:742588fb8cd6 devel
Merge into devel branch
author | Windel Bouwman |
---|---|
date | Fri, 07 Mar 2014 17:10:21 +0100 |
parents | 3bb7dcfe5529 |
children | 9667d78ba79e |
rev | line source |
---|---|
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
1 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
2 Parser generator script |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
3 """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
4 |
191 | 5 from ppci import Token |
6 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
7 EPS = 'EPS' |
184 | 8 EOF = 'EOF' |
318 | 9 |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
10 |
185 | 11 class ParserGenerationException(Exception): |
186 | 12 """ Raised when something goes wrong during parser generation """ |
185 | 13 pass |
14 | |
15 | |
16 class ParserException(Exception): | |
186 | 17 """ Raised during a failure in the parsing process """ |
185 | 18 pass |
19 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
20 |
318 | 21 class Action: |
22 def __repr__(self): | |
23 return 'Action' | |
24 | |
25 def __eq__(self, other): | |
26 return str(self) == str(other) | |
27 | |
28 | |
29 class Shift(Action): | |
30 def __init__(self, to_state): | |
31 self.to_state = to_state | |
32 | |
33 def __repr__(self): | |
34 return 'Shift({})'.format(self.to_state) | |
35 | |
36 | |
37 class Reduce(Action): | |
38 def __init__(self, rule): | |
39 self.rule = rule | |
40 | |
41 def __repr__(self): | |
42 return 'Reduce({})'.format(self.rule) | |
43 | |
44 | |
45 class Accept(Reduce): | |
46 def __repr__(self): | |
47 return 'Accept({})'.format(self.rule) | |
48 | |
49 | |
50 def print_grammar(g): | |
51 """ Pretty print a grammar """ | |
52 print(g) | |
53 for production in g.productions: | |
54 print(production) | |
55 | |
319 | 56 |
57 def calculate_first_sets(grammar): | |
58 """ | |
59 Calculate first sets for each grammar symbol | |
60 This is a dictionary which maps each grammar symbol | |
61 to a set of terminals that can be encountered first | |
62 when looking for the symbol. | |
63 """ | |
64 first = {} | |
65 nullable = {} | |
66 for terminal in grammar.terminals | {EOF, EPS}: | |
67 first[terminal] = set([terminal]) | |
68 nullable[terminal] = False | |
69 for nt in grammar.nonterminals: | |
70 first[nt] = set() | |
71 nullable[nt] = False | |
72 while True: | |
73 some_change = False | |
74 for rule in grammar.productions: | |
75 # Check for null-ability: | |
76 if all(nullable[beta] for beta in rule.symbols): | |
77 if not nullable[rule.name]: | |
78 nullable[rule.name] = True | |
79 some_change = True | |
80 # Update first sets: | |
81 for beta in rule.symbols: | |
82 if not nullable[beta]: | |
83 if first[beta] - first[rule.name]: | |
84 first[rule.name] |= first[beta] | |
85 some_change = True | |
86 break | |
87 if not some_change: | |
88 break | |
89 return first | |
90 | |
91 | |
178 | 92 class Grammar: |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
93 """ Defines a grammar of a language """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
94 def __init__(self, terminals): |
319 | 95 self.terminals = set(terminals) |
96 self.nonterminals = set() | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
97 self.productions = [] |
185 | 98 self._first = None # Cached first set |
318 | 99 self.start_symbol = None |
100 | |
101 def __repr__(self): | |
102 return 'Grammar with {} rules'.format(len(self.productions)) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
103 |
194 | 104 def add_production(self, name, symbols, f=None): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
105 """ Add a production rule to the grammar """ |
194 | 106 production = Production(name, symbols, f) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
107 self.productions.append(production) |
192 | 108 if name in self.terminals: |
109 raise ParserGenerationException("Cannot redefine terminal {0}".format(name)) | |
319 | 110 self.nonterminals.add(name) |
194 | 111 self._first = None # Invalidate cached version |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
112 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
113 def productionsForName(self, name): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
114 """ Retrieve all productions for a non terminal """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
115 return [p for p in self.productions if p.name == name] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
116 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
117 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
118 def Symbols(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
119 """ Get all the symbols defined by this grammar """ |
319 | 120 return self.nonterminals | self.terminals |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
121 |
185 | 122 @property |
123 def first(self): | |
318 | 124 """ |
186 | 125 The first set is a mapping from a grammar symbol to a set of |
126 set of all terminal symbols that can be the first terminal when | |
318 | 127 looking for the grammar symbol |
186 | 128 """ |
185 | 129 if not self._first: |
319 | 130 self._first = calculate_first_sets(self) |
185 | 131 return self._first |
132 | |
184 | 133 def closure(self, itemset): |
134 """ Expand itemset by using epsilon moves """ | |
135 worklist = list(itemset) | |
136 def addIt(itm): | |
137 if not itm in itemset: | |
138 itemset.add(itm) | |
139 worklist.append(itm) | |
186 | 140 def first2(itm): |
184 | 141 # When using the first sets, create a copy: |
186 | 142 f = set(self.first[itm.NextNext]) |
184 | 143 if EPS in f: |
144 f.discard(EPS) | |
186 | 145 f.add(itm.look_ahead) |
184 | 146 return f |
319 | 147 # Start of algorithm: |
184 | 148 while worklist: |
149 item = worklist.pop(0) | |
150 if not item.IsShift: | |
151 continue | |
152 if not (item.Next in self.nonterminals): | |
153 continue | |
154 C = item.Next | |
155 for add_p in self.productionsForName(C): | |
186 | 156 for b in first2(item): |
184 | 157 addIt(Item(add_p, 0, b)) |
158 return frozenset(itemset) | |
159 | |
160 def initialItemSet(self): | |
161 """ Calculates the initial item set """ | |
162 iis = set() | |
163 for p in self.productionsForName(self.start_symbol): | |
164 iis.add(Item(p, 0, EOF)) | |
165 return self.closure(iis) | |
166 | |
167 def nextItemSet(self, itemset, symbol): | |
318 | 168 """ |
319 | 169 Determines the next itemset for the current set and a symbol |
184 | 170 This is the goto procedure |
171 """ | |
172 next_set = set() | |
173 for item in itemset: | |
174 if item.can_shift_over(symbol): | |
175 next_set.add(item.shifted()) | |
176 return self.closure(next_set) | |
318 | 177 |
184 | 178 def genCanonicalSet(self, iis): |
185 | 179 states = [] |
184 | 180 worklist = [] |
185 | 181 transitions = {} |
184 | 182 def addSt(s): |
183 if not (s in states): | |
184 worklist.append(s) | |
185 | 185 states.append(s) |
184 | 186 addSt(iis) |
187 while len(worklist) > 0: | |
188 itemset = worklist.pop(0) | |
189 for symbol in self.Symbols: | |
190 nis = self.nextItemSet(itemset, symbol) | |
191 if not nis: | |
192 continue | |
193 addSt(nis) | |
185 | 194 transitions[(states.index(itemset), symbol)] = states.index(nis) |
195 return states, transitions | |
318 | 196 |
192 | 197 def checkSymbols(self): |
198 """ Checks no symbols are undefined """ | |
199 for production in self.productions: | |
200 for symbol in production.symbols: | |
319 | 201 if symbol not in self.Symbols: |
192 | 202 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) |
203 | |
340 | 204 def generate_parser(self): |
205 """ Generates a parser from the grammar """ | |
206 action_table, goto_table = self.generate_tables() | |
218 | 207 p = LRParser(action_table, goto_table, self.start_symbol) |
208 p.grammar = self | |
209 return p | |
210 | |
340 | 211 def generate_tables(self): |
212 """ Generate parsing tables """ | |
318 | 213 if not self.start_symbol: |
214 self.start_symbol = self.productions[0].name | |
192 | 215 self.checkSymbols() |
184 | 216 action_table = {} |
185 | 217 goto_table = {} |
184 | 218 iis = self.initialItemSet() |
219 | |
220 # First generate all item sets by using the nextItemset function: | |
185 | 221 states, transitions = self.genCanonicalSet(iis) |
194 | 222 |
185 | 223 def setAction(state, t, action): |
318 | 224 assert isinstance(action, Action) |
185 | 225 key = (state, t) |
218 | 226 assert type(state) is int |
227 assert type(t) is str | |
185 | 228 if key in action_table: |
229 action2 = action_table[key] | |
230 if action != action2: | |
318 | 231 if (type(action2) is Reduce) and (type(action) is Shift): |
194 | 232 # Automatically resolve and do the shift action! |
233 # Simple, but almost always what you want!! | |
234 action_table[key] = action | |
318 | 235 elif isinstance(action2, Shift) and isinstance(action, Reduce): |
236 pass | |
194 | 237 else: |
318 | 238 a1 = str(action) |
239 a2 = str(action2) | |
240 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) | |
185 | 241 else: |
242 action_table[key] = action | |
184 | 243 |
244 # Fill action table: | |
245 for state in states: | |
185 | 246 # Detect conflicts: |
184 | 247 for item in state: |
248 if item.IsShift and item.Next in self.terminals: | |
185 | 249 # Rule 1, a shift item: |
250 nextstate = transitions[(states.index(state), item.Next)] | |
318 | 251 setAction(states.index(state), item.Next, Shift(nextstate)) |
185 | 252 if item.IsReduce: |
253 if item.production.name == self.start_symbol and item.look_ahead == EOF: | |
254 # Rule 3: accept: | |
318 | 255 act = Accept(self.productions.index(item.production)) |
184 | 256 else: |
185 | 257 # Rule 2, reduce item: |
318 | 258 act = Reduce(self.productions.index(item.production)) |
259 setAction(states.index(state), item.look_ahead, act) | |
185 | 260 for nt in self.nonterminals: |
261 key = (states.index(state), nt) | |
262 if key in transitions: | |
263 goto_table[key] = transitions[key] | |
218 | 264 return action_table, goto_table |
185 | 265 |
178 | 266 |
267 class Production: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
268 """ Production rule for a grammar """ |
341 | 269 def __init__(self, name, symbols, f): |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
270 self.name = name |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
271 self.symbols = symbols |
194 | 272 self.f = f |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
273 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
274 def __repr__(self): |
318 | 275 action = ' ' + str(self.f) if self.f else '' |
276 return '{0} -> {1}'.format(self.name, self.symbols) + action | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
277 |
179 | 278 |
178 | 279 class Item: |
318 | 280 """ |
281 Represents a partially parsed item | |
186 | 282 It has a production it is looking for, a position |
283 in this production called the 'dot' and a look ahead | |
284 symbol that must follow this item. | |
285 """ | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
286 def __init__(self, production, dotpos, look_ahead): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
287 self.production = production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
288 self.dotpos = dotpos |
184 | 289 assert self.dotpos <= len(self.production.symbols) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
290 self.look_ahead = look_ahead |
346 | 291 self._is_shift = self.dotpos < len(self.production.symbols) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
292 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
293 def getdata(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
294 """ Gets the members as a tuple """ |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
295 return (self.production, self.dotpos, self.look_ahead) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
296 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
297 def __eq__(self, other): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
298 if type(other) is type(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
299 return self.getdata() == other.getdata() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
300 return False |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
301 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
302 def __hash__(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
303 return self.getdata().__hash__() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
304 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
305 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
306 def IsReduce(self): |
186 | 307 """ Check if this item has the dot at the end """ |
346 | 308 return not self._is_shift |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
309 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
310 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
311 def IsShift(self): |
186 | 312 """ Check if this item is a shift item, i.e. the dot can proceed """ |
346 | 313 return self._is_shift |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
314 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
315 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
316 def Next(self): |
186 | 317 """ Returns the symbol after the dot """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
318 return self.production.symbols[self.dotpos] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
319 |
184 | 320 def can_shift_over(self, symbol): |
321 """ Determines if this item can shift over the given symbol """ | |
322 return self.IsShift and self.Next == symbol | |
323 | |
324 def shifted(self): | |
325 """ Creates a new item that is shifted one position """ | |
326 return Item(self.production, self.dotpos + 1, self.look_ahead) | |
327 | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
328 @property |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
329 def NextNext(self): |
186 | 330 """ Gets the symbol after the next symbol, or EPS if at the end """ |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
331 if self.dotpos + 1 >= len(self.production.symbols): |
184 | 332 return EPS |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
333 else: |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
334 return self.production.symbols[self.dotpos + 1] |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
335 |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
336 def __repr__(self): |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
337 prod = self.production |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
338 predot = ' '.join(prod.symbols[0:self.dotpos]) |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
339 postdot = ' '.join(prod.symbols[self.dotpos:]) |
186 | 340 name = prod.name |
341 args = (name, predot, postdot, self.look_ahead) | |
185 | 342 return '[{0} -> {1} . {2} -> {3}]'.format(*args) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
343 |
179 | 344 |
345 class LRParser: | |
318 | 346 """ LR parser automata """ |
195 | 347 def __init__(self, action_table, goto_table, start_symbol): |
184 | 348 self.action_table = action_table |
185 | 349 self.goto_table = goto_table |
195 | 350 self.start_symbol = start_symbol |
184 | 351 |
318 | 352 def parse(self, lexer): |
191 | 353 """ Parse an iterable with tokens """ |
318 | 354 assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer)) |
185 | 355 stack = [0] |
195 | 356 r_data_stack = [] |
318 | 357 look_ahead = lexer.next_token() |
191 | 358 assert type(look_ahead) is Token |
195 | 359 # TODO: exit on this condition: |
360 while stack != [0, self.start_symbol, 2222]: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
361 state = stack[-1] # top of stack |
191 | 362 key = (state, look_ahead.typ) |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
363 if not key in self.action_table: |
194 | 364 raise ParserException('Error parsing at character {0}'.format(look_ahead)) |
318 | 365 action = self.action_table[key] |
366 if type(action) is Reduce: | |
194 | 367 f_args = [] |
318 | 368 prod = self.grammar.productions[action.rule] |
369 for s in prod.symbols: | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
370 stack.pop() |
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
371 stack.pop() |
195 | 372 f_args.append(r_data_stack.pop()) |
373 f_args.reverse() | |
374 r_data = None | |
318 | 375 if prod.f: |
376 r_data = prod.f(*f_args) | |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
377 state = stack[-1] |
318 | 378 stack.append(prod.name) |
379 stack.append(self.goto_table[(state, prod.name)]) | |
195 | 380 r_data_stack.append(r_data) |
318 | 381 elif type(action) is Shift: |
191 | 382 stack.append(look_ahead.typ) |
318 | 383 stack.append(action.to_state) |
384 r_data_stack.append(look_ahead) | |
385 look_ahead = lexer.next_token() | |
191 | 386 assert type(look_ahead) is Token |
318 | 387 elif type(action) is Accept: |
195 | 388 # Pop last rule data off the stack: |
389 f_args = [] | |
318 | 390 param = self.grammar.productions[action.rule] |
195 | 391 for s in param.symbols: |
392 stack.pop() | |
393 stack.pop() | |
394 f_args.append(r_data_stack.pop()) | |
395 f_args.reverse() | |
396 if param.f: | |
318 | 397 ret_val = param.f(*f_args) |
398 else: | |
399 ret_val = None | |
195 | 400 # Break out! |
181
216da5e46efc
Changed indent to 4 spaces to comply to convention
Windel Bouwman
parents:
180
diff
changeset
|
401 break |
195 | 402 # At exit, the stack must be 1 long |
403 # TODO: fix that this holds: | |
319 | 404 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) |
318 | 405 return ret_val |