comparison python/pyyacc.py @ 318:e84047f29c78

Add burg and yacc initial attempts
author Windel Bouwman
date Tue, 31 Dec 2013 12:38:15 +0100
parents 6259856841a0
children 8d07a4254f04
comparison
equal deleted inserted replaced
317:e30a77ae359b 318:e84047f29c78
1 """ 1 """
2 Parser generator script 2 Parser generator script
3 """ 3 """
4 4
5 import hashlib
6 from ppci import Token 5 from ppci import Token
7 6
8 EPS = 'EPS' 7 EPS = 'EPS'
9 EOF = 'EOF' 8 EOF = 'EOF'
10 SHIFT = 1 9
11 REDUCE = 2
12 ACCEPT = 3
13 10
14 class ParserGenerationException(Exception): 11 class ParserGenerationException(Exception):
15 """ Raised when something goes wrong during parser generation """ 12 """ Raised when something goes wrong during parser generation """
16 pass 13 pass
17 14
18 15
19 class ParserException(Exception): 16 class ParserException(Exception):
20 """ Raised during a failure in the parsing process """ 17 """ Raised during a failure in the parsing process """
21 pass 18 pass
22 19
20
21 class Action:
22 def __repr__(self):
23 return 'Action'
24
25 def __eq__(self, other):
26 return str(self) == str(other)
27
28
29 class Shift(Action):
30 def __init__(self, to_state):
31 self.to_state = to_state
32
33 def __repr__(self):
34 return 'Shift({})'.format(self.to_state)
35
36
37 class Reduce(Action):
38 def __init__(self, rule):
39 self.rule = rule
40
41 def __repr__(self):
42 return 'Reduce({})'.format(self.rule)
43
44
45 class Accept(Reduce):
46 def __repr__(self):
47 return 'Accept({})'.format(self.rule)
48
49
50 def print_grammar(g):
51 """ Pretty print a grammar """
52 print(g)
53 for production in g.productions:
54 print(production)
23 55
24 class Grammar: 56 class Grammar:
25 """ Defines a grammar of a language """ 57 """ Defines a grammar of a language """
26 def __init__(self, terminals): 58 def __init__(self, terminals):
27 self.terminals = terminals 59 self.terminals = terminals
28 self.nonterminals = [] 60 self.nonterminals = []
29 self.productions = [] 61 self.productions = []
30 self._first = None # Cached first set 62 self._first = None # Cached first set
63 self.start_symbol = None
64
65 def __repr__(self):
66 return 'Grammar with {} rules'.format(len(self.productions))
31 67
32 def add_production(self, name, symbols, f=None): 68 def add_production(self, name, symbols, f=None):
33 """ Add a production rule to the grammar """ 69 """ Add a production rule to the grammar """
34 production = Production(name, symbols, f) 70 production = Production(name, symbols, f)
35 self.productions.append(production) 71 self.productions.append(production)
48 """ Get all the symbols defined by this grammar """ 84 """ Get all the symbols defined by this grammar """
49 return self.nonterminals + self.terminals 85 return self.nonterminals + self.terminals
50 86
51 @property 87 @property
52 def first(self): 88 def first(self):
53 """ 89 """
54 The first set is a mapping from a grammar symbol to a set of 90 The first set is a mapping from a grammar symbol to a set of
55 set of all terminal symbols that can be the first terminal when 91 set of all terminal symbols that can be the first terminal when
56 looking for the grammar symbol 92 looking for the grammar symbol
57 """ 93 """
58 if not self._first: 94 if not self._first:
59 self._first = self.calcFirstSets() 95 self._first = self.calcFirstSets()
60 return self._first 96 return self._first
61 97
123 for p in self.productionsForName(self.start_symbol): 159 for p in self.productionsForName(self.start_symbol):
124 iis.add(Item(p, 0, EOF)) 160 iis.add(Item(p, 0, EOF))
125 return self.closure(iis) 161 return self.closure(iis)
126 162
127 def nextItemSet(self, itemset, symbol): 163 def nextItemSet(self, itemset, symbol):
128 """ 164 """
129 Determines the next itemset for the current set and a symbol 165 Determines the next itemset for the current set and a symbol
130 This is the goto procedure 166 This is the goto procedure
131 """ 167 """
132 next_set = set() 168 next_set = set()
133 for item in itemset: 169 for item in itemset:
134 if item.can_shift_over(symbol): 170 if item.can_shift_over(symbol):
135 next_set.add(item.shifted()) 171 next_set.add(item.shifted())
136 return self.closure(next_set) 172 return self.closure(next_set)
137 173
138 def genCanonicalSet(self, iis): 174 def genCanonicalSet(self, iis):
139 states = [] 175 states = []
140 worklist = [] 176 worklist = []
141 transitions = {} 177 transitions = {}
142 def addSt(s): 178 def addSt(s):
151 if not nis: 187 if not nis:
152 continue 188 continue
153 addSt(nis) 189 addSt(nis)
154 transitions[(states.index(itemset), symbol)] = states.index(nis) 190 transitions[(states.index(itemset), symbol)] = states.index(nis)
155 return states, transitions 191 return states, transitions
156 192
157 def checkSymbols(self): 193 def checkSymbols(self):
158 """ Checks no symbols are undefined """ 194 """ Checks no symbols are undefined """
159 for production in self.productions: 195 for production in self.productions:
160 for symbol in production.symbols: 196 for symbol in production.symbols:
161 if symbol not in self.Symbols + [EPS]: 197 if symbol not in self.Symbols + [EPS]:
162 raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) 198 raise ParserGenerationException('Symbol {0} undefined'.format(symbol))
163 199
164 def getSignature(self):
165 m = hashlib.md5()
166 m.update((str(self.productions) + str(self.start_symbol)).encode('ascii'))
167 signature = m.hexdigest()
168
169 def genParser(self): 200 def genParser(self):
170 """ Generates a parser from the grammar (using a caching algorithm) """ 201 """ Generates a parser from the grammar (using a caching algorithm) """
171 signature = self.getSignature()
172 action_table, goto_table = self.doGenerate() 202 action_table, goto_table = self.doGenerate()
173 p = LRParser(action_table, goto_table, self.start_symbol) 203 p = LRParser(action_table, goto_table, self.start_symbol)
174 p.grammar = self 204 p.grammar = self
175 return p 205 return p
176 206
177 def doGenerate(self): 207 def doGenerate(self):
208 if not self.start_symbol:
209 self.start_symbol = self.productions[0].name
178 self.checkSymbols() 210 self.checkSymbols()
179 action_table = {} 211 action_table = {}
180 goto_table = {} 212 goto_table = {}
181 iis = self.initialItemSet() 213 iis = self.initialItemSet()
182 214
183 # First generate all item sets by using the nextItemset function: 215 # First generate all item sets by using the nextItemset function:
184 states, transitions = self.genCanonicalSet(iis) 216 states, transitions = self.genCanonicalSet(iis)
185 217
186 def action_str(act):
187 a, p = act
188 if a is SHIFT:
189 return 'Shift {0}'.format(0)
190 elif a is REDUCE:
191 return 'Reduce {0}'.format(p)
192 return 'Other'
193
194 def setAction(state, t, action): 218 def setAction(state, t, action):
219 assert isinstance(action, Action)
195 key = (state, t) 220 key = (state, t)
196 assert type(state) is int 221 assert type(state) is int
197 assert type(t) is str 222 assert type(t) is str
198 if key in action_table: 223 if key in action_table:
199 action2 = action_table[key] 224 action2 = action_table[key]
200 if action != action2: 225 if action != action2:
201 if (action2[0] == REDUCE) and (action[0] == SHIFT): 226 if (type(action2) is Reduce) and (type(action) is Shift):
202 # Automatically resolve and do the shift action! 227 # Automatically resolve and do the shift action!
203 # Simple, but almost always what you want!! 228 # Simple, but almost always what you want!!
204 action_table[key] = action 229 action_table[key] = action
230 elif isinstance(action2, Shift) and isinstance(action, Reduce):
231 pass
205 else: 232 else:
206 if (action2[0] == SHIFT) and (action[0] == REDUCE): 233 a1 = str(action)
207 pass 234 a2 = str(action2)
208 else: 235 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2))
209 a1 = action_str(action)
210 a2 = action_str(action2)
211 raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2))
212 else: 236 else:
213 action_table[key] = action 237 action_table[key] = action
214 238
215 # Fill action table: 239 # Fill action table:
216 for state in states: 240 for state in states:
217 # Detect conflicts: 241 # Detect conflicts:
218 for item in state: 242 for item in state:
219 if item.IsShift and item.Next in self.terminals: 243 if item.IsShift and item.Next in self.terminals:
220 # Rule 1, a shift item: 244 # Rule 1, a shift item:
221 nextstate = transitions[(states.index(state), item.Next)] 245 nextstate = transitions[(states.index(state), item.Next)]
222 setAction(states.index(state), item.Next, (SHIFT, nextstate)) 246 setAction(states.index(state), item.Next, Shift(nextstate))
223 if item.IsReduce: 247 if item.IsReduce:
224 if item.production.name == self.start_symbol and item.look_ahead == EOF: 248 if item.production.name == self.start_symbol and item.look_ahead == EOF:
225 # Rule 3: accept: 249 # Rule 3: accept:
226 setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production))) 250 act = Accept(self.productions.index(item.production))
227 else: 251 else:
228 # Rule 2, reduce item: 252 # Rule 2, reduce item:
229 setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production))) 253 act = Reduce(self.productions.index(item.production))
254 setAction(states.index(state), item.look_ahead, act)
230 for nt in self.nonterminals: 255 for nt in self.nonterminals:
231 key = (states.index(state), nt) 256 key = (states.index(state), nt)
232 if key in transitions: 257 if key in transitions:
233 goto_table[key] = transitions[key] 258 goto_table[key] = transitions[key]
234 return action_table, goto_table 259 return action_table, goto_table
240 self.name = name 265 self.name = name
241 self.symbols = symbols 266 self.symbols = symbols
242 self.f = f 267 self.f = f
243 268
244 def __repr__(self): 269 def __repr__(self):
245 return '{0} -> {1}'.format(self.name, self.symbols) 270 action = ' ' + str(self.f) if self.f else ''
271 return '{0} -> {1}'.format(self.name, self.symbols) + action
246 272
247 273
248 class Item: 274 class Item:
249 """ 275 """
250 Represents a partially parsed item 276 Represents a partially parsed item
251 It has a production it is looking for, a position 277 It has a production it is looking for, a position
252 in this production called the 'dot' and a look ahead 278 in this production called the 'dot' and a look ahead
253 symbol that must follow this item. 279 symbol that must follow this item.
254 """ 280 """
255 def __init__(self, production, dotpos, look_ahead): 281 def __init__(self, production, dotpos, look_ahead):
309 args = (name, predot, postdot, self.look_ahead) 335 args = (name, predot, postdot, self.look_ahead)
310 return '[{0} -> {1} . {2} -> {3}]'.format(*args) 336 return '[{0} -> {1} . {2} -> {3}]'.format(*args)
311 337
312 338
313 class LRParser: 339 class LRParser:
314 """ LR parser """ 340 """ LR parser automata """
315 def __init__(self, action_table, goto_table, start_symbol): 341 def __init__(self, action_table, goto_table, start_symbol):
316 self.action_table = action_table 342 self.action_table = action_table
317 self.goto_table = goto_table 343 self.goto_table = goto_table
318 self.start_symbol = start_symbol 344 self.start_symbol = start_symbol
319 345
320 def parse(self, toks): 346 def parse(self, lexer):
321 """ Parse an iterable with tokens """ 347 """ Parse an iterable with tokens """
322 assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks)) 348 assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer))
323 stack = [0] 349 stack = [0]
324 r_data_stack = [] 350 r_data_stack = []
325 try: 351 look_ahead = lexer.next_token()
326 look_ahead = toks.__next__()
327 except StopIteration:
328 look_ahead = Token(EOF, EOF)
329 assert type(look_ahead) is Token 352 assert type(look_ahead) is Token
330 # TODO: exit on this condition: 353 # TODO: exit on this condition:
331 while stack != [0, self.start_symbol, 2222]: 354 while stack != [0, self.start_symbol, 2222]:
332 #print(stack)
333 state = stack[-1] # top of stack 355 state = stack[-1] # top of stack
334 key = (state, look_ahead.typ) 356 key = (state, look_ahead.typ)
335 if not key in self.action_table: 357 if not key in self.action_table:
336 raise ParserException('Error parsing at character {0}'.format(look_ahead)) 358 raise ParserException('Error parsing at character {0}'.format(look_ahead))
337 action, param = self.action_table[key] 359 action = self.action_table[key]
338 if action == REDUCE: 360 if type(action) is Reduce:
339 f_args = [] 361 f_args = []
340 param = self.grammar.productions[param] 362 prod = self.grammar.productions[action.rule]
341 for s in param.symbols: 363 for s in prod.symbols:
342 stack.pop() 364 stack.pop()
343 stack.pop() 365 stack.pop()
344 f_args.append(r_data_stack.pop()) 366 f_args.append(r_data_stack.pop())
345 f_args.reverse() 367 f_args.reverse()
346 r_data = None 368 r_data = None
347 if param.f: 369 if prod.f:
348 r_data = param.f(*f_args) 370 r_data = prod.f(*f_args)
349 state = stack[-1] 371 state = stack[-1]
350 stack.append(param.name) 372 stack.append(prod.name)
351 stack.append(self.goto_table[(state, param.name)]) 373 stack.append(self.goto_table[(state, prod.name)])
352 r_data_stack.append(r_data) 374 r_data_stack.append(r_data)
353 elif action == SHIFT: 375 elif type(action) is Shift:
354 stack.append(look_ahead.typ) 376 stack.append(look_ahead.typ)
355 stack.append(param) 377 stack.append(action.to_state)
356 r_data_stack.append(look_ahead.val) 378 r_data_stack.append(look_ahead)
357 try: 379 look_ahead = lexer.next_token()
358 look_ahead = toks.__next__()
359 except StopIteration:
360 look_ahead = Token(EOF, EOF)
361 assert type(look_ahead) is Token 380 assert type(look_ahead) is Token
362 elif action == ACCEPT: 381 elif type(action) is Accept:
363 # Pop last rule data off the stack: 382 # Pop last rule data off the stack:
364 f_args = [] 383 f_args = []
365 param = self.grammar.productions[param] 384 param = self.grammar.productions[action.rule]
366 for s in param.symbols: 385 for s in param.symbols:
367 stack.pop() 386 stack.pop()
368 stack.pop() 387 stack.pop()
369 f_args.append(r_data_stack.pop()) 388 f_args.append(r_data_stack.pop())
370 f_args.reverse() 389 f_args.reverse()
371 if param.f: 390 if param.f:
372 param.f(*f_args) 391 ret_val = param.f(*f_args)
392 else:
393 ret_val = None
373 # Break out! 394 # Break out!
374 break 395 break
375 # At exit, the stack must be 1 long 396 # At exit, the stack must be 1 long
376 # TODO: fix that this holds: 397 # TODO: fix that this holds:
377 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) 398 #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack)
378 399 return ret_val