Mercurial > lcfOS
view python/pyyacc.py @ 257:703321743e8a
Fixed readme
author | Windel Bouwman |
---|---|
date | Sun, 04 Aug 2013 18:34:06 +0200 |
parents | 6259856841a0 |
children | e84047f29c78 |
line wrap: on
line source
""" Parser generator script """ import hashlib from ppci import Token EPS = 'EPS' EOF = 'EOF' SHIFT = 1 REDUCE = 2 ACCEPT = 3 class ParserGenerationException(Exception): """ Raised when something goes wrong during parser generation """ pass class ParserException(Exception): """ Raised during a failure in the parsing process """ pass class Grammar: """ Defines a grammar of a language """ def __init__(self, terminals): self.terminals = terminals self.nonterminals = [] self.productions = [] self._first = None # Cached first set def add_production(self, name, symbols, f=None): """ Add a production rule to the grammar """ production = Production(name, symbols, f) self.productions.append(production) if name in self.terminals: raise ParserGenerationException("Cannot redefine terminal {0}".format(name)) if not name in self.nonterminals: self.nonterminals.append(name) self._first = None # Invalidate cached version def productionsForName(self, name): """ Retrieve all productions for a non terminal """ return [p for p in self.productions if p.name == name] @property def Symbols(self): """ Get all the symbols defined by this grammar """ return self.nonterminals + self.terminals @property def first(self): """ The first set is a mapping from a grammar symbol to a set of set of all terminal symbols that can be the first terminal when looking for the grammar symbol """ if not self._first: self._first = self.calcFirstSets() return self._first def calcFirstSets(self): """ Calculate first sets for each grammar symbol This is a dictionary which maps each grammar symbol to a set of terminals that can be encountered first when looking for the symbol. """ first = {} for t in self.terminals + [EOF, EPS]: first[t] = set([t]) for nt in self.nonterminals: first[nt] = set() epsset = {EPS} while True: some_change = False for p in self.productions: rhs = set() for beta in p.symbols: rhs = rhs | (first[beta] - epsset) if not EPS in first[beta]: break else: if EPS in first[beta]: rhs.add(EPS) if rhs - first[p.name]: first[p.name] |= rhs some_change = True if not some_change: break return first def closure(self, itemset): """ Expand itemset by using epsilon moves """ worklist = list(itemset) def addIt(itm): if not itm in itemset: itemset.add(itm) worklist.append(itm) def first2(itm): # When using the first sets, create a copy: f = set(self.first[itm.NextNext]) if EPS in f: f.discard(EPS) f.add(itm.look_ahead) return f # Start of algorithm: while worklist: item = worklist.pop(0) if not item.IsShift: continue if not (item.Next in self.nonterminals): continue C = item.Next for add_p in self.productionsForName(C): for b in first2(item): addIt(Item(add_p, 0, b)) return frozenset(itemset) def initialItemSet(self): """ Calculates the initial item set """ iis = set() for p in self.productionsForName(self.start_symbol): iis.add(Item(p, 0, EOF)) return self.closure(iis) def nextItemSet(self, itemset, symbol): """ Determines the next itemset for the current set and a symbol This is the goto procedure """ next_set = set() for item in itemset: if item.can_shift_over(symbol): next_set.add(item.shifted()) return self.closure(next_set) def genCanonicalSet(self, iis): states = [] worklist = [] transitions = {} def addSt(s): if not (s in states): worklist.append(s) states.append(s) addSt(iis) while len(worklist) > 0: itemset = worklist.pop(0) for symbol in self.Symbols: nis = self.nextItemSet(itemset, symbol) if not nis: continue addSt(nis) transitions[(states.index(itemset), symbol)] = states.index(nis) return states, transitions def checkSymbols(self): """ Checks no symbols are undefined """ for production in self.productions: for symbol in production.symbols: if symbol not in self.Symbols + [EPS]: raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) def getSignature(self): m = hashlib.md5() m.update((str(self.productions) + str(self.start_symbol)).encode('ascii')) signature = m.hexdigest() def genParser(self): """ Generates a parser from the grammar (using a caching algorithm) """ signature = self.getSignature() action_table, goto_table = self.doGenerate() p = LRParser(action_table, goto_table, self.start_symbol) p.grammar = self return p def doGenerate(self): self.checkSymbols() action_table = {} goto_table = {} iis = self.initialItemSet() # First generate all item sets by using the nextItemset function: states, transitions = self.genCanonicalSet(iis) def action_str(act): a, p = act if a is SHIFT: return 'Shift {0}'.format(0) elif a is REDUCE: return 'Reduce {0}'.format(p) return 'Other' def setAction(state, t, action): key = (state, t) assert type(state) is int assert type(t) is str if key in action_table: action2 = action_table[key] if action != action2: if (action2[0] == REDUCE) and (action[0] == SHIFT): # Automatically resolve and do the shift action! # Simple, but almost always what you want!! action_table[key] = action else: if (action2[0] == SHIFT) and (action[0] == REDUCE): pass else: a1 = action_str(action) a2 = action_str(action2) raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) else: action_table[key] = action # Fill action table: for state in states: # Detect conflicts: for item in state: if item.IsShift and item.Next in self.terminals: # Rule 1, a shift item: nextstate = transitions[(states.index(state), item.Next)] setAction(states.index(state), item.Next, (SHIFT, nextstate)) if item.IsReduce: if item.production.name == self.start_symbol and item.look_ahead == EOF: # Rule 3: accept: setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production))) else: # Rule 2, reduce item: setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production))) for nt in self.nonterminals: key = (states.index(state), nt) if key in transitions: goto_table[key] = transitions[key] return action_table, goto_table class Production: """ Production rule for a grammar """ def __init__(self, name, symbols, f=None): self.name = name self.symbols = symbols self.f = f def __repr__(self): return '{0} -> {1}'.format(self.name, self.symbols) class Item: """ Represents a partially parsed item It has a production it is looking for, a position in this production called the 'dot' and a look ahead symbol that must follow this item. """ def __init__(self, production, dotpos, look_ahead): self.production = production self.dotpos = dotpos assert self.dotpos <= len(self.production.symbols) self.look_ahead = look_ahead def getdata(self): """ Gets the members as a tuple """ return (self.production, self.dotpos, self.look_ahead) def __eq__(self, other): if type(other) is type(self): return self.getdata() == other.getdata() return False def __hash__(self): return self.getdata().__hash__() @property def IsReduce(self): """ Check if this item has the dot at the end """ return self.dotpos == len(self.production.symbols) @property def IsShift(self): """ Check if this item is a shift item, i.e. the dot can proceed """ return not self.IsReduce @property def Next(self): """ Returns the symbol after the dot """ return self.production.symbols[self.dotpos] def can_shift_over(self, symbol): """ Determines if this item can shift over the given symbol """ return self.IsShift and self.Next == symbol def shifted(self): """ Creates a new item that is shifted one position """ return Item(self.production, self.dotpos + 1, self.look_ahead) @property def NextNext(self): """ Gets the symbol after the next symbol, or EPS if at the end """ if self.dotpos + 1 >= len(self.production.symbols): return EPS else: return self.production.symbols[self.dotpos + 1] def __repr__(self): prod = self.production predot = ' '.join(prod.symbols[0:self.dotpos]) postdot = ' '.join(prod.symbols[self.dotpos:]) name = prod.name args = (name, predot, postdot, self.look_ahead) return '[{0} -> {1} . {2} -> {3}]'.format(*args) class LRParser: """ LR parser """ def __init__(self, action_table, goto_table, start_symbol): self.action_table = action_table self.goto_table = goto_table self.start_symbol = start_symbol def parse(self, toks): """ Parse an iterable with tokens """ assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks)) stack = [0] r_data_stack = [] try: look_ahead = toks.__next__() except StopIteration: look_ahead = Token(EOF, EOF) assert type(look_ahead) is Token # TODO: exit on this condition: while stack != [0, self.start_symbol, 2222]: #print(stack) state = stack[-1] # top of stack key = (state, look_ahead.typ) if not key in self.action_table: raise ParserException('Error parsing at character {0}'.format(look_ahead)) action, param = self.action_table[key] if action == REDUCE: f_args = [] param = self.grammar.productions[param] for s in param.symbols: stack.pop() stack.pop() f_args.append(r_data_stack.pop()) f_args.reverse() r_data = None if param.f: r_data = param.f(*f_args) state = stack[-1] stack.append(param.name) stack.append(self.goto_table[(state, param.name)]) r_data_stack.append(r_data) elif action == SHIFT: stack.append(look_ahead.typ) stack.append(param) r_data_stack.append(look_ahead.val) try: look_ahead = toks.__next__() except StopIteration: look_ahead = Token(EOF, EOF) assert type(look_ahead) is Token elif action == ACCEPT: # Pop last rule data off the stack: f_args = [] param = self.grammar.productions[param] for s in param.symbols: stack.pop() stack.pop() f_args.append(r_data_stack.pop()) f_args.reverse() if param.f: param.f(*f_args) # Break out! break # At exit, the stack must be 1 long # TODO: fix that this holds: #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack)