Mercurial > lcfOS
view python/pyyacc.py @ 184:fe2b72381a83
Added testset for pyy
author | Windel Bouwman |
---|---|
date | Fri, 24 May 2013 16:13:23 +0200 |
parents | 216da5e46efc |
children | 51a6440d6398 |
line wrap: on
line source
""" Parser generator script """ EPS = 'EPS' EOF = 'EOF' SHIFT = 1 REDUCE = 2 ACCEPT = 3 class Grammar: """ Defines a grammar of a language """ def __init__(self, terminals): self.terminals = terminals self.nonterminals = [] self.productions = [] def add_production(self, name, symbols): """ Add a production rule to the grammar """ production = Production(name, symbols) self.productions.append(production) assert not name in self.terminals, "Cannot redefine terminal" if not name in self.nonterminals: self.nonterminals.append(name) def productionsForName(self, name): """ Retrieve all productions for a non terminal """ return [p for p in self.productions if p.name == name] @property def Symbols(self): """ Get all the symbols defined by this grammar """ return self.nonterminals + self.terminals def calcFirstSets(self): """ Calculate first sets for each grammar symbol This is a dictionary which maps each grammar symbol to a set of terminals that can be encountered first when looking for the symbol. """ first = {} for t in self.terminals + [EOF, EPS]: first[t] = set([t]) for nt in self.nonterminals: first[nt] = set() epsset = set([EPS]) while True: some_change = False for p in self.productions: rhs = set() for beta in p.symbols: rhs = rhs | (first[beta] - epsset) if not EPS in first[beta]: break else: if EPS in first[beta]: rhs.add(EPS) if rhs - first[p.name]: first[p.name] |= rhs some_change = True if not some_change: break return first def closure(self, itemset): """ Expand itemset by using epsilon moves """ worklist = list(itemset) def addIt(itm): if not itm in itemset: itemset.add(itm) worklist.append(itm) def first2(itm, la): # When using the first sets, create a copy: f = set(self.first[item.NextNext]) if EPS in f: f.discard(EPS) f.add(item.look_ahead) return f while worklist: item = worklist.pop(0) if not item.IsShift: continue if not (item.Next in self.nonterminals): continue C = item.Next for add_p in self.productionsForName(C): for b in first2(item, item.look_ahead): addIt(Item(add_p, 0, b)) return frozenset(itemset) def initialItemSet(self): """ Calculates the initial item set """ iis = set() for p in self.productionsForName(self.start_symbol): iis.add(Item(p, 0, EOF)) return self.closure(iis) def nextItemSet(self, itemset, symbol): """ Determines the next itemset for the current set and a symbol This is the goto procedure """ next_set = set() for item in itemset: if item.can_shift_over(symbol): next_set.add(item.shifted()) return self.closure(next_set) def genCanonicalSet(self, iis): states = set() worklist = [] goto_table = {} def addSt(s): if not (s in states): worklist.append(s) states.add(s) addSt(iis) while len(worklist) > 0: itemset = worklist.pop(0) for symbol in self.Symbols: nis = self.nextItemSet(itemset, symbol) if not nis: continue goto_table[(itemset, symbol)] = nis addSt(nis) return states, goto_table def genParser(self): """ Generates a parser from the grammar """ action_table = {} self.first = self.calcFirstSets() iis = self.initialItemSet() # First generate all item sets by using the nextItemset function: states, goto_table = self.genCanonicalSet(iis) # Number the states: number_states = {} for s, i in zip(states, range(len(states))): number_states[s] = i # Fill action table: for state in states: for item in state: if item.IsShift and item.Next in self.terminals: action_table[(state, item.Next)] = (SHIFT, 0) elif item.IsReduce: if item.look_ahead == EOF: action_table[(state, item.look_ahead)] = (ACCEPT, 0) else: action_table[(state, item.look_ahead)] = (REDUCE, item.production) else: pass p = LRParser(action_table) p.goto_table = goto_table p.s0 = iis return p class Production: """ Production rule for a grammar """ def __init__(self, name, symbols): self.name = name self.symbols = symbols def __repr__(self): return '{0} -> {1}'.format(self.name, self.symbols) class Item: def __init__(self, production, dotpos, look_ahead): self.production = production self.dotpos = dotpos assert self.dotpos <= len(self.production.symbols) self.look_ahead = look_ahead def getdata(self): """ Gets the members as a tuple """ return (self.production, self.dotpos, self.look_ahead) def __eq__(self, other): if type(other) is type(self): return self.getdata() == other.getdata() return False def __hash__(self): return self.getdata().__hash__() @property def IsReduce(self): return self.dotpos == len(self.production.symbols) @property def IsShift(self): return not self.IsReduce @property def Next(self): return self.production.symbols[self.dotpos] def can_shift_over(self, symbol): """ Determines if this item can shift over the given symbol """ return self.IsShift and self.Next == symbol def shifted(self): """ Creates a new item that is shifted one position """ return Item(self.production, self.dotpos + 1, self.look_ahead) @property def NextNext(self): if self.dotpos + 1 >= len(self.production.symbols): return EPS else: return self.production.symbols[self.dotpos + 1] def __repr__(self): prod = self.production predot = ' '.join(prod.symbols[0:self.dotpos]) postdot = ' '.join(prod.symbols[self.dotpos:]) nt = prod.name args = (nt, predot, postdot, self.look_ahead) return '[{0} -> {1} . {2}, {3}]'.format(*args) class LRParser: """ LR parser """ def __init__(self, action_table): self.action_table = action_table def parse(self, toks): stack = [EOF, self.s0] look_ahead = toks.pop(0) while True: state = stack[-1] # top of stack key = (state, look_ahead) if not key in self.action_table: raise Exception('Error parsing') action, param = self.action_table[(state, look_ahead)] if action == REDUCE: for s in param.symbols: stack.pop() stack.pop() state = stack[-1] stack.append(param.name) stack.append(self.goto_table[(state, param.name)]) elif action == SHIFT: stack.append(look_ahead) s, i = stack[-2:] stack.append(self.goto_table[(s, i)]) look_ahead = toks.pop(0) elif action == ACCEPT: break def testSimpleGrammar(): # 1. define a simple grammar: g = Grammar(['EOF', 'identifier', '(', ')', '+', '*']) g.add_production('input', ['expression']) g.add_production('expression', ['term']) g.add_production('expression', ['expression', '+', 'term']) g.add_production('term', ['factor']) g.add_production('term', ['term', '*', 'factor']) g.add_production('factor', ['(', 'expression', ')']) g.add_production('factor', ['identifier']) g.start_symbol = 'input' # 2. define input: tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF'] # 3. build parser: p = g.genParser() # 4. feed input: p.parse(tokens) if __name__ == '__main__': testSimpleGrammar()