Mercurial > lcfOS
view python/pyyacc.py @ 191:6b2bec5653f1
Added assembler testset
author | Windel Bouwman |
---|---|
date | Sun, 26 May 2013 15:28:07 +0200 |
parents | 46d62dadd61b |
children | 6cd6260789a1 |
line wrap: on
line source
""" Parser generator script """ from ppci import Token EPS = 'EPS' EOF = 'EOF' SHIFT = 1 REDUCE = 2 ACCEPT = 3 class ParserGenerationException(Exception): """ Raised when something goes wrong during parser generation """ pass class ParserException(Exception): """ Raised during a failure in the parsing process """ pass class Grammar: """ Defines a grammar of a language """ def __init__(self, terminals): self.terminals = terminals self.nonterminals = [] self.productions = [] self._first = None # Cached first set def add_production(self, name, symbols): """ Add a production rule to the grammar """ production = Production(name, symbols) self.productions.append(production) assert not name in self.terminals, "Cannot redefine terminal" if not name in self.nonterminals: self.nonterminals.append(name) def productionsForName(self, name): """ Retrieve all productions for a non terminal """ return [p for p in self.productions if p.name == name] @property def Symbols(self): """ Get all the symbols defined by this grammar """ return self.nonterminals + self.terminals @property def first(self): """ The first set is a mapping from a grammar symbol to a set of set of all terminal symbols that can be the first terminal when looking for the grammar symbol """ if not self._first: self._first = self.calcFirstSets() return self._first def calcFirstSets(self): """ Calculate first sets for each grammar symbol This is a dictionary which maps each grammar symbol to a set of terminals that can be encountered first when looking for the symbol. """ first = {} for t in self.terminals + [EOF, EPS]: first[t] = set([t]) for nt in self.nonterminals: first[nt] = set() epsset = {EPS} while True: some_change = False for p in self.productions: rhs = set() for beta in p.symbols: rhs = rhs | (first[beta] - epsset) if not EPS in first[beta]: break else: if EPS in first[beta]: rhs.add(EPS) if rhs - first[p.name]: first[p.name] |= rhs some_change = True if not some_change: break return first def closure(self, itemset): """ Expand itemset by using epsilon moves """ worklist = list(itemset) def addIt(itm): if not itm in itemset: itemset.add(itm) worklist.append(itm) def first2(itm): # When using the first sets, create a copy: f = set(self.first[itm.NextNext]) if EPS in f: f.discard(EPS) f.add(itm.look_ahead) return f # Start of algorithm: while worklist: item = worklist.pop(0) if not item.IsShift: continue if not (item.Next in self.nonterminals): continue C = item.Next for add_p in self.productionsForName(C): for b in first2(item): addIt(Item(add_p, 0, b)) return frozenset(itemset) def initialItemSet(self): """ Calculates the initial item set """ iis = set() for p in self.productionsForName(self.start_symbol): iis.add(Item(p, 0, EOF)) return self.closure(iis) def nextItemSet(self, itemset, symbol): """ Determines the next itemset for the current set and a symbol This is the goto procedure """ next_set = set() for item in itemset: if item.can_shift_over(symbol): next_set.add(item.shifted()) return self.closure(next_set) def genCanonicalSet(self, iis): states = [] worklist = [] transitions = {} def addSt(s): if not (s in states): worklist.append(s) states.append(s) addSt(iis) while len(worklist) > 0: itemset = worklist.pop(0) for symbol in self.Symbols: nis = self.nextItemSet(itemset, symbol) if not nis: continue addSt(nis) transitions[(states.index(itemset), symbol)] = states.index(nis) return states, transitions def genParser(self): """ Generates a parser from the grammar """ action_table = {} goto_table = {} iis = self.initialItemSet() # First generate all item sets by using the nextItemset function: states, transitions = self.genCanonicalSet(iis) def setAction(state, t, action): key = (state, t) if key in action_table: action2 = action_table[key] if action != action2: raise ParserGenerationException('LR construction conflict') else: action_table[key] = action # Fill action table: for state in states: # Detect conflicts: for item in state: if item.IsShift and item.Next in self.terminals: # Rule 1, a shift item: nextstate = transitions[(states.index(state), item.Next)] setAction(states.index(state), item.Next, (SHIFT, nextstate)) if item.IsReduce: if item.production.name == self.start_symbol and item.look_ahead == EOF: # Rule 3: accept: setAction(states.index(state), item.look_ahead, (ACCEPT, None)) else: # Rule 2, reduce item: setAction(states.index(state), item.look_ahead, (REDUCE, item.production)) for nt in self.nonterminals: key = (states.index(state), nt) if key in transitions: goto_table[key] = transitions[key] return LRParser(action_table, goto_table) class Production: """ Production rule for a grammar """ def __init__(self, name, symbols): self.name = name self.symbols = symbols def __repr__(self): return '{0} -> {1}'.format(self.name, self.symbols) class Item: """ Represents a partially parsed item It has a production it is looking for, a position in this production called the 'dot' and a look ahead symbol that must follow this item. """ def __init__(self, production, dotpos, look_ahead): self.production = production self.dotpos = dotpos assert self.dotpos <= len(self.production.symbols) self.look_ahead = look_ahead def getdata(self): """ Gets the members as a tuple """ return (self.production, self.dotpos, self.look_ahead) def __eq__(self, other): if type(other) is type(self): return self.getdata() == other.getdata() return False def __hash__(self): return self.getdata().__hash__() @property def IsReduce(self): """ Check if this item has the dot at the end """ return self.dotpos == len(self.production.symbols) @property def IsShift(self): """ Check if this item is a shift item, i.e. the dot can proceed """ return not self.IsReduce @property def Next(self): """ Returns the symbol after the dot """ return self.production.symbols[self.dotpos] def can_shift_over(self, symbol): """ Determines if this item can shift over the given symbol """ return self.IsShift and self.Next == symbol def shifted(self): """ Creates a new item that is shifted one position """ return Item(self.production, self.dotpos + 1, self.look_ahead) @property def NextNext(self): """ Gets the symbol after the next symbol, or EPS if at the end """ if self.dotpos + 1 >= len(self.production.symbols): return EPS else: return self.production.symbols[self.dotpos + 1] def __repr__(self): prod = self.production predot = ' '.join(prod.symbols[0:self.dotpos]) postdot = ' '.join(prod.symbols[self.dotpos:]) name = prod.name args = (name, predot, postdot, self.look_ahead) return '[{0} -> {1} . {2} -> {3}]'.format(*args) class LRParser: """ LR parser """ def __init__(self, action_table, goto_table): self.action_table = action_table self.goto_table = goto_table def parse(self, toks): """ Parse an iterable with tokens """ assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks)) stack = [0] look_ahead = toks.__next__() assert type(look_ahead) is Token while True: state = stack[-1] # top of stack key = (state, look_ahead.typ) if not key in self.action_table: raise Exception('Error parsing at character {0}'.format(look_ahead)) action, param = self.action_table[key] if action == REDUCE: for s in param.symbols: stack.pop() stack.pop() state = stack[-1] stack.append(param.name) stack.append(self.goto_table[(state, param.name)]) elif action == SHIFT: stack.append(look_ahead.typ) stack.append(param) try: look_ahead = toks.__next__() except StopIteration: look_ahead = Token(EOF, EOF, 0) assert type(look_ahead) is Token elif action == ACCEPT: break def testSimpleGrammar(): # 1. define a simple grammar: g = Grammar(['EOF', 'identifier', '(', ')', '+', '*']) g.add_production('input', ['expression']) g.add_production('expression', ['term']) g.add_production('expression', ['expression', '+', 'term']) g.add_production('term', ['factor']) g.add_production('term', ['term', '*', 'factor']) g.add_production('factor', ['(', 'expression', ')']) g.add_production('factor', ['identifier']) g.start_symbol = 'input' # 2. define input: tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF'] # 3. build parser: p = g.genParser() # 4. feed input: p.parse(tokens) if __name__ == '__main__': testSimpleGrammar()