Mercurial > lcfOS
diff python/pyyacc.py @ 181:216da5e46efc
Changed indent to 4 spaces to comply to convention
author | Windel Bouwman |
---|---|
date | Sat, 18 May 2013 17:45:26 +0200 |
parents | 25a0753da4cf |
children | fe2b72381a83 |
line wrap: on
line diff
--- a/python/pyyacc.py Thu May 09 17:35:19 2013 +0200 +++ b/python/pyyacc.py Sat May 18 17:45:26 2013 +0200 @@ -1,165 +1,232 @@ +""" + Parser generator script +""" + +EPS = 'EPS' + + class Grammar: - def __init__(self, terminals): - self.terminals = terminals - self.nonterminals = [] - self.productions = [] - def add_production(self, name, symbols): - p = Production(name, symbols) - self.productions.append(p) - if not name in self.nonterminals: - self.nonterminals.append(name) - def productionsForName(self, name): - return [p for p in self.productions if p.name == name] - @property - def Symbols(self): - return self.nonterminals + self.terminals - def calcFollow(self): - follow = {} - for nt in self.nonterminals: - follow[nt] = set() - while True: - change = False - # 1. - for p in self.productions: - pass - if not change: - break - return follow - def calcFirst(self): - first = {} - return first + """ Defines a grammar of a language """ + def __init__(self, terminals): + self.terminals = terminals + self.nonterminals = [] + self.productions = [] + + def add_production(self, name, symbols): + """ Add a production rule to the grammar """ + production = Production(name, symbols) + self.productions.append(production) + assert not name in self.terminals, "Cannot redefine terminal" + if not name in self.nonterminals: + self.nonterminals.append(name) + + def productionsForName(self, name): + """ Retrieve all productions for a non terminal """ + return [p for p in self.productions if p.name == name] + + @property + def Symbols(self): + """ Get all the symbols defined by this grammar """ + return self.nonterminals + self.terminals + + def calcFirstSets(self): + """ Calculate first sets for each grammar symbol """ + first = {} + for t in self.terminals + ['EOF', EPS]: + first[t] = set([t]) + for nt in self.nonterminals: + first[nt] = set() + epsset = set([EPS]) + while True: + some_change = False + for p in self.productions: + rhs = set() + for beta in p.symbols: + rhs = rhs | (first[beta] - epsset) + if not EPS in first[beta]: + break + else: + if EPS in first[beta]: + rhs.add(EPS) + if rhs - first[p.name]: + first[p.name] |= rhs + some_change = True + if not some_change: + break + return first + class Production: - def __init__(self, name, symbols): - self.name = name - self.symbols = symbols - def __repr__(self): - return '{0} -> {1}'.format(self.name, self.symbols) + """ Production rule for a grammar """ + def __init__(self, name, symbols): + self.name = name + self.symbols = symbols + + def __repr__(self): + return '{0} -> {1}'.format(self.name, self.symbols) + class Item: - def __init__(self, production, dotpos): - self.production = production - self.dotpos = dotpos - def __eq__(self, other): - if type(other) is type(self): - return (self.production, self.dotpos) == (other.production, other.dotpos) - return False - def __hash__(self): - return (self.production, self.dotpos).__hash__() - @property - def IsReduce(self): - return self.dotpos == len(self.production.symbols) - @property - def IsShift(self): - return not self.IsReduce - @property - def Next(self): - return self.production.symbols[self.dotpos] - def matches(self, symbol): - return self.Next == symbol - def __repr__(self): - return '{0} -> {1} _dot_ {2} {{}}'.format(self.production.name, ' '.join(self.production.symbols[0:self.dotpos]), ' '.join(self.production.symbols[self.dotpos:])) + def __init__(self, production, dotpos, look_ahead): + self.production = production + self.dotpos = dotpos + self.look_ahead = look_ahead + + def getdata(self): + """ Gets the members as a tuple """ + return (self.production, self.dotpos, self.look_ahead) + + def __eq__(self, other): + if type(other) is type(self): + return self.getdata() == other.getdata() + return False + + def __hash__(self): + return self.getdata().__hash__() + + @property + def IsReduce(self): + return self.dotpos == len(self.production.symbols) + + @property + def IsShift(self): + return not self.IsReduce + + @property + def Next(self): + return self.production.symbols[self.dotpos] + + @property + def NextNext(self): + if self.dotpos + 1 >= len(self.production.symbols): + return 'EPS' + else: + return self.production.symbols[self.dotpos + 1] + + def __repr__(self): + prod = self.production + predot = ' '.join(prod.symbols[0:self.dotpos]) + postdot = ' '.join(prod.symbols[self.dotpos:]) + nt = prod.name + args = (nt, predot, postdot, self.look_ahead) + return '[{0} -> {1} _dot_ {2},{3}]'.format(*args) + class LRgenerator: - def __init__(self, g): - self.grammar = g - def e_closure(self, S): - """ Expand itemset by using epsilon moves """ - something_changed = True - while something_changed: - something_changed = False - worklist = list(S) - for item in worklist: + """ Parser generator """ + def __init__(self, g): + self.grammar = g + + def closure(self, itemset): + """ Expand itemset by using epsilon moves """ + worklist = list(itemset) + while worklist: + item = worklist.pop(0) if item.IsShift and item.Next in self.grammar.nonterminals: - n = item.Next - for add_p in self.grammar.productionsForName(n): - i = Item(add_p, 0) - if not i in S: - S.add(i) - something_changed = True - return frozenset(S) - def initialItemSet(self): - nis = set() - for p in self.grammar.productionsForName(self.grammar.start_symbol): - nis.add(Item(p, 0)) - return self.e_closure(nis) - def nextItemSet(self, itemset, symbol): - nis = set() - for item in itemset: - if item.IsShift and item.Next == symbol: - nis.add(Item(item.production, item.dotpos + 1)) - return self.e_closure(nis) - def genParser(self): - states = set() - goto_table = {} - action_table = {} - iis = self.initialItemSet() - worklist = [iis] - states.add(iis) - while len(worklist) > 0: - itemset = worklist.pop(0) - has_goto = False - for symbol in self.grammar.Symbols: - nis = self.nextItemSet(itemset, symbol) - if nis: - goto_table[(itemset, symbol)] = nis - has_goto = True - if not nis in states: - worklist.append(nis) - states.add(nis) - if has_goto: - action_table[itemset] = 'shift', 's' - for item in itemset: - assert item.IsShift, item - else: - # Check for reduce-reduce conflicts: - assert len(itemset) == 1 - item = list(itemset)[0] - assert item.IsReduce - action_table[itemset] = 'reduce', item.production - for s, i in zip(states, range(len(states))): - print(i, ' || '.join(str(x) for x in s)) - p = LRParser() - p.goto_table = goto_table - p.action_table = action_table - p.s0 = iis - return p + for add_p in self.grammar.productionsForName(item.Next): + f = self.first[item.NextNext] + if 'EPS' in f: + f.discard('EPS') + f.add(item.look_ahead) + for b in f: + i = Item(add_p, 0, b) + if not i in itemset: + itemset.add(i) + worklist.append(i) + return frozenset(itemset) + + def initialItemSet(self): + """ Calculates the initial item set """ + iis = set() + for p in self.grammar.productionsForName(self.grammar.start_symbol): + iis.add(Item(p, 0, 'EOF')) + return self.closure(iis) + + def nextItemSet(self, itemset, symbol): + """ Determines the next itemset for the current set and a symbol """ + nis = set() + for item in itemset: + if item.IsShift and item.Next == symbol: + nis.add(Item(item.production, item.dotpos + 1, item.look_ahead)) + return self.closure(nis) + + def genParser(self): + states = set() + goto_table = {} + action_table = {} + self.first = self.grammar.calcFirstSets() + iis = self.initialItemSet() + states.add(iis) + + # First generate all item sets by using the nextItemset function: + worklist = [iis] + while len(worklist) > 0: + itemset = worklist.pop(0) + for symbol in self.grammar.Symbols: + nis = self.nextItemSet(itemset, symbol) + if nis: + goto_table[(itemset, symbol)] = nis + if not nis in states: + worklist.append(nis) + states.add(nis) + + # Number the states: + print('States:') + for s, i in zip(states, range(len(states))): + print(i, ', '.join(str(x) for x in s)) + p = LRParser() + p.goto_table = goto_table + p.action_table = action_table + p.s0 = iis + return p + class LRParser: - def parse(self, toks): - stack = [self.s0] - while stack[-1] != 'input': - action, p = self.action_table[stack[-1]] - print(action, p) - if action == 'shift': - stack.append(toks.pop(0)) - s, i = stack[-2:] - stack.append(self.goto_table[(s, i)]) - elif action == 'reduce': - for s in p.symbols: - stack.pop() - stack.pop() - stack.append(p.name) - s, i = stack[-2:] - if stack[-1] == 'input': - break - stack.append(self.goto_table[(s, i)]) + def parse(self, toks): + stack = ['EOF', self.s0] + look_ahead = toks.pop(0) + while True: + state = stack[-1] # top of stack + key = (state, look_ahead) + if not key in self.action_table: + raise Exception('Error parsing') + action, param = self.action_table[(state, look_ahead)] + if action == 'reduce': + for s in param.symbols: + stack.pop() + stack.pop() + state = stack[-1] + stack.append(param.name) + stack.append(self.goto_table[(state, param.name)]) + elif action == 'shift': + stack.append(look_ahead) + s, i = stack[-2:] + stack.append(self.goto_table[(s, i)]) + look_ahead = toks.pop(0) + elif action == 'accept': + break # Test it: -# 1. define a simple grammar: -g = Grammar(['EOF', 'identifier', '(', ')', '+', '*']) -g.add_production('input', ['expression', 'EOF']) -g.add_production('expression', ['term']) -g.add_production('expression', ['expression', '+', 'term']) -g.add_production('term', ['factor']) -#g.add_production('term', ['term', '*', 'factor']) -g.add_production('term', ['(', 'expression', ')']) -g.add_production('factor', ['identifier']) -g.start_symbol = 'input' -# 2. define input: -tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF'] -# 3. build parser: -p = LRgenerator(g).genParser() -# 4. feed input: -p.parse(tokens) +def test(): + # 1. define a simple grammar: + g = Grammar(['EOF', 'identifier', '(', ')', '+', '*']) + g.add_production('input', ['expression']) + g.add_production('expression', ['term']) + g.add_production('expression', ['expression', '+', 'term']) + g.add_production('term', ['factor']) + g.add_production('term', ['term', '*', 'factor']) + g.add_production('factor', ['(', 'expression', ')']) + g.add_production('factor', ['identifier']) + g.start_symbol = 'input' + # 2. define input: + tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF'] + # 3. build parser: + p = LRgenerator(g).genParser() + # 4. feed input: + p.parse(tokens) +print('Hallo') +if __name__ == '__main__': + print('Hallo') + test() +