Mercurial > lcfOS
diff python/pyyacc.py @ 318:e84047f29c78
Add burg and yacc initial attempts
author | Windel Bouwman |
---|---|
date | Tue, 31 Dec 2013 12:38:15 +0100 |
parents | 6259856841a0 |
children | 8d07a4254f04 |
line wrap: on
line diff
--- a/python/pyyacc.py Sun Dec 22 15:50:59 2013 +0100 +++ b/python/pyyacc.py Tue Dec 31 12:38:15 2013 +0100 @@ -2,14 +2,11 @@ Parser generator script """ -import hashlib from ppci import Token EPS = 'EPS' EOF = 'EOF' -SHIFT = 1 -REDUCE = 2 -ACCEPT = 3 + class ParserGenerationException(Exception): """ Raised when something goes wrong during parser generation """ @@ -21,6 +18,41 @@ pass +class Action: + def __repr__(self): + return 'Action' + + def __eq__(self, other): + return str(self) == str(other) + + +class Shift(Action): + def __init__(self, to_state): + self.to_state = to_state + + def __repr__(self): + return 'Shift({})'.format(self.to_state) + + +class Reduce(Action): + def __init__(self, rule): + self.rule = rule + + def __repr__(self): + return 'Reduce({})'.format(self.rule) + + +class Accept(Reduce): + def __repr__(self): + return 'Accept({})'.format(self.rule) + + +def print_grammar(g): + """ Pretty print a grammar """ + print(g) + for production in g.productions: + print(production) + class Grammar: """ Defines a grammar of a language """ def __init__(self, terminals): @@ -28,6 +60,10 @@ self.nonterminals = [] self.productions = [] self._first = None # Cached first set + self.start_symbol = None + + def __repr__(self): + return 'Grammar with {} rules'.format(len(self.productions)) def add_production(self, name, symbols, f=None): """ Add a production rule to the grammar """ @@ -50,10 +86,10 @@ @property def first(self): - """ + """ The first set is a mapping from a grammar symbol to a set of set of all terminal symbols that can be the first terminal when - looking for the grammar symbol + looking for the grammar symbol """ if not self._first: self._first = self.calcFirstSets() @@ -125,7 +161,7 @@ return self.closure(iis) def nextItemSet(self, itemset, symbol): - """ + """ Determines the next itemset for the current set and a symbol This is the goto procedure """ @@ -134,7 +170,7 @@ if item.can_shift_over(symbol): next_set.add(item.shifted()) return self.closure(next_set) - + def genCanonicalSet(self, iis): states = [] worklist = [] @@ -153,7 +189,7 @@ addSt(nis) transitions[(states.index(itemset), symbol)] = states.index(nis) return states, transitions - + def checkSymbols(self): """ Checks no symbols are undefined """ for production in self.productions: @@ -161,20 +197,16 @@ if symbol not in self.Symbols + [EPS]: raise ParserGenerationException('Symbol {0} undefined'.format(symbol)) - def getSignature(self): - m = hashlib.md5() - m.update((str(self.productions) + str(self.start_symbol)).encode('ascii')) - signature = m.hexdigest() - def genParser(self): """ Generates a parser from the grammar (using a caching algorithm) """ - signature = self.getSignature() action_table, goto_table = self.doGenerate() p = LRParser(action_table, goto_table, self.start_symbol) p.grammar = self return p def doGenerate(self): + if not self.start_symbol: + self.start_symbol = self.productions[0].name self.checkSymbols() action_table = {} goto_table = {} @@ -183,32 +215,24 @@ # First generate all item sets by using the nextItemset function: states, transitions = self.genCanonicalSet(iis) - def action_str(act): - a, p = act - if a is SHIFT: - return 'Shift {0}'.format(0) - elif a is REDUCE: - return 'Reduce {0}'.format(p) - return 'Other' - def setAction(state, t, action): + assert isinstance(action, Action) key = (state, t) assert type(state) is int assert type(t) is str if key in action_table: action2 = action_table[key] if action != action2: - if (action2[0] == REDUCE) and (action[0] == SHIFT): + if (type(action2) is Reduce) and (type(action) is Shift): # Automatically resolve and do the shift action! # Simple, but almost always what you want!! action_table[key] = action + elif isinstance(action2, Shift) and isinstance(action, Reduce): + pass else: - if (action2[0] == SHIFT) and (action[0] == REDUCE): - pass - else: - a1 = action_str(action) - a2 = action_str(action2) - raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) + a1 = str(action) + a2 = str(action2) + raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2)) else: action_table[key] = action @@ -219,14 +243,15 @@ if item.IsShift and item.Next in self.terminals: # Rule 1, a shift item: nextstate = transitions[(states.index(state), item.Next)] - setAction(states.index(state), item.Next, (SHIFT, nextstate)) + setAction(states.index(state), item.Next, Shift(nextstate)) if item.IsReduce: if item.production.name == self.start_symbol and item.look_ahead == EOF: # Rule 3: accept: - setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production))) + act = Accept(self.productions.index(item.production)) else: # Rule 2, reduce item: - setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production))) + act = Reduce(self.productions.index(item.production)) + setAction(states.index(state), item.look_ahead, act) for nt in self.nonterminals: key = (states.index(state), nt) if key in transitions: @@ -242,12 +267,13 @@ self.f = f def __repr__(self): - return '{0} -> {1}'.format(self.name, self.symbols) + action = ' ' + str(self.f) if self.f else '' + return '{0} -> {1}'.format(self.name, self.symbols) + action class Item: - """ - Represents a partially parsed item + """ + Represents a partially parsed item It has a production it is looking for, a position in this production called the 'dot' and a look ahead symbol that must follow this item. @@ -311,68 +337,63 @@ class LRParser: - """ LR parser """ + """ LR parser automata """ def __init__(self, action_table, goto_table, start_symbol): self.action_table = action_table self.goto_table = goto_table self.start_symbol = start_symbol - def parse(self, toks): + def parse(self, lexer): """ Parse an iterable with tokens """ - assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks)) + assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer)) stack = [0] r_data_stack = [] - try: - look_ahead = toks.__next__() - except StopIteration: - look_ahead = Token(EOF, EOF) + look_ahead = lexer.next_token() assert type(look_ahead) is Token # TODO: exit on this condition: while stack != [0, self.start_symbol, 2222]: - #print(stack) state = stack[-1] # top of stack key = (state, look_ahead.typ) if not key in self.action_table: raise ParserException('Error parsing at character {0}'.format(look_ahead)) - action, param = self.action_table[key] - if action == REDUCE: + action = self.action_table[key] + if type(action) is Reduce: f_args = [] - param = self.grammar.productions[param] - for s in param.symbols: + prod = self.grammar.productions[action.rule] + for s in prod.symbols: stack.pop() stack.pop() f_args.append(r_data_stack.pop()) f_args.reverse() r_data = None - if param.f: - r_data = param.f(*f_args) + if prod.f: + r_data = prod.f(*f_args) state = stack[-1] - stack.append(param.name) - stack.append(self.goto_table[(state, param.name)]) + stack.append(prod.name) + stack.append(self.goto_table[(state, prod.name)]) r_data_stack.append(r_data) - elif action == SHIFT: + elif type(action) is Shift: stack.append(look_ahead.typ) - stack.append(param) - r_data_stack.append(look_ahead.val) - try: - look_ahead = toks.__next__() - except StopIteration: - look_ahead = Token(EOF, EOF) + stack.append(action.to_state) + r_data_stack.append(look_ahead) + look_ahead = lexer.next_token() assert type(look_ahead) is Token - elif action == ACCEPT: + elif type(action) is Accept: # Pop last rule data off the stack: f_args = [] - param = self.grammar.productions[param] + param = self.grammar.productions[action.rule] for s in param.symbols: stack.pop() stack.pop() f_args.append(r_data_stack.pop()) f_args.reverse() if param.f: - param.f(*f_args) + ret_val = param.f(*f_args) + else: + ret_val = None # Break out! break # At exit, the stack must be 1 long # TODO: fix that this holds: #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) - + return ret_val