diff python/pyyacc.py @ 318:e84047f29c78

Add burg and yacc initial attempts
author Windel Bouwman
date Tue, 31 Dec 2013 12:38:15 +0100
parents 6259856841a0
children 8d07a4254f04
line wrap: on
line diff
--- a/python/pyyacc.py	Sun Dec 22 15:50:59 2013 +0100
+++ b/python/pyyacc.py	Tue Dec 31 12:38:15 2013 +0100
@@ -2,14 +2,11 @@
   Parser generator script
 """
 
-import hashlib
 from ppci import Token
 
 EPS = 'EPS'
 EOF = 'EOF'
-SHIFT = 1
-REDUCE = 2
-ACCEPT = 3
+
 
 class ParserGenerationException(Exception):
     """ Raised when something goes wrong during parser generation """
@@ -21,6 +18,41 @@
     pass
 
 
+class Action:
+    def __repr__(self):
+        return 'Action'
+
+    def __eq__(self, other):
+        return str(self) == str(other)
+
+
+class Shift(Action):
+    def __init__(self, to_state):
+        self.to_state = to_state
+
+    def __repr__(self):
+        return 'Shift({})'.format(self.to_state)
+
+
+class Reduce(Action):
+    def __init__(self, rule):
+        self.rule = rule
+
+    def __repr__(self):
+        return 'Reduce({})'.format(self.rule)
+
+
+class Accept(Reduce):
+    def __repr__(self):
+        return 'Accept({})'.format(self.rule)
+
+
+def print_grammar(g):
+    """ Pretty print a grammar """
+    print(g)
+    for production in g.productions:
+        print(production)
+
 class Grammar:
     """ Defines a grammar of a language """
     def __init__(self, terminals):
@@ -28,6 +60,10 @@
         self.nonterminals = []
         self.productions = []
         self._first = None  # Cached first set
+        self.start_symbol = None
+
+    def __repr__(self):
+        return 'Grammar with {} rules'.format(len(self.productions))
 
     def add_production(self, name, symbols, f=None):
         """ Add a production rule to the grammar """
@@ -50,10 +86,10 @@
 
     @property
     def first(self):
-        """ 
+        """
           The first set is a mapping from a grammar symbol to a set of
           set of all terminal symbols that can be the first terminal when
-          looking for the grammar symbol 
+          looking for the grammar symbol
         """
         if not self._first:
             self._first = self.calcFirstSets()
@@ -125,7 +161,7 @@
         return self.closure(iis)
 
     def nextItemSet(self, itemset, symbol):
-        """ 
+        """
             Determines the next itemset for the current set and a symbol 
             This is the goto procedure
         """
@@ -134,7 +170,7 @@
             if item.can_shift_over(symbol):
                 next_set.add(item.shifted())
         return self.closure(next_set)
-    
+
     def genCanonicalSet(self, iis):
         states = []
         worklist = []
@@ -153,7 +189,7 @@
                 addSt(nis)
                 transitions[(states.index(itemset), symbol)] = states.index(nis)
         return states, transitions
-    
+
     def checkSymbols(self):
         """ Checks no symbols are undefined """
         for production in self.productions:
@@ -161,20 +197,16 @@
                 if symbol not in self.Symbols + [EPS]:
                     raise ParserGenerationException('Symbol {0} undefined'.format(symbol))
 
-    def getSignature(self):
-        m = hashlib.md5()
-        m.update((str(self.productions) + str(self.start_symbol)).encode('ascii'))
-        signature = m.hexdigest()
-        
     def genParser(self):
         """ Generates a parser from the grammar (using a caching algorithm) """
-        signature = self.getSignature() 
         action_table, goto_table = self.doGenerate()
         p = LRParser(action_table, goto_table, self.start_symbol)
         p.grammar = self
         return p
 
     def doGenerate(self):
+        if not self.start_symbol:
+            self.start_symbol = self.productions[0].name
         self.checkSymbols()
         action_table = {}
         goto_table = {}
@@ -183,32 +215,24 @@
         # First generate all item sets by using the nextItemset function:
         states, transitions = self.genCanonicalSet(iis)
 
-        def action_str(act):
-            a, p = act
-            if a is SHIFT:
-                return 'Shift {0}'.format(0)
-            elif a is REDUCE:
-                return 'Reduce {0}'.format(p)
-            return 'Other'
-        
         def setAction(state, t, action):
+            assert isinstance(action, Action)
             key = (state, t)
             assert type(state) is int
             assert type(t) is str
             if key in action_table:
                 action2 = action_table[key]
                 if action != action2:
-                    if (action2[0] == REDUCE) and (action[0] == SHIFT):
+                    if (type(action2) is Reduce) and (type(action) is Shift):
                         # Automatically resolve and do the shift action!
                         # Simple, but almost always what you want!!
                         action_table[key] = action
+                    elif isinstance(action2, Shift) and isinstance(action, Reduce):
+                        pass
                     else:
-                        if (action2[0] == SHIFT) and (action[0] == REDUCE):
-                            pass
-                        else:
-                            a1 = action_str(action)
-                            a2 = action_str(action2)
-                            raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2))
+                        a1 = str(action)
+                        a2 = str(action2)
+                        raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2))
             else:
                 action_table[key] = action
 
@@ -219,14 +243,15 @@
                 if item.IsShift and item.Next in self.terminals:
                     # Rule 1, a shift item:
                     nextstate = transitions[(states.index(state), item.Next)]
-                    setAction(states.index(state), item.Next, (SHIFT, nextstate))
+                    setAction(states.index(state), item.Next, Shift(nextstate))
                 if item.IsReduce:
                     if item.production.name == self.start_symbol and item.look_ahead == EOF:
                         # Rule 3: accept:
-                        setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production)))
+                        act = Accept(self.productions.index(item.production))
                     else:
                         # Rule 2, reduce item:
-                        setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production)))
+                        act = Reduce(self.productions.index(item.production))
+                    setAction(states.index(state), item.look_ahead, act)
             for nt in self.nonterminals:
                 key = (states.index(state), nt)
                 if key in transitions:
@@ -242,12 +267,13 @@
         self.f = f
 
     def __repr__(self):
-        return '{0} -> {1}'.format(self.name, self.symbols)
+        action = ' ' + str(self.f) if self.f else ''
+        return '{0} -> {1}'.format(self.name, self.symbols) + action
 
 
 class Item:
-    """ 
-        Represents a partially parsed item 
+    """
+        Represents a partially parsed item
         It has a production it is looking for, a position
         in this production called the 'dot' and a look ahead
         symbol that must follow this item.
@@ -311,68 +337,63 @@
 
 
 class LRParser:
-    """ LR parser """
+    """ LR parser automata """
     def __init__(self, action_table, goto_table, start_symbol):
         self.action_table = action_table
         self.goto_table = goto_table
         self.start_symbol = start_symbol
 
-    def parse(self, toks):
+    def parse(self, lexer):
         """ Parse an iterable with tokens """
-        assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks))
+        assert hasattr(lexer, 'next_token'), '{0} is no lexer'.format(type(lexer))
         stack = [0]
         r_data_stack = []
-        try:
-            look_ahead = toks.__next__()
-        except StopIteration:
-            look_ahead = Token(EOF, EOF)
+        look_ahead = lexer.next_token()
         assert type(look_ahead) is Token
         # TODO: exit on this condition:
         while stack != [0, self.start_symbol, 2222]:
-            #print(stack)
             state = stack[-1]   # top of stack
             key = (state, look_ahead.typ)
             if not key in self.action_table:
                 raise ParserException('Error parsing at character {0}'.format(look_ahead))
-            action, param = self.action_table[key]
-            if action == REDUCE:
+            action = self.action_table[key]
+            if type(action) is Reduce:
                 f_args = []
-                param = self.grammar.productions[param]
-                for s in param.symbols:
+                prod = self.grammar.productions[action.rule]
+                for s in prod.symbols:
                     stack.pop()
                     stack.pop()
                     f_args.append(r_data_stack.pop())
                 f_args.reverse()
                 r_data = None
-                if param.f:
-                    r_data = param.f(*f_args)
+                if prod.f:
+                    r_data = prod.f(*f_args)
                 state = stack[-1]
-                stack.append(param.name)
-                stack.append(self.goto_table[(state, param.name)])
+                stack.append(prod.name)
+                stack.append(self.goto_table[(state, prod.name)])
                 r_data_stack.append(r_data)
-            elif action == SHIFT:
+            elif type(action) is Shift:
                 stack.append(look_ahead.typ)
-                stack.append(param)
-                r_data_stack.append(look_ahead.val)
-                try:
-                    look_ahead = toks.__next__()
-                except StopIteration:
-                    look_ahead = Token(EOF, EOF)
+                stack.append(action.to_state)
+                r_data_stack.append(look_ahead)
+                look_ahead = lexer.next_token()
                 assert type(look_ahead) is Token
-            elif action == ACCEPT:
+            elif type(action) is Accept:
                 # Pop last rule data off the stack:
                 f_args = []
-                param = self.grammar.productions[param]
+                param = self.grammar.productions[action.rule]
                 for s in param.symbols:
                     stack.pop()
                     stack.pop()
                     f_args.append(r_data_stack.pop())
                 f_args.reverse()
                 if param.f:
-                    param.f(*f_args)
+                    ret_val = param.f(*f_args)
+                else:
+                    ret_val = None
                 # Break out!
                 break
         # At exit, the stack must be 1 long
         # TODO: fix that this holds:
         #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack) 
-
+        return ret_val