changeset 185:51a6440d6398

Fixed LR(1) parser
author Windel Bouwman
date Fri, 24 May 2013 20:45:03 +0200
parents fe2b72381a83
children 46d62dadd61b
files python/pyyacc.py python/testpyy.py
diffstat 2 files changed, 85 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/python/pyyacc.py	Fri May 24 16:13:23 2013 +0200
+++ b/python/pyyacc.py	Fri May 24 20:45:03 2013 +0200
@@ -8,6 +8,13 @@
 REDUCE = 2
 ACCEPT = 3
 
+class ParserGenerationException(Exception):
+    pass
+
+
+class ParserException(Exception):
+    pass
+
 
 class Grammar:
     """ Defines a grammar of a language """
@@ -15,6 +22,7 @@
         self.terminals = terminals
         self.nonterminals = []
         self.productions = []
+        self._first = None  # Cached first set
 
     def add_production(self, name, symbols):
         """ Add a production rule to the grammar """
@@ -33,6 +41,12 @@
         """ Get all the symbols defined by this grammar """
         return self.nonterminals + self.terminals
 
+    @property
+    def first(self):
+        if not self._first:
+            self._first = self.calcFirstSets()
+        return self._first
+
     def calcFirstSets(self):
         """
             Calculate first sets for each grammar symbol
@@ -45,7 +59,7 @@
             first[t] = set([t])
         for nt in self.nonterminals:
             first[nt] = set()
-        epsset = set([EPS])
+        epsset = {EPS}
         while True:
             some_change = False
             for p in self.productions:
@@ -78,7 +92,7 @@
                 f.discard(EPS)
                 f.add(item.look_ahead)
             return f
-            
+        # Start of algorithm: 
         while worklist:
             item = worklist.pop(0)
             if not item.IsShift:
@@ -110,13 +124,13 @@
         return self.closure(next_set)
     
     def genCanonicalSet(self, iis):
-        states = set()
+        states = []
         worklist = []
-        goto_table = {}
+        transitions = {}
         def addSt(s):
             if not (s in states):
                 worklist.append(s)
-                states.add(s)
+                states.append(s)
         addSt(iis)
         while len(worklist) > 0:
             itemset = worklist.pop(0)
@@ -124,41 +138,50 @@
                 nis = self.nextItemSet(itemset, symbol)
                 if not nis:
                     continue
-                goto_table[(itemset, symbol)] = nis
                 addSt(nis)
-        return states, goto_table
+                transitions[(states.index(itemset), symbol)] = states.index(nis)
+        return states, transitions
         
     def genParser(self):
         """ Generates a parser from the grammar """
         action_table = {}
-        self.first = self.calcFirstSets()
+        goto_table = {}
         iis = self.initialItemSet()
 
         # First generate all item sets by using the nextItemset function:
-        states, goto_table = self.genCanonicalSet(iis)
-
-        # Number the states:
-        number_states = {}
-        for s, i in zip(states, range(len(states))):
-            number_states[s] = i
+        states, transitions = self.genCanonicalSet(iis)
+        
+        def setAction(state, t, action):
+            key = (state, t)
+            if key in action_table:
+                action2 = action_table[key]
+                if action != action2:
+                    raise ParserGenerationException('LR construction conflict')
+            else:
+                action_table[key] = action
 
         # Fill action table:
         for state in states:
+            # Detect conflicts:
             for item in state:
                 if item.IsShift and item.Next in self.terminals:
-                    action_table[(state, item.Next)] = (SHIFT, 0)
-                elif item.IsReduce:
-                    if item.look_ahead == EOF:
-                        action_table[(state, item.look_ahead)] = (ACCEPT, 0)
+                    # Rule 1, a shift item:
+                    nextstate = transitions[(states.index(state), item.Next)]
+                    setAction(states.index(state), item.Next, (SHIFT, nextstate))
+                if item.IsReduce:
+                    if item.production.name == self.start_symbol and item.look_ahead == EOF:
+                        # Rule 3: accept:
+                        setAction(states.index(state), item.look_ahead, (ACCEPT, None))
                     else:
-                        action_table[(state, item.look_ahead)] = (REDUCE, item.production)
-                else:
-                    pass
+                        # Rule 2, reduce item:
+                        setAction(states.index(state), item.look_ahead, (REDUCE, item.production))
+            for nt in self.nonterminals:
+                key = (states.index(state), nt)
+                if key in transitions:
+                    goto_table[key] = transitions[key]
 
-        p = LRParser(action_table)
-        p.goto_table = goto_table
-        p.s0 = iis
-        return p
+        return LRParser(action_table, goto_table)
+
 
 class Production:
     """ Production rule for a grammar """
@@ -222,21 +245,23 @@
         postdot = ' '.join(prod.symbols[self.dotpos:])
         nt = prod.name
         args = (nt, predot, postdot, self.look_ahead)
-        return '[{0} -> {1} . {2}, {3}]'.format(*args)
+        return '[{0} -> {1} . {2} -> {3}]'.format(*args)
 
 
 class LRParser:
     """ LR parser """
-    def __init__(self, action_table):
+    def __init__(self, action_table, goto_table):
         self.action_table = action_table
+        self.goto_table = goto_table
 
     def parse(self, toks):
-        stack = [EOF, self.s0]
+        stack = [0]
         look_ahead = toks.pop(0)
         while True:
             state = stack[-1]   # top of stack
             key = (state, look_ahead)
             if not key in self.action_table:
+                print(key)
                 raise Exception('Error parsing')
             action, param = self.action_table[(state, look_ahead)]
             if action == REDUCE:
@@ -248,8 +273,7 @@
                 stack.append(self.goto_table[(state, param.name)])
             elif action == SHIFT:
                 stack.append(look_ahead)
-                s, i = stack[-2:]
-                stack.append(self.goto_table[(s, i)])
+                stack.append(param)
                 look_ahead = toks.pop(0)
             elif action == ACCEPT:
                 break
--- a/python/testpyy.py	Fri May 24 16:13:23 2013 +0200
+++ b/python/testpyy.py	Fri May 24 20:45:03 2013 +0200
@@ -24,6 +24,32 @@
         # 4. feed input:
         p.parse(tokens)
 
+class testExpressionGrammar(unittest.TestCase):
+    def setUp(self):
+        g = Grammar(['EOF', 'identifier', '(', ')', '+', '*', 'num'])
+        g.add_production('input', ['expression'])
+        g.add_production('expression', ['term'])
+        g.add_production('expression', ['expression', '+', 'term'])
+        g.add_production('term', ['factor'])
+        g.add_production('term', ['term', '*', 'factor'])
+        g.add_production('factor', ['(', 'expression', ')'])
+        g.add_production('factor', ['identifier'])
+        g.add_production('factor', ['num'])
+        g.start_symbol = 'input'
+        self.g = g
+
+    def testFirstSimpleGrammar(self):
+        # 1. define a simple grammar:
+        first = self.g.calcFirstSets()
+        self.assertEqual(first['input'], {'identifier', '(', 'num'})
+        self.assertEqual(first['term'], {'identifier', '(', 'num'})
+
+    def testCanonical(self):
+        s0 = self.g.initialItemSet()
+        s, gt = self.g.genCanonicalSet(s0)
+        # Must result in 12 sets:
+        self.assertEqual(len(s), 24)
+
 class testPG(unittest.TestCase):
     """ Tests several parts of the parser generator """
     def setUp(self):
@@ -34,7 +60,6 @@
         g.add_production('pair', ['(', 'pair', ')'])
         g.add_production('pair', ['(', ')'])
         g.start_symbol = 'goal'
-        g.first = g.calcFirstSets()
         self.g = g
 
     def testFirstSet(self):
@@ -60,15 +85,13 @@
     def testCanonical(self):
         s0 = self.g.initialItemSet()
         s, gt = self.g.genCanonicalSet(s0)
-        pprint.pprint(s)
         # Must result in 12 sets:
         self.assertEqual(len(s), 12)
 
     def testClosure(self):
         p0, p1, p2, p3, p4 = self.g.productions
         s0 = set()
-        for p in self.g.productionsForName(self.g.start_symbol):
-            s0.add(Item(p, 0, EOF))
+        s0.add(Item(p0, 0, EOF))
         self.assertEqual(len(s0), 1)    # 1 rule
         self.assertIn(Item(p0, 0, EOF), s0)
 
@@ -85,9 +108,12 @@
         self.assertIn(Item(p4, 0, '('), s0)
 
     def testParser(self):
-        tokens = ['(', '(', ')', '(', ')', ')', 'EOF']
+        tokens = ['(', '(', ')', ')', '(', ')', EOF]
         # 3. build parser:
         p = self.g.genParser()
+        self.assertEqual(len(p.goto_table), 5)
+        self.assertEqual(len(p.action_table), 19)
+
         # 4. feed input:
         p.parse(tokens)