Mercurial > lcfOS

--- a/python/pyyacc.py	Sat May 18 18:30:26 2013 +0200
+++ b/python/pyyacc.py	Fri May 24 16:13:23 2013 +0200
@@ -3,6 +3,10 @@
 """

 EPS = 'EPS'
+EOF = 'EOF'
+SHIFT = 1
+REDUCE = 2
+ACCEPT = 3


 class Grammar:
@@ -30,9 +34,14 @@
         return self.nonterminals + self.terminals

     def calcFirstSets(self):
-        """ Calculate first sets for each grammar symbol """
+        """
+            Calculate first sets for each grammar symbol
+            This is a dictionary which maps each grammar symbol
+            to a set of terminals that can be encountered first
+            when looking for the symbol.
+        """
         first = {}
-        for t in self.terminals + ['EOF', EPS]:
+        for t in self.terminals + [EOF, EPS]:
             first[t] = set([t])
         for nt in self.nonterminals:
             first[nt] = set()
@@ -55,6 +64,101 @@
                 break
         return first

+    def closure(self, itemset):
+        """ Expand itemset by using epsilon moves """
+        worklist = list(itemset)
+        def addIt(itm):
+            if not itm in itemset:
+                itemset.add(itm)
+                worklist.append(itm)
+        def first2(itm, la):
+            # When using the first sets, create a copy:
+            f = set(self.first[item.NextNext])
+            if EPS in f:
+                f.discard(EPS)
+                f.add(item.look_ahead)
+            return f
+
+        while worklist:
+            item = worklist.pop(0)
+            if not item.IsShift:
+                continue
+            if not (item.Next in self.nonterminals):
+                continue
+            C = item.Next
+            for add_p in self.productionsForName(C):
+                for b in first2(item, item.look_ahead):
+                    addIt(Item(add_p, 0, b))
+        return frozenset(itemset)
+
+    def initialItemSet(self):
+        """ Calculates the initial item set """
+        iis = set()
+        for p in self.productionsForName(self.start_symbol):
+            iis.add(Item(p, 0, EOF))
+        return self.closure(iis)
+
+    def nextItemSet(self, itemset, symbol):
+        """
+            Determines the next itemset for the current set and a symbol
+            This is the goto procedure
+        """
+        next_set = set()
+        for item in itemset:
+            if item.can_shift_over(symbol):
+                next_set.add(item.shifted())
+        return self.closure(next_set)
+
+    def genCanonicalSet(self, iis):
+        states = set()
+        worklist = []
+        goto_table = {}
+        def addSt(s):
+            if not (s in states):
+                worklist.append(s)
+                states.add(s)
+        addSt(iis)
+        while len(worklist) > 0:
+            itemset = worklist.pop(0)
+            for symbol in self.Symbols:
+                nis = self.nextItemSet(itemset, symbol)
+                if not nis:
+                    continue
+                goto_table[(itemset, symbol)] = nis
+                addSt(nis)
+        return states, goto_table
+
+    def genParser(self):
+        """ Generates a parser from the grammar """
+        action_table = {}
+        self.first = self.calcFirstSets()
+        iis = self.initialItemSet()
+
+        # First generate all item sets by using the nextItemset function:
+        states, goto_table = self.genCanonicalSet(iis)
+
+        # Number the states:
+        number_states = {}
+        for s, i in zip(states, range(len(states))):
+            number_states[s] = i
+
+        # Fill action table:
+        for state in states:
+            for item in state:
+                if item.IsShift and item.Next in self.terminals:
+                    action_table[(state, item.Next)] = (SHIFT, 0)
+                elif item.IsReduce:
+                    if item.look_ahead == EOF:
+                        action_table[(state, item.look_ahead)] = (ACCEPT, 0)
+                    else:
+                        action_table[(state, item.look_ahead)] = (REDUCE, item.production)
+                else:
+                    pass
+
+        p = LRParser(action_table)
+        p.goto_table = goto_table
+        p.s0 = iis
+        return p

 class Production:
     """ Production rule for a grammar """
@@ -70,6 +174,7 @@
     def __init__(self, production, dotpos, look_ahead):
         self.production = production
         self.dotpos = dotpos
+        assert self.dotpos <= len(self.production.symbols)
         self.look_ahead = look_ahead

     def getdata(self):
@@ -96,10 +201,18 @@
     def Next(self):
         return self.production.symbols[self.dotpos]

+    def can_shift_over(self, symbol):
+        """ Determines if this item can shift over the given symbol """
+        return self.IsShift and self.Next == symbol
+
+    def shifted(self):
+        """ Creates a new item that is shifted one position """
+        return Item(self.production, self.dotpos + 1, self.look_ahead)
+
     @property
     def NextNext(self):
         if self.dotpos + 1 >= len(self.production.symbols):
-            return 'EPS'
+            return EPS
         else:
             return self.production.symbols[self.dotpos + 1]

@@ -109,81 +222,16 @@
         postdot = ' '.join(prod.symbols[self.dotpos:])
         nt = prod.name
         args = (nt, predot, postdot, self.look_ahead)
-        return '[{0} -> {1} _dot_ {2},{3}]'.format(*args)
-
-
-class LRgenerator:
-    """ Parser generator """
-    def __init__(self, g):
-        self.grammar = g
-
-    def closure(self, itemset):
-        """ Expand itemset by using epsilon moves """
-        worklist = list(itemset)
-        while worklist:
-            item = worklist.pop(0)
-            if item.IsShift and item.Next in self.grammar.nonterminals:
-                for add_p in self.grammar.productionsForName(item.Next):
-                    f = self.first[item.NextNext]
-                    if 'EPS' in f:
-                        f.discard('EPS')
-                        f.add(item.look_ahead)
-                    for b in f:
-                        i = Item(add_p, 0, b)
-                        if not i in itemset:
-                            itemset.add(i)
-                            worklist.append(i)
-        return frozenset(itemset)
-
-    def initialItemSet(self):
-        """ Calculates the initial item set """
-        iis = set()
-        for p in self.grammar.productionsForName(self.grammar.start_symbol):
-            iis.add(Item(p, 0, 'EOF'))
-        return self.closure(iis)
-
-    def nextItemSet(self, itemset, symbol):
-        """ Determines the next itemset for the current set and a symbol """
-        nis = set()
-        for item in itemset:
-            if item.IsShift and item.Next == symbol:
-                nis.add(Item(item.production, item.dotpos + 1, item.look_ahead))
-        return self.closure(nis)
-
-    def genParser(self):
-        states = set()
-        goto_table = {}
-        action_table = {}
-        self.first = self.grammar.calcFirstSets()
-        iis = self.initialItemSet()
-        states.add(iis)
-
-        # First generate all item sets by using the nextItemset function:
-        worklist = [iis]
-        while len(worklist) > 0:
-            itemset = worklist.pop(0)
-            for symbol in self.grammar.Symbols:
-                nis = self.nextItemSet(itemset, symbol)
-                if nis:
-                    goto_table[(itemset, symbol)] = nis
-                    if not nis in states:
-                        worklist.append(nis)
-                        states.add(nis)
-
-        # Number the states:
-        print('States:')
-        for s, i in zip(states, range(len(states))):
-            print(i, ', '.join(str(x) for x in s))
-        p = LRParser()
-        p.goto_table = goto_table
-        p.action_table = action_table
-        p.s0 = iis
-        return p
+        return '[{0} -> {1} . {2}, {3}]'.format(*args)


 class LRParser:
+    """ LR parser """
+    def __init__(self, action_table):
+        self.action_table = action_table
+
     def parse(self, toks):
-        stack = ['EOF', self.s0]
+        stack = [EOF, self.s0]
         look_ahead = toks.pop(0)
         while True:
             state = stack[-1]   # top of stack
@@ -191,23 +239,22 @@
             if not key in self.action_table:
                 raise Exception('Error parsing')
             action, param = self.action_table[(state, look_ahead)]
-            if action == 'reduce':
+            if action == REDUCE:
                 for s in param.symbols:
                     stack.pop()
                     stack.pop()
                 state = stack[-1]
                 stack.append(param.name)
                 stack.append(self.goto_table[(state, param.name)])
-            elif action == 'shift':
+            elif action == SHIFT:
                 stack.append(look_ahead)
                 s, i = stack[-2:]
                 stack.append(self.goto_table[(s, i)])
                 look_ahead = toks.pop(0)
-            elif action == 'accept':
+            elif action == ACCEPT:
                 break

-# Test it:
-def test():
+def testSimpleGrammar():
     # 1. define a simple grammar:
     g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
     g.add_production('input', ['expression'])
@@ -221,12 +268,10 @@
     # 2. define input:
     tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
     # 3. build parser:
-    p = LRgenerator(g).genParser()
+    p = g.genParser()
     # 4. feed input:
     p.parse(tokens)

-print('Hallo')
+
 if __name__ == '__main__':
-    print('Hallo')
-    test()
-
+    testSimpleGrammar()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/testpyy.py	Fri May 24 16:13:23 2013 +0200
@@ -0,0 +1,97 @@
+import unittest, pprint
+from pyyacc import Grammar, Item, EOF
+
+
+class testLR(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def testSimpleGrammar(self):
+        # 1. define a simple grammar:
+        g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
+        g.add_production('input', ['expression'])
+        g.add_production('expression', ['term'])
+        g.add_production('expression', ['expression', '+', 'term'])
+        g.add_production('term', ['factor'])
+        g.add_production('term', ['term', '*', 'factor'])
+        g.add_production('factor', ['(', 'expression', ')'])
+        g.add_production('factor', ['identifier'])
+        g.start_symbol = 'input'
+        # 2. define input:
+        tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
+        # 3. build parser:
+        p = g.genParser()
+        # 4. feed input:
+        p.parse(tokens)
+
+class testPG(unittest.TestCase):
+    """ Tests several parts of the parser generator """
+    def setUp(self):
+        g = Grammar(['(', ')'])
+        g.add_production('goal', ['list'])
+        g.add_production('list', ['list', 'pair'])
+        g.add_production('list', ['pair'])
+        g.add_production('pair', ['(', 'pair', ')'])
+        g.add_production('pair', ['(', ')'])
+        g.start_symbol = 'goal'
+        g.first = g.calcFirstSets()
+        self.g = g
+
+    def testFirstSet(self):
+        for a in ['(', ')', EOF, 'EPS']:
+            self.assertEqual(self.g.first[a], {a})
+        for nt in ['list', 'pair', 'goal']:
+            self.assertEqual(self.g.first[nt], {'('})
+
+    def testInitItemSet(self):
+        p0, p1, p2, p3, p4 = self.g.productions
+        s0 = self.g.initialItemSet()
+        self.assertEqual(len(s0), 9)    # 9 with the goal rule included!
+        self.assertIn(Item(p0, 0, EOF), s0)
+        self.assertIn(Item(p1, 0, EOF), s0)
+        self.assertIn(Item(p1, 0, '('), s0)
+        self.assertIn(Item(p2, 0, EOF), s0)
+        self.assertIn(Item(p2, 0, '('), s0)
+        self.assertIn(Item(p3, 0, EOF), s0)
+        self.assertIn(Item(p3, 0, '('), s0)
+        self.assertIn(Item(p4, 0, EOF), s0)
+        self.assertIn(Item(p4, 0, '('), s0)
+
+    def testCanonical(self):
+        s0 = self.g.initialItemSet()
+        s, gt = self.g.genCanonicalSet(s0)
+        pprint.pprint(s)
+        # Must result in 12 sets:
+        self.assertEqual(len(s), 12)
+
+    def testClosure(self):
+        p0, p1, p2, p3, p4 = self.g.productions
+        s0 = set()
+        for p in self.g.productionsForName(self.g.start_symbol):
+            s0.add(Item(p, 0, EOF))
+        self.assertEqual(len(s0), 1)    # 1 rule
+        self.assertIn(Item(p0, 0, EOF), s0)
+
+        # Invoke closure on set:
+        s0 = self.g.closure(s0)
+        self.assertIn(Item(p0, 0, EOF), s0)
+        self.assertIn(Item(p1, 0, EOF), s0)
+        self.assertIn(Item(p1, 0, '('), s0)
+        self.assertIn(Item(p2, 0, EOF), s0)
+        self.assertIn(Item(p2, 0, '('), s0)
+        self.assertIn(Item(p3, 0, EOF), s0)
+        self.assertIn(Item(p3, 0, '('), s0)
+        self.assertIn(Item(p4, 0, EOF), s0)
+        self.assertIn(Item(p4, 0, '('), s0)
+
+    def testParser(self):
+        tokens = ['(', '(', ')', '(', ')', ')', 'EOF']
+        # 3. build parser:
+        p = self.g.genParser()
+        # 4. feed input:
+        p.parse(tokens)
+
+if __name__ == '__main__':
+    unittest.main()
+
+