diff python/pyyacc.py @ 181:216da5e46efc

Changed indent to 4 spaces to comply to convention
author Windel Bouwman
date Sat, 18 May 2013 17:45:26 +0200
parents 25a0753da4cf
children fe2b72381a83
line wrap: on
line diff
--- a/python/pyyacc.py	Thu May 09 17:35:19 2013 +0200
+++ b/python/pyyacc.py	Sat May 18 17:45:26 2013 +0200
@@ -1,165 +1,232 @@
+"""
+  Parser generator script
+"""
+
+EPS = 'EPS'
+
+
 class Grammar:
-   def __init__(self, terminals):
-      self.terminals = terminals
-      self.nonterminals = []
-      self.productions = []
-   def add_production(self, name, symbols):
-      p = Production(name, symbols)
-      self.productions.append(p)
-      if not name in self.nonterminals:
-         self.nonterminals.append(name)
-   def productionsForName(self, name):
-      return [p for p in self.productions if p.name == name]
-   @property
-   def Symbols(self):
-      return self.nonterminals + self.terminals
-   def calcFollow(self):
-      follow = {}
-      for nt in self.nonterminals:
-         follow[nt] = set()
-      while True:
-         change = False
-         # 1.
-         for p in self.productions:
-            pass
-         if not change:
-            break
-      return follow
-   def calcFirst(self):
-      first = {}
-      return first
+    """ Defines a grammar of a language """
+    def __init__(self, terminals):
+        self.terminals = terminals
+        self.nonterminals = []
+        self.productions = []
+
+    def add_production(self, name, symbols):
+        """ Add a production rule to the grammar """
+        production = Production(name, symbols)
+        self.productions.append(production)
+        assert not name in self.terminals, "Cannot redefine terminal"
+        if not name in self.nonterminals:
+            self.nonterminals.append(name)
+
+    def productionsForName(self, name):
+        """ Retrieve all productions for a non terminal """
+        return [p for p in self.productions if p.name == name]
+
+    @property
+    def Symbols(self):
+        """ Get all the symbols defined by this grammar """
+        return self.nonterminals + self.terminals
+
+    def calcFirstSets(self):
+        """ Calculate first sets for each grammar symbol """
+        first = {}
+        for t in self.terminals + ['EOF', EPS]:
+            first[t] = set([t])
+        for nt in self.nonterminals:
+            first[nt] = set()
+        epsset = set([EPS])
+        while True:
+            some_change = False
+            for p in self.productions:
+                rhs = set()
+                for beta in p.symbols:
+                    rhs = rhs | (first[beta] - epsset)
+                    if not EPS in first[beta]:
+                        break
+                else:
+                    if EPS in first[beta]:
+                        rhs.add(EPS)
+                if rhs - first[p.name]:
+                    first[p.name] |= rhs
+                    some_change = True
+            if not some_change:
+                break
+        return first
+
 
 class Production:
-   def __init__(self, name, symbols):
-      self.name = name
-      self.symbols = symbols
-   def __repr__(self):
-      return '{0} -> {1}'.format(self.name, self.symbols)
+    """ Production rule for a grammar """
+    def __init__(self, name, symbols):
+        self.name = name
+        self.symbols = symbols
+
+    def __repr__(self):
+        return '{0} -> {1}'.format(self.name, self.symbols)
+
 
 class Item:
-   def __init__(self, production, dotpos):
-      self.production = production
-      self.dotpos = dotpos
-   def __eq__(self, other):
-      if type(other) is type(self):
-         return (self.production, self.dotpos) == (other.production, other.dotpos)
-      return False
-   def __hash__(self):
-      return (self.production, self.dotpos).__hash__()
-   @property
-   def IsReduce(self):
-      return self.dotpos == len(self.production.symbols)
-   @property
-   def IsShift(self):
-      return not self.IsReduce
-   @property
-   def Next(self):
-      return self.production.symbols[self.dotpos]
-   def matches(self, symbol):
-      return self.Next == symbol
-   def __repr__(self):
-      return '{0} -> {1} _dot_ {2} {{}}'.format(self.production.name, ' '.join(self.production.symbols[0:self.dotpos]), ' '.join(self.production.symbols[self.dotpos:]))
+    def __init__(self, production, dotpos, look_ahead):
+        self.production = production
+        self.dotpos = dotpos
+        self.look_ahead = look_ahead
+
+    def getdata(self):
+        """ Gets the members as a tuple """
+        return (self.production, self.dotpos, self.look_ahead)
+
+    def __eq__(self, other):
+        if type(other) is type(self):
+            return self.getdata() == other.getdata()
+        return False
+
+    def __hash__(self):
+        return self.getdata().__hash__()
+
+    @property
+    def IsReduce(self):
+        return self.dotpos == len(self.production.symbols)
+
+    @property
+    def IsShift(self):
+        return not self.IsReduce
+
+    @property
+    def Next(self):
+        return self.production.symbols[self.dotpos]
+
+    @property
+    def NextNext(self):
+        if self.dotpos + 1 >= len(self.production.symbols):
+            return 'EPS'
+        else:
+            return self.production.symbols[self.dotpos + 1]
+
+    def __repr__(self):
+        prod = self.production
+        predot = ' '.join(prod.symbols[0:self.dotpos])
+        postdot = ' '.join(prod.symbols[self.dotpos:])
+        nt = prod.name
+        args = (nt, predot, postdot, self.look_ahead)
+        return '[{0} -> {1} _dot_ {2},{3}]'.format(*args)
+
 
 class LRgenerator:
-   def __init__(self, g):
-      self.grammar = g
-   def e_closure(self, S):
-      """ Expand itemset by using epsilon moves """
-      something_changed = True
-      while something_changed:
-         something_changed = False
-         worklist = list(S)
-         for item in worklist:
+    """ Parser generator """
+    def __init__(self, g):
+        self.grammar = g
+
+    def closure(self, itemset):
+        """ Expand itemset by using epsilon moves """
+        worklist = list(itemset)
+        while worklist:
+            item = worklist.pop(0)
             if item.IsShift and item.Next in self.grammar.nonterminals:
-               n = item.Next
-               for add_p in self.grammar.productionsForName(n):
-                  i = Item(add_p, 0)
-                  if not i in S:
-                     S.add(i)
-                     something_changed = True
-      return frozenset(S)
-   def initialItemSet(self):
-      nis = set()
-      for p in self.grammar.productionsForName(self.grammar.start_symbol):
-         nis.add(Item(p, 0))
-      return self.e_closure(nis)
-   def nextItemSet(self, itemset, symbol):
-      nis = set()
-      for item in itemset:
-         if item.IsShift and item.Next == symbol:
-            nis.add(Item(item.production, item.dotpos + 1))
-      return self.e_closure(nis)
-   def genParser(self):
-      states = set()
-      goto_table = {}
-      action_table = {}
-      iis = self.initialItemSet()
-      worklist = [iis]
-      states.add(iis)
-      while len(worklist) > 0:
-         itemset = worklist.pop(0)
-         has_goto = False
-         for symbol in self.grammar.Symbols:
-            nis = self.nextItemSet(itemset, symbol)
-            if nis:
-               goto_table[(itemset, symbol)] = nis
-               has_goto = True
-               if not nis in states:
-                  worklist.append(nis)
-                  states.add(nis)
-         if has_goto:
-            action_table[itemset] = 'shift', 's'
-            for item in itemset:
-               assert item.IsShift, item
-         else:
-            # Check for reduce-reduce conflicts:
-            assert len(itemset) == 1
-            item = list(itemset)[0]
-            assert item.IsReduce
-            action_table[itemset] = 'reduce', item.production
-      for s, i in zip(states, range(len(states))):
-         print(i, ' || '.join(str(x) for x in s))
-      p = LRParser()
-      p.goto_table = goto_table
-      p.action_table = action_table
-      p.s0 = iis
-      return p
+                for add_p in self.grammar.productionsForName(item.Next):
+                    f = self.first[item.NextNext]
+                    if 'EPS' in f:
+                        f.discard('EPS')
+                        f.add(item.look_ahead)
+                    for b in f:
+                        i = Item(add_p, 0, b)
+                        if not i in itemset:
+                            itemset.add(i)
+                            worklist.append(i)
+        return frozenset(itemset)
+
+    def initialItemSet(self):
+        """ Calculates the initial item set """
+        iis = set()
+        for p in self.grammar.productionsForName(self.grammar.start_symbol):
+            iis.add(Item(p, 0, 'EOF'))
+        return self.closure(iis)
+
+    def nextItemSet(self, itemset, symbol):
+        """ Determines the next itemset for the current set and a symbol """
+        nis = set()
+        for item in itemset:
+            if item.IsShift and item.Next == symbol:
+                nis.add(Item(item.production, item.dotpos + 1, item.look_ahead))
+        return self.closure(nis)
+
+    def genParser(self):
+        states = set()
+        goto_table = {}
+        action_table = {}
+        self.first = self.grammar.calcFirstSets()
+        iis = self.initialItemSet()
+        states.add(iis)
+
+        # First generate all item sets by using the nextItemset function:
+        worklist = [iis]
+        while len(worklist) > 0:
+            itemset = worklist.pop(0)
+            for symbol in self.grammar.Symbols:
+                nis = self.nextItemSet(itemset, symbol)
+                if nis:
+                    goto_table[(itemset, symbol)] = nis
+                    if not nis in states:
+                        worklist.append(nis)
+                        states.add(nis)
+
+        # Number the states:
+        print('States:')
+        for s, i in zip(states, range(len(states))):
+            print(i, ', '.join(str(x) for x in s))
+        p = LRParser()
+        p.goto_table = goto_table
+        p.action_table = action_table
+        p.s0 = iis
+        return p
+
 
 class LRParser:
-   def parse(self, toks):
-      stack = [self.s0]
-      while stack[-1] != 'input':
-         action, p = self.action_table[stack[-1]]
-         print(action, p)
-         if action == 'shift':
-            stack.append(toks.pop(0))
-            s, i = stack[-2:]
-            stack.append(self.goto_table[(s, i)])
-         elif action == 'reduce':
-            for s in p.symbols:
-               stack.pop()
-               stack.pop()
-            stack.append(p.name)
-            s, i = stack[-2:]
-            if stack[-1] == 'input':
-               break
-            stack.append(self.goto_table[(s, i)])
+    def parse(self, toks):
+        stack = ['EOF', self.s0]
+        look_ahead = toks.pop(0)
+        while True:
+            state = stack[-1]   # top of stack
+            key = (state, look_ahead)
+            if not key in self.action_table:
+                raise Exception('Error parsing')
+            action, param = self.action_table[(state, look_ahead)]
+            if action == 'reduce':
+                for s in param.symbols:
+                    stack.pop()
+                    stack.pop()
+                state = stack[-1]
+                stack.append(param.name)
+                stack.append(self.goto_table[(state, param.name)])
+            elif action == 'shift':
+                stack.append(look_ahead)
+                s, i = stack[-2:]
+                stack.append(self.goto_table[(s, i)])
+                look_ahead = toks.pop(0)
+            elif action == 'accept':
+                break
 
 # Test it:
-# 1. define a simple grammar:
-g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
-g.add_production('input', ['expression', 'EOF'])
-g.add_production('expression', ['term'])
-g.add_production('expression', ['expression', '+', 'term'])
-g.add_production('term', ['factor'])
-#g.add_production('term', ['term', '*', 'factor'])
-g.add_production('term', ['(', 'expression', ')'])
-g.add_production('factor', ['identifier'])
-g.start_symbol = 'input'
-# 2. define input:
-tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
-# 3. build parser:
-p = LRgenerator(g).genParser()
-# 4. feed input:
-p.parse(tokens)
+def test():
+    # 1. define a simple grammar:
+    g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
+    g.add_production('input', ['expression'])
+    g.add_production('expression', ['term'])
+    g.add_production('expression', ['expression', '+', 'term'])
+    g.add_production('term', ['factor'])
+    g.add_production('term', ['term', '*', 'factor'])
+    g.add_production('factor', ['(', 'expression', ')'])
+    g.add_production('factor', ['identifier'])
+    g.start_symbol = 'input'
+    # 2. define input:
+    tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
+    # 3. build parser:
+    p = LRgenerator(g).genParser()
+    # 4. feed input:
+    p.parse(tokens)
 
+print('Hallo')
+if __name__ == '__main__':
+    print('Hallo')
+    test()
+