view python/pyyacc.py @ 181:216da5e46efc

Changed indent to 4 spaces to comply to convention
author Windel Bouwman
date Sat, 18 May 2013 17:45:26 +0200
parents 25a0753da4cf
children fe2b72381a83
line wrap: on
line source

"""
  Parser generator script
"""

EPS = 'EPS'


class Grammar:
    """ Defines a grammar of a language """
    def __init__(self, terminals):
        self.terminals = terminals
        self.nonterminals = []
        self.productions = []

    def add_production(self, name, symbols):
        """ Add a production rule to the grammar """
        production = Production(name, symbols)
        self.productions.append(production)
        assert not name in self.terminals, "Cannot redefine terminal"
        if not name in self.nonterminals:
            self.nonterminals.append(name)

    def productionsForName(self, name):
        """ Retrieve all productions for a non terminal """
        return [p for p in self.productions if p.name == name]

    @property
    def Symbols(self):
        """ Get all the symbols defined by this grammar """
        return self.nonterminals + self.terminals

    def calcFirstSets(self):
        """ Calculate first sets for each grammar symbol """
        first = {}
        for t in self.terminals + ['EOF', EPS]:
            first[t] = set([t])
        for nt in self.nonterminals:
            first[nt] = set()
        epsset = set([EPS])
        while True:
            some_change = False
            for p in self.productions:
                rhs = set()
                for beta in p.symbols:
                    rhs = rhs | (first[beta] - epsset)
                    if not EPS in first[beta]:
                        break
                else:
                    if EPS in first[beta]:
                        rhs.add(EPS)
                if rhs - first[p.name]:
                    first[p.name] |= rhs
                    some_change = True
            if not some_change:
                break
        return first


class Production:
    """ Production rule for a grammar """
    def __init__(self, name, symbols):
        self.name = name
        self.symbols = symbols

    def __repr__(self):
        return '{0} -> {1}'.format(self.name, self.symbols)


class Item:
    def __init__(self, production, dotpos, look_ahead):
        self.production = production
        self.dotpos = dotpos
        self.look_ahead = look_ahead

    def getdata(self):
        """ Gets the members as a tuple """
        return (self.production, self.dotpos, self.look_ahead)

    def __eq__(self, other):
        if type(other) is type(self):
            return self.getdata() == other.getdata()
        return False

    def __hash__(self):
        return self.getdata().__hash__()

    @property
    def IsReduce(self):
        return self.dotpos == len(self.production.symbols)

    @property
    def IsShift(self):
        return not self.IsReduce

    @property
    def Next(self):
        return self.production.symbols[self.dotpos]

    @property
    def NextNext(self):
        if self.dotpos + 1 >= len(self.production.symbols):
            return 'EPS'
        else:
            return self.production.symbols[self.dotpos + 1]

    def __repr__(self):
        prod = self.production
        predot = ' '.join(prod.symbols[0:self.dotpos])
        postdot = ' '.join(prod.symbols[self.dotpos:])
        nt = prod.name
        args = (nt, predot, postdot, self.look_ahead)
        return '[{0} -> {1} _dot_ {2},{3}]'.format(*args)


class LRgenerator:
    """ Parser generator """
    def __init__(self, g):
        self.grammar = g

    def closure(self, itemset):
        """ Expand itemset by using epsilon moves """
        worklist = list(itemset)
        while worklist:
            item = worklist.pop(0)
            if item.IsShift and item.Next in self.grammar.nonterminals:
                for add_p in self.grammar.productionsForName(item.Next):
                    f = self.first[item.NextNext]
                    if 'EPS' in f:
                        f.discard('EPS')
                        f.add(item.look_ahead)
                    for b in f:
                        i = Item(add_p, 0, b)
                        if not i in itemset:
                            itemset.add(i)
                            worklist.append(i)
        return frozenset(itemset)

    def initialItemSet(self):
        """ Calculates the initial item set """
        iis = set()
        for p in self.grammar.productionsForName(self.grammar.start_symbol):
            iis.add(Item(p, 0, 'EOF'))
        return self.closure(iis)

    def nextItemSet(self, itemset, symbol):
        """ Determines the next itemset for the current set and a symbol """
        nis = set()
        for item in itemset:
            if item.IsShift and item.Next == symbol:
                nis.add(Item(item.production, item.dotpos + 1, item.look_ahead))
        return self.closure(nis)

    def genParser(self):
        states = set()
        goto_table = {}
        action_table = {}
        self.first = self.grammar.calcFirstSets()
        iis = self.initialItemSet()
        states.add(iis)

        # First generate all item sets by using the nextItemset function:
        worklist = [iis]
        while len(worklist) > 0:
            itemset = worklist.pop(0)
            for symbol in self.grammar.Symbols:
                nis = self.nextItemSet(itemset, symbol)
                if nis:
                    goto_table[(itemset, symbol)] = nis
                    if not nis in states:
                        worklist.append(nis)
                        states.add(nis)

        # Number the states:
        print('States:')
        for s, i in zip(states, range(len(states))):
            print(i, ', '.join(str(x) for x in s))
        p = LRParser()
        p.goto_table = goto_table
        p.action_table = action_table
        p.s0 = iis
        return p


class LRParser:
    def parse(self, toks):
        stack = ['EOF', self.s0]
        look_ahead = toks.pop(0)
        while True:
            state = stack[-1]   # top of stack
            key = (state, look_ahead)
            if not key in self.action_table:
                raise Exception('Error parsing')
            action, param = self.action_table[(state, look_ahead)]
            if action == 'reduce':
                for s in param.symbols:
                    stack.pop()
                    stack.pop()
                state = stack[-1]
                stack.append(param.name)
                stack.append(self.goto_table[(state, param.name)])
            elif action == 'shift':
                stack.append(look_ahead)
                s, i = stack[-2:]
                stack.append(self.goto_table[(s, i)])
                look_ahead = toks.pop(0)
            elif action == 'accept':
                break

# Test it:
def test():
    # 1. define a simple grammar:
    g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
    g.add_production('input', ['expression'])
    g.add_production('expression', ['term'])
    g.add_production('expression', ['expression', '+', 'term'])
    g.add_production('term', ['factor'])
    g.add_production('term', ['term', '*', 'factor'])
    g.add_production('factor', ['(', 'expression', ')'])
    g.add_production('factor', ['identifier'])
    g.start_symbol = 'input'
    # 2. define input:
    tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
    # 3. build parser:
    p = LRgenerator(g).genParser()
    # 4. feed input:
    p.parse(tokens)

print('Hallo')
if __name__ == '__main__':
    print('Hallo')
    test()