view python/pyyacc.py @ 192:6cd6260789a1

Added more tests for parser generator
author Windel Bouwman
date Sun, 26 May 2013 23:19:27 +0200
parents 6b2bec5653f1
children b01429a5d695
line wrap: on
line source

"""
  Parser generator script
"""

from ppci import Token

EPS = 'EPS'
EOF = 'EOF'
SHIFT = 1
REDUCE = 2
ACCEPT = 3

class ParserGenerationException(Exception):
    """ Raised when something goes wrong during parser generation """
    pass


class ParserException(Exception):
    """ Raised during a failure in the parsing process """
    pass


class Grammar:
    """ Defines a grammar of a language """
    def __init__(self, terminals):
        self.terminals = terminals
        self.nonterminals = []
        self.productions = []
        self._first = None  # Cached first set

    def add_production(self, name, symbols):
        """ Add a production rule to the grammar """
        production = Production(name, symbols)
        self.productions.append(production)
        if name in self.terminals:
            raise ParserGenerationException("Cannot redefine terminal {0}".format(name))
        if not name in self.nonterminals:
            self.nonterminals.append(name)

    def productionsForName(self, name):
        """ Retrieve all productions for a non terminal """
        return [p for p in self.productions if p.name == name]

    @property
    def Symbols(self):
        """ Get all the symbols defined by this grammar """
        return self.nonterminals + self.terminals

    @property
    def first(self):
        """ 
          The first set is a mapping from a grammar symbol to a set of
          set of all terminal symbols that can be the first terminal when
          looking for the grammar symbol 
        """
        if not self._first:
            self._first = self.calcFirstSets()
        return self._first

    def calcFirstSets(self):
        """
            Calculate first sets for each grammar symbol
            This is a dictionary which maps each grammar symbol
            to a set of terminals that can be encountered first
            when looking for the symbol.
        """
        first = {}
        for t in self.terminals + [EOF, EPS]:
            first[t] = set([t])
        for nt in self.nonterminals:
            first[nt] = set()
        epsset = {EPS}
        while True:
            some_change = False
            for p in self.productions:
                rhs = set()
                for beta in p.symbols:
                    rhs = rhs | (first[beta] - epsset)
                    if not EPS in first[beta]:
                        break
                else:
                    if EPS in first[beta]:
                        rhs.add(EPS)
                if rhs - first[p.name]:
                    first[p.name] |= rhs
                    some_change = True
            if not some_change:
                break
        return first

    def closure(self, itemset):
        """ Expand itemset by using epsilon moves """
        worklist = list(itemset)
        def addIt(itm):
            if not itm in itemset:
                itemset.add(itm)
                worklist.append(itm)
        def first2(itm):
            # When using the first sets, create a copy:
            f = set(self.first[itm.NextNext])
            if EPS in f:
                f.discard(EPS)
                f.add(itm.look_ahead)
            return f
        # Start of algorithm: 
        while worklist:
            item = worklist.pop(0)
            if not item.IsShift:
                continue
            if not (item.Next in self.nonterminals):
                continue
            C = item.Next
            for add_p in self.productionsForName(C):
                for b in first2(item):
                    addIt(Item(add_p, 0, b))
        return frozenset(itemset)

    def initialItemSet(self):
        """ Calculates the initial item set """
        iis = set()
        for p in self.productionsForName(self.start_symbol):
            iis.add(Item(p, 0, EOF))
        return self.closure(iis)

    def nextItemSet(self, itemset, symbol):
        """ 
            Determines the next itemset for the current set and a symbol 
            This is the goto procedure
        """
        next_set = set()
        for item in itemset:
            if item.can_shift_over(symbol):
                next_set.add(item.shifted())
        return self.closure(next_set)
    
    def genCanonicalSet(self, iis):
        states = []
        worklist = []
        transitions = {}
        def addSt(s):
            if not (s in states):
                worklist.append(s)
                states.append(s)
        addSt(iis)
        while len(worklist) > 0:
            itemset = worklist.pop(0)
            for symbol in self.Symbols:
                nis = self.nextItemSet(itemset, symbol)
                if not nis:
                    continue
                addSt(nis)
                transitions[(states.index(itemset), symbol)] = states.index(nis)
        return states, transitions
    
    def checkSymbols(self):
        """ Checks no symbols are undefined """
        for production in self.productions:
            for symbol in production.symbols:
                if symbol not in self.Symbols:
                    raise ParserGenerationException('Symbol {0} undefined'.format(symbol))


    def genParser(self):
        """ Generates a parser from the grammar """
        self.checkSymbols()
        action_table = {}
        goto_table = {}
        iis = self.initialItemSet()

        # First generate all item sets by using the nextItemset function:
        states, transitions = self.genCanonicalSet(iis)
        
        def setAction(state, t, action):
            key = (state, t)
            if key in action_table:
                action2 = action_table[key]
                if action != action2:
                    raise ParserGenerationException('LR construction conflict')
            else:
                action_table[key] = action

        # Fill action table:
        for state in states:
            # Detect conflicts:
            for item in state:
                if item.IsShift and item.Next in self.terminals:
                    # Rule 1, a shift item:
                    nextstate = transitions[(states.index(state), item.Next)]
                    setAction(states.index(state), item.Next, (SHIFT, nextstate))
                if item.IsReduce:
                    if item.production.name == self.start_symbol and item.look_ahead == EOF:
                        # Rule 3: accept:
                        setAction(states.index(state), item.look_ahead, (ACCEPT, None))
                    else:
                        # Rule 2, reduce item:
                        setAction(states.index(state), item.look_ahead, (REDUCE, item.production))
            for nt in self.nonterminals:
                key = (states.index(state), nt)
                if key in transitions:
                    goto_table[key] = transitions[key]

        return LRParser(action_table, goto_table)


class Production:
    """ Production rule for a grammar """
    def __init__(self, name, symbols):
        self.name = name
        self.symbols = symbols

    def __repr__(self):
        return '{0} -> {1}'.format(self.name, self.symbols)


class Item:
    """ 
        Represents a partially parsed item 
        It has a production it is looking for, a position
        in this production called the 'dot' and a look ahead
        symbol that must follow this item.
    """
    def __init__(self, production, dotpos, look_ahead):
        self.production = production
        self.dotpos = dotpos
        assert self.dotpos <= len(self.production.symbols)
        self.look_ahead = look_ahead

    def getdata(self):
        """ Gets the members as a tuple """
        return (self.production, self.dotpos, self.look_ahead)

    def __eq__(self, other):
        if type(other) is type(self):
            return self.getdata() == other.getdata()
        return False

    def __hash__(self):
        return self.getdata().__hash__()

    @property
    def IsReduce(self):
        """ Check if this item has the dot at the end """
        return self.dotpos == len(self.production.symbols)

    @property
    def IsShift(self):
        """ Check if this item is a shift item, i.e. the dot can proceed """
        return not self.IsReduce

    @property
    def Next(self):
        """ Returns the symbol after the dot """
        return self.production.symbols[self.dotpos]

    def can_shift_over(self, symbol):
        """ Determines if this item can shift over the given symbol """
        return self.IsShift and self.Next == symbol

    def shifted(self):
        """ Creates a new item that is shifted one position """
        return Item(self.production, self.dotpos + 1, self.look_ahead)

    @property
    def NextNext(self):
        """ Gets the symbol after the next symbol, or EPS if at the end """
        if self.dotpos + 1 >= len(self.production.symbols):
            return EPS
        else:
            return self.production.symbols[self.dotpos + 1]

    def __repr__(self):
        prod = self.production
        predot = ' '.join(prod.symbols[0:self.dotpos])
        postdot = ' '.join(prod.symbols[self.dotpos:])
        name = prod.name
        args = (name, predot, postdot, self.look_ahead)
        return '[{0} -> {1} . {2} -> {3}]'.format(*args)


class LRParser:
    """ LR parser """
    def __init__(self, action_table, goto_table):
        self.action_table = action_table
        self.goto_table = goto_table

    def parse(self, toks):
        """ Parse an iterable with tokens """
        assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks))
        stack = [0]
        look_ahead = toks.__next__()
        assert type(look_ahead) is Token
        while True:
            state = stack[-1]   # top of stack
            key = (state, look_ahead.typ)
            if not key in self.action_table:
                raise Exception('Error parsing at character {0}'.format(look_ahead))
            action, param = self.action_table[key]
            if action == REDUCE:
                for s in param.symbols:
                    stack.pop()
                    stack.pop()
                state = stack[-1]
                stack.append(param.name)
                stack.append(self.goto_table[(state, param.name)])
            elif action == SHIFT:
                stack.append(look_ahead.typ)
                stack.append(param)
                try:
                    look_ahead = toks.__next__()
                except StopIteration:
                    look_ahead = Token(EOF, EOF, 0)
                assert type(look_ahead) is Token
            elif action == ACCEPT:
                break


def testSimpleGrammar():
    # 1. define a simple grammar:
    g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
    g.add_production('input', ['expression'])
    g.add_production('expression', ['term'])
    g.add_production('expression', ['expression', '+', 'term'])
    g.add_production('term', ['factor'])
    g.add_production('term', ['term', '*', 'factor'])
    g.add_production('factor', ['(', 'expression', ')'])
    g.add_production('factor', ['identifier'])
    g.start_symbol = 'input'
    # 2. define input:
    tokens = ['identifier', '+', 'identifier', '+', 'identifier']
    # 3. build parser:
    p = g.genParser()
    # 4. feed input:
    def genTokens(lst):
        for t in lst:
            yield Token(t, t, 0)
    p.parse(genTokens(tokens))


if __name__ == '__main__':
    testSimpleGrammar()