view python/pyyacc.py @ 306:b145f8e6050b

Start on c3 rewrite
author Windel Bouwman
date Mon, 09 Dec 2013 19:00:21 +0100
parents 6259856841a0
children e84047f29c78
line wrap: on
line source

"""
  Parser generator script
"""

import hashlib
from ppci import Token

EPS = 'EPS'
EOF = 'EOF'
SHIFT = 1
REDUCE = 2
ACCEPT = 3

class ParserGenerationException(Exception):
    """ Raised when something goes wrong during parser generation """
    pass


class ParserException(Exception):
    """ Raised during a failure in the parsing process """
    pass


class Grammar:
    """ Defines a grammar of a language """
    def __init__(self, terminals):
        self.terminals = terminals
        self.nonterminals = []
        self.productions = []
        self._first = None  # Cached first set

    def add_production(self, name, symbols, f=None):
        """ Add a production rule to the grammar """
        production = Production(name, symbols, f)
        self.productions.append(production)
        if name in self.terminals:
            raise ParserGenerationException("Cannot redefine terminal {0}".format(name))
        if not name in self.nonterminals:
            self.nonterminals.append(name)
        self._first = None  # Invalidate cached version

    def productionsForName(self, name):
        """ Retrieve all productions for a non terminal """
        return [p for p in self.productions if p.name == name]

    @property
    def Symbols(self):
        """ Get all the symbols defined by this grammar """
        return self.nonterminals + self.terminals

    @property
    def first(self):
        """ 
          The first set is a mapping from a grammar symbol to a set of
          set of all terminal symbols that can be the first terminal when
          looking for the grammar symbol 
        """
        if not self._first:
            self._first = self.calcFirstSets()
        return self._first

    def calcFirstSets(self):
        """
            Calculate first sets for each grammar symbol
            This is a dictionary which maps each grammar symbol
            to a set of terminals that can be encountered first
            when looking for the symbol.
        """
        first = {}
        for t in self.terminals + [EOF, EPS]:
            first[t] = set([t])
        for nt in self.nonterminals:
            first[nt] = set()
        epsset = {EPS}
        while True:
            some_change = False
            for p in self.productions:
                rhs = set()
                for beta in p.symbols:
                    rhs = rhs | (first[beta] - epsset)
                    if not EPS in first[beta]:
                        break
                else:
                    if EPS in first[beta]:
                        rhs.add(EPS)
                if rhs - first[p.name]:
                    first[p.name] |= rhs
                    some_change = True
            if not some_change:
                break
        return first

    def closure(self, itemset):
        """ Expand itemset by using epsilon moves """
        worklist = list(itemset)
        def addIt(itm):
            if not itm in itemset:
                itemset.add(itm)
                worklist.append(itm)
        def first2(itm):
            # When using the first sets, create a copy:
            f = set(self.first[itm.NextNext])
            if EPS in f:
                f.discard(EPS)
                f.add(itm.look_ahead)
            return f
        # Start of algorithm: 
        while worklist:
            item = worklist.pop(0)
            if not item.IsShift:
                continue
            if not (item.Next in self.nonterminals):
                continue
            C = item.Next
            for add_p in self.productionsForName(C):
                for b in first2(item):
                    addIt(Item(add_p, 0, b))
        return frozenset(itemset)

    def initialItemSet(self):
        """ Calculates the initial item set """
        iis = set()
        for p in self.productionsForName(self.start_symbol):
            iis.add(Item(p, 0, EOF))
        return self.closure(iis)

    def nextItemSet(self, itemset, symbol):
        """ 
            Determines the next itemset for the current set and a symbol 
            This is the goto procedure
        """
        next_set = set()
        for item in itemset:
            if item.can_shift_over(symbol):
                next_set.add(item.shifted())
        return self.closure(next_set)
    
    def genCanonicalSet(self, iis):
        states = []
        worklist = []
        transitions = {}
        def addSt(s):
            if not (s in states):
                worklist.append(s)
                states.append(s)
        addSt(iis)
        while len(worklist) > 0:
            itemset = worklist.pop(0)
            for symbol in self.Symbols:
                nis = self.nextItemSet(itemset, symbol)
                if not nis:
                    continue
                addSt(nis)
                transitions[(states.index(itemset), symbol)] = states.index(nis)
        return states, transitions
    
    def checkSymbols(self):
        """ Checks no symbols are undefined """
        for production in self.productions:
            for symbol in production.symbols:
                if symbol not in self.Symbols + [EPS]:
                    raise ParserGenerationException('Symbol {0} undefined'.format(symbol))

    def getSignature(self):
        m = hashlib.md5()
        m.update((str(self.productions) + str(self.start_symbol)).encode('ascii'))
        signature = m.hexdigest()
        
    def genParser(self):
        """ Generates a parser from the grammar (using a caching algorithm) """
        signature = self.getSignature() 
        action_table, goto_table = self.doGenerate()
        p = LRParser(action_table, goto_table, self.start_symbol)
        p.grammar = self
        return p

    def doGenerate(self):
        self.checkSymbols()
        action_table = {}
        goto_table = {}
        iis = self.initialItemSet()

        # First generate all item sets by using the nextItemset function:
        states, transitions = self.genCanonicalSet(iis)

        def action_str(act):
            a, p = act
            if a is SHIFT:
                return 'Shift {0}'.format(0)
            elif a is REDUCE:
                return 'Reduce {0}'.format(p)
            return 'Other'
        
        def setAction(state, t, action):
            key = (state, t)
            assert type(state) is int
            assert type(t) is str
            if key in action_table:
                action2 = action_table[key]
                if action != action2:
                    if (action2[0] == REDUCE) and (action[0] == SHIFT):
                        # Automatically resolve and do the shift action!
                        # Simple, but almost always what you want!!
                        action_table[key] = action
                    else:
                        if (action2[0] == SHIFT) and (action[0] == REDUCE):
                            pass
                        else:
                            a1 = action_str(action)
                            a2 = action_str(action2)
                            raise ParserGenerationException('LR construction conflict {0} vs {1}'.format(a1, a2))
            else:
                action_table[key] = action

        # Fill action table:
        for state in states:
            # Detect conflicts:
            for item in state:
                if item.IsShift and item.Next in self.terminals:
                    # Rule 1, a shift item:
                    nextstate = transitions[(states.index(state), item.Next)]
                    setAction(states.index(state), item.Next, (SHIFT, nextstate))
                if item.IsReduce:
                    if item.production.name == self.start_symbol and item.look_ahead == EOF:
                        # Rule 3: accept:
                        setAction(states.index(state), item.look_ahead, (ACCEPT, self.productions.index(item.production)))
                    else:
                        # Rule 2, reduce item:
                        setAction(states.index(state), item.look_ahead, (REDUCE, self.productions.index(item.production)))
            for nt in self.nonterminals:
                key = (states.index(state), nt)
                if key in transitions:
                    goto_table[key] = transitions[key]
        return action_table, goto_table


class Production:
    """ Production rule for a grammar """
    def __init__(self, name, symbols, f=None):
        self.name = name
        self.symbols = symbols
        self.f = f

    def __repr__(self):
        return '{0} -> {1}'.format(self.name, self.symbols)


class Item:
    """ 
        Represents a partially parsed item 
        It has a production it is looking for, a position
        in this production called the 'dot' and a look ahead
        symbol that must follow this item.
    """
    def __init__(self, production, dotpos, look_ahead):
        self.production = production
        self.dotpos = dotpos
        assert self.dotpos <= len(self.production.symbols)
        self.look_ahead = look_ahead

    def getdata(self):
        """ Gets the members as a tuple """
        return (self.production, self.dotpos, self.look_ahead)

    def __eq__(self, other):
        if type(other) is type(self):
            return self.getdata() == other.getdata()
        return False

    def __hash__(self):
        return self.getdata().__hash__()

    @property
    def IsReduce(self):
        """ Check if this item has the dot at the end """
        return self.dotpos == len(self.production.symbols)

    @property
    def IsShift(self):
        """ Check if this item is a shift item, i.e. the dot can proceed """
        return not self.IsReduce

    @property
    def Next(self):
        """ Returns the symbol after the dot """
        return self.production.symbols[self.dotpos]

    def can_shift_over(self, symbol):
        """ Determines if this item can shift over the given symbol """
        return self.IsShift and self.Next == symbol

    def shifted(self):
        """ Creates a new item that is shifted one position """
        return Item(self.production, self.dotpos + 1, self.look_ahead)

    @property
    def NextNext(self):
        """ Gets the symbol after the next symbol, or EPS if at the end """
        if self.dotpos + 1 >= len(self.production.symbols):
            return EPS
        else:
            return self.production.symbols[self.dotpos + 1]

    def __repr__(self):
        prod = self.production
        predot = ' '.join(prod.symbols[0:self.dotpos])
        postdot = ' '.join(prod.symbols[self.dotpos:])
        name = prod.name
        args = (name, predot, postdot, self.look_ahead)
        return '[{0} -> {1} . {2} -> {3}]'.format(*args)


class LRParser:
    """ LR parser """
    def __init__(self, action_table, goto_table, start_symbol):
        self.action_table = action_table
        self.goto_table = goto_table
        self.start_symbol = start_symbol

    def parse(self, toks):
        """ Parse an iterable with tokens """
        assert hasattr(toks, '__iter__'), '{0} not iter type'.format(type(toks))
        stack = [0]
        r_data_stack = []
        try:
            look_ahead = toks.__next__()
        except StopIteration:
            look_ahead = Token(EOF, EOF)
        assert type(look_ahead) is Token
        # TODO: exit on this condition:
        while stack != [0, self.start_symbol, 2222]:
            #print(stack)
            state = stack[-1]   # top of stack
            key = (state, look_ahead.typ)
            if not key in self.action_table:
                raise ParserException('Error parsing at character {0}'.format(look_ahead))
            action, param = self.action_table[key]
            if action == REDUCE:
                f_args = []
                param = self.grammar.productions[param]
                for s in param.symbols:
                    stack.pop()
                    stack.pop()
                    f_args.append(r_data_stack.pop())
                f_args.reverse()
                r_data = None
                if param.f:
                    r_data = param.f(*f_args)
                state = stack[-1]
                stack.append(param.name)
                stack.append(self.goto_table[(state, param.name)])
                r_data_stack.append(r_data)
            elif action == SHIFT:
                stack.append(look_ahead.typ)
                stack.append(param)
                r_data_stack.append(look_ahead.val)
                try:
                    look_ahead = toks.__next__()
                except StopIteration:
                    look_ahead = Token(EOF, EOF)
                assert type(look_ahead) is Token
            elif action == ACCEPT:
                # Pop last rule data off the stack:
                f_args = []
                param = self.grammar.productions[param]
                for s in param.symbols:
                    stack.pop()
                    stack.pop()
                    f_args.append(r_data_stack.pop())
                f_args.reverse()
                if param.f:
                    param.f(*f_args)
                # Break out!
                break
        # At exit, the stack must be 1 long
        # TODO: fix that this holds:
        #assert len(stack) == 1, 'stack {0} not totally reduce'.format(stack)