Mercurial > lcfOS

"""
  Parser generator script
"""

EPS = 'EPS'
EOF = 'EOF'
SHIFT = 1
REDUCE = 2
ACCEPT = 3


class Grammar:
    """ Defines a grammar of a language """
    def __init__(self, terminals):
        self.terminals = terminals
        self.nonterminals = []
        self.productions = []

    def add_production(self, name, symbols):
        """ Add a production rule to the grammar """
        production = Production(name, symbols)
        self.productions.append(production)
        assert not name in self.terminals, "Cannot redefine terminal"
        if not name in self.nonterminals:
            self.nonterminals.append(name)

    def productionsForName(self, name):
        """ Retrieve all productions for a non terminal """
        return [p for p in self.productions if p.name == name]

    @property
    def Symbols(self):
        """ Get all the symbols defined by this grammar """
        return self.nonterminals + self.terminals

    def calcFirstSets(self):
        """
            Calculate first sets for each grammar symbol
            This is a dictionary which maps each grammar symbol
            to a set of terminals that can be encountered first
            when looking for the symbol.
        """
        first = {}
        for t in self.terminals + [EOF, EPS]:
            first[t] = set([t])
        for nt in self.nonterminals:
            first[nt] = set()
        epsset = set([EPS])
        while True:
            some_change = False
            for p in self.productions:
                rhs = set()
                for beta in p.symbols:
                    rhs = rhs | (first[beta] - epsset)
                    if not EPS in first[beta]:
                        break
                else:
                    if EPS in first[beta]:
                        rhs.add(EPS)
                if rhs - first[p.name]:
                    first[p.name] |= rhs
                    some_change = True
            if not some_change:
                break
        return first

    def closure(self, itemset):
        """ Expand itemset by using epsilon moves """
        worklist = list(itemset)
        def addIt(itm):
            if not itm in itemset:
                itemset.add(itm)
                worklist.append(itm)
        def first2(itm, la):
            # When using the first sets, create a copy:
            f = set(self.first[item.NextNext])
            if EPS in f:
                f.discard(EPS)
                f.add(item.look_ahead)
            return f

        while worklist:
            item = worklist.pop(0)
            if not item.IsShift:
                continue
            if not (item.Next in self.nonterminals):
                continue
            C = item.Next
            for add_p in self.productionsForName(C):
                for b in first2(item, item.look_ahead):
                    addIt(Item(add_p, 0, b))
        return frozenset(itemset)

    def initialItemSet(self):
        """ Calculates the initial item set """
        iis = set()
        for p in self.productionsForName(self.start_symbol):
            iis.add(Item(p, 0, EOF))
        return self.closure(iis)

    def nextItemSet(self, itemset, symbol):
        """
            Determines the next itemset for the current set and a symbol
            This is the goto procedure
        """
        next_set = set()
        for item in itemset:
            if item.can_shift_over(symbol):
                next_set.add(item.shifted())
        return self.closure(next_set)

    def genCanonicalSet(self, iis):
        states = set()
        worklist = []
        goto_table = {}
        def addSt(s):
            if not (s in states):
                worklist.append(s)
                states.add(s)
        addSt(iis)
        while len(worklist) > 0:
            itemset = worklist.pop(0)
            for symbol in self.Symbols:
                nis = self.nextItemSet(itemset, symbol)
                if not nis:
                    continue
                goto_table[(itemset, symbol)] = nis
                addSt(nis)
        return states, goto_table

    def genParser(self):
        """ Generates a parser from the grammar """
        action_table = {}
        self.first = self.calcFirstSets()
        iis = self.initialItemSet()

        # First generate all item sets by using the nextItemset function:
        states, goto_table = self.genCanonicalSet(iis)

        # Number the states:
        number_states = {}
        for s, i in zip(states, range(len(states))):
            number_states[s] = i

        # Fill action table:
        for state in states:
            for item in state:
                if item.IsShift and item.Next in self.terminals:
                    action_table[(state, item.Next)] = (SHIFT, 0)
                elif item.IsReduce:
                    if item.look_ahead == EOF:
                        action_table[(state, item.look_ahead)] = (ACCEPT, 0)
                    else:
                        action_table[(state, item.look_ahead)] = (REDUCE, item.production)
                else:
                    pass

        p = LRParser(action_table)
        p.goto_table = goto_table
        p.s0 = iis
        return p

class Production:
    """ Production rule for a grammar """
    def __init__(self, name, symbols):
        self.name = name
        self.symbols = symbols

    def __repr__(self):
        return '{0} -> {1}'.format(self.name, self.symbols)


class Item:
    def __init__(self, production, dotpos, look_ahead):
        self.production = production
        self.dotpos = dotpos
        assert self.dotpos <= len(self.production.symbols)
        self.look_ahead = look_ahead

    def getdata(self):
        """ Gets the members as a tuple """
        return (self.production, self.dotpos, self.look_ahead)

    def __eq__(self, other):
        if type(other) is type(self):
            return self.getdata() == other.getdata()
        return False

    def __hash__(self):
        return self.getdata().__hash__()

    @property
    def IsReduce(self):
        return self.dotpos == len(self.production.symbols)

    @property
    def IsShift(self):
        return not self.IsReduce

    @property
    def Next(self):
        return self.production.symbols[self.dotpos]

    def can_shift_over(self, symbol):
        """ Determines if this item can shift over the given symbol """
        return self.IsShift and self.Next == symbol

    def shifted(self):
        """ Creates a new item that is shifted one position """
        return Item(self.production, self.dotpos + 1, self.look_ahead)

    @property
    def NextNext(self):
        if self.dotpos + 1 >= len(self.production.symbols):
            return EPS
        else:
            return self.production.symbols[self.dotpos + 1]

    def __repr__(self):
        prod = self.production
        predot = ' '.join(prod.symbols[0:self.dotpos])
        postdot = ' '.join(prod.symbols[self.dotpos:])
        nt = prod.name
        args = (nt, predot, postdot, self.look_ahead)
        return '[{0} -> {1} . {2}, {3}]'.format(*args)


class LRParser:
    """ LR parser """
    def __init__(self, action_table):
        self.action_table = action_table

    def parse(self, toks):
        stack = [EOF, self.s0]
        look_ahead = toks.pop(0)
        while True:
            state = stack[-1]   # top of stack
            key = (state, look_ahead)
            if not key in self.action_table:
                raise Exception('Error parsing')
            action, param = self.action_table[(state, look_ahead)]
            if action == REDUCE:
                for s in param.symbols:
                    stack.pop()
                    stack.pop()
                state = stack[-1]
                stack.append(param.name)
                stack.append(self.goto_table[(state, param.name)])
            elif action == SHIFT:
                stack.append(look_ahead)
                s, i = stack[-2:]
                stack.append(self.goto_table[(s, i)])
                look_ahead = toks.pop(0)
            elif action == ACCEPT:
                break

def testSimpleGrammar():
    # 1. define a simple grammar:
    g = Grammar(['EOF', 'identifier', '(', ')', '+', '*'])
    g.add_production('input', ['expression'])
    g.add_production('expression', ['term'])
    g.add_production('expression', ['expression', '+', 'term'])
    g.add_production('term', ['factor'])
    g.add_production('term', ['term', '*', 'factor'])
    g.add_production('factor', ['(', 'expression', ')'])
    g.add_production('factor', ['identifier'])
    g.start_symbol = 'input'
    # 2. define input:
    tokens = ['identifier', '+', 'identifier', '+', 'identifier', 'EOF']
    # 3. build parser:
    p = g.genParser()
    # 4. feed input:
    p.parse(tokens)


if __name__ == '__main__':
    testSimpleGrammar()
author	Windel Bouwman
date	Fri, 24 May 2013 16:13:23 +0200
parents	216da5e46efc
children	51a6440d6398