#!/usr/bin/python

""" Parser generator utility """

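# Example of the input format this tool accepts (inferred from the lexer and
# parser below; the token names, rule, and action are illustrative only):
#
#   %tokens NUM '+'
#   %%
#   expr : expr '+' expr { return $1 + $3 }
#        | NUM { return $1 }
#        ;
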
import argparse
import re
import sys
import datetime
from pyyacc import Grammar, print_grammar


class XaccLexer:
    def __init__(self):
        self.tokens = None
        self.token = None

    def feed(self, txt):
        """ Tokenize the given grammar source text """
        # Create a regular expression for the lexing part:
        tok_spec = [
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('STRING', r"'[^']*'"),
            ('BRACEDCODE', r"\{[^\}]*\}"),
            ('OTHER', r'[:;\|]'),
            ('SKIP', r'[ \t]')  # skip spaces and tabs
        ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match
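        # gettok attempts to match a single token at a position in the line;
        # the name of the regex group that matched (mo.lastgroup) is the
        # token type.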

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Strip the quotes; quoted literals act as plain IDs.
                    typ = 'ID'
                    yield (typ, val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation (':', ';', '|') is typed by its own text.
                    typ = val
                    yield (typ, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            if len(line) != pos:
                raise ParseError('Lex fault at column {} in: {}'.format(pos, line))

        def tokenize():
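            # The input consists of yacc-style sections separated by '%%':
            # section 0 holds header lines and %tokens declarations,
            # section 1 holds the grammar rules.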
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        yield from tokenize_line(line[7:])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = next(self.tokens)

    def next_token(self):
        """ Return the current token and advance, unless at end of file """
        t = self.token
        if t[0] != 'eof':
            self.token = next(self.tokens)
        return t


class ParseError(Exception):
    pass


class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
        We could have used a generated parser here, but that would create a
        chicken-and-egg problem.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def peek(self):
        """ Sneak peek at the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.peek == typ:
            return self.next_token()
        else:
            raise ParseError('Expected {}, but got {}'.format(typ, self.peek))

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        if self.peek == typ:
            self.consume(typ)
            return True
        return False

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser """
        # parse header
        headers = []
        terminals = []
        while self.peek in ['HEADER', '%tokens']:
            if self.peek == '%tokens':
                self.consume('%tokens')
                while self.peek == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.grammar = Grammar(terminals)
        while self.peek != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        """ Parse a single grammar symbol (an identifier) """
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse one alternative: a sequence of symbols followed by an
            optional braced action """
        symbols = []
        while self.peek not in [';', 'BRACEDCODE', '|']:
            symbols.append(self.parse_symbol())
        if self.peek == 'BRACEDCODE':
            action = self.consume('BRACEDCODE')[1]
            action = action[1:-1].strip()
        else:
            action = None
        return symbols, action

    def parse_rule(self):
        """ Parse a rule of the form: name ':' rhs ('|' rhs)* ';' """
        p = self.parse_symbol()
        self.consume(':')
        symbols, action = self.parse_rhs()
        self.grammar.add_production(p, symbols, action)
        while self.has_consumed('|'):
            symbols, action = self.parse_rhs()
            self.grammar.add_production(p, symbols, action)
        self.consume(';')


class XaccGenerator:
    """ Generator that writes the generated parser to a file """
    def __init__(self):
        self.grammar = None
        self.action_table = None
        self.goto_table = None

    def generate(self, grammar, output_file):
        """ Compute the parser tables and write the parser module """
        print_grammar(grammar)
        self.grammar = grammar
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, f):
        """ Generate python script with the parser table """
        print('#!/usr/bin/python', file=f)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=f)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=f)
        print('from ppci import Token', file=f)
        print(file=f)
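        # The emitted Parser class has roughly this shape (an illustrative
        # sketch; names such as "expr" are placeholders):
        #
        #   class Parser(LRParser):
        #       def __init__(self):
        #           self.start_symbol = "expr"
        #           self.grammar = Grammar(['NUM', '+'])
        #           self.grammar.add_production("expr", [...], self.action_expr_0)
        #           self.action_table = {}
        #           self.action_table[0] = {...}
        #           self.goto_table = {}
        #           self.goto_table[0] = {...}
        #
        #       def action_expr_0(self, arg1, arg2, arg3):
        #           return arg1 + arg3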
        print('class Parser(LRParser):', file=f)
        print('    def __init__(self):', file=f)
        # Generate rules:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=f)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=f)
        for rule_number, rule in enumerate(self.grammar.productions):
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=f)
        # Fill action table:
        print('        self.action_table = {}', file=f)
        for state in self.action_table:
            action = self.action_table[state]
            print('        self.action_table[{}] = {}'.format(state, action), file=f)
        print('', file=f)

        # Fill goto table:
        print('        self.goto_table = {}', file=f)
        for gt in self.goto_table:
            to = self.goto_table[gt]
            print('        self.goto_table[{}] = {}'.format(gt, to), file=f)
        print('', file=f)

        # Generate a function for each action:
        for rule in self.grammar.productions:
            M = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(M))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=f)
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
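            # Substitute the $1..$M placeholders in the action text with the
            # generated argument names (plain textual replacement).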
            for n in range(M):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=f)
            print('', file=f)


def main():
    # Parse arguments:
    arg_parser = argparse.ArgumentParser(description='xacc compiler compiler')
    arg_parser.add_argument('source', type=argparse.FileType('r'),
                            help='the parser specification')
    arg_parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                            default=sys.stdout)
    args = arg_parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    generator.generate(grammar, args.output)
    args.output.close()


if __name__ == '__main__':
    main()
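
# Example invocation (file names are hypothetical):
#
#   $ python yacc.py expr.x -o expr_parser.py
#
# This reads the grammar specification from expr.x and writes the generated
# parser module to expr_parser.py.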