Mercurial > lcfOS

#!/usr/bin/python

"""
Parser generator utility. This script can generate a python script from a
grammar description.

Invoke the script on a grammar specification file:

.. code::

    $ ./yacc.py test.x -o test_parser.py

And use the generated parser by deriving a user class:


.. code::

    import test_parser
    class MyParser(test_parser.Parser):
        pass
    p = MyParser()
    p.parse()


Alternatively you can load the parser on the fly:

.. code::

    import yacc
    parser_mod = yacc.load_as_module('mygrammar.x')
    class MyParser(parser_mod.Parser):
        pass
    p = MyParser()
    p.parse()

"""

import argparse
import re
import sys
import datetime
import types
import io
import logging
from pyyacc import Grammar, print_grammar


class XaccLexer:
    def __init__(self):
        pass

    def feed(self, txt):
        # Create a regular expression for the lexing part:
        tok_spec = [
           ('ID', r'[A-Za-z][A-Za-z\d_]*'),
           ('STRING', r"'[^']*'"),
           ('BRACEDCODE', r"\{[^\}]*\}"),
           ('OTHER', r'[:;\|]'),
           ('SKIP', r'[ ]')
            ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    typ = 'ID'
                    yield (typ, val[1:-1])
                elif typ == 'OTHER':
                    typ = val
                    yield (typ, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            if len(line) != pos:
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    section += 1
                    yield('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield('%tokens', '%tokens')
                        yield from tokenize_line(line[7:])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = self.tokens.__next__()

    def next_token(self):
        t = self.token
        if t[0] != 'eof':
            self.token = self.tokens.__next__()
        return t


class ParseError(Exception):
    pass


class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
        We could have made an generated parser, but that would yield a chicken
        egg issue.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peak to the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak == typ:
            return self.next_token()
        else:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        if self.Peak == typ:
            self.consume(typ)
            return True
        return False

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser """
        # parse header
        headers = []
        terminals = []
        while self.Peak in ['HEADER', '%tokens']:
            if self.Peak == '%tokens':
                self.consume('%tokens')
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.headers = headers
        self.grammar = Grammar(terminals)
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse the right hand side of a rule definition """
        symbols = []
        while self.Peak not in [';', 'BRACEDCODE', '|']:
            symbols.append(self.parse_symbol())
        if self.Peak == 'BRACEDCODE':
            action = self.consume('BRACEDCODE')[1]
            action = action[1:-1].strip()
        else:
            action = None
        return symbols, action

    def parse_rule(self):
        """ Parse a rule definition """
        p = self.parse_symbol()
        self.consume(':')
        symbols, action = self.parse_rhs()
        self.grammar.add_production(p, symbols, action)
        while self.has_consumed('|'):
            symbols, action = self.parse_rhs()
            self.grammar.add_production(p, symbols, action)
        self.consume(';')


class XaccGenerator:
    """ Generator that writes generated parser to file """
    def __init__(self):
        self.logger = logging.getLogger('yacc')

    def generate(self, grammar, headers, output_file):
        self.output_file = output_file
        self.grammar = grammar
        self.headers = headers
        self.logger.info('Generating parser for grammar {}'.format(grammar))
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script()

    def print(self, *args):
        """ Print helper function that prints to output file """
        print(*args, file=self.output_file)

    def generate_python_script(self):
        """ Generate python script with the parser table """
        self.print('#!/usr/bin/python')
        stamp = datetime.datetime.now().ctime()
        self.print('""" Automatically generated by xacc on {} """'.format(stamp))
        self.print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar')
        self.print('from ppci import Token')
        self.print('')
        for h in self.headers:
            print(h, file=output_file)
        self.print('')
        self.print('class Parser(LRParser):')
        self.print('    def __init__(self):')
        # Generate rules:
        self.print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol))
        self.print('        self.grammar = Grammar({})'.format(self.grammar.terminals))
        for rule_number, rule in enumerate(self.grammar.productions):
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            self.print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name))
        # Fill action table:
        self.print('        self.action_table = {}')
        for state in self.action_table:
            action = self.action_table[state]
            self.print('        self.action_table[{}] = {}'.format(state, action))
        self.print('')

        # Fill goto table:
        self.print('        self.goto_table = {}')
        for gt in self.goto_table:
            to = self.goto_table[gt]
            self.print('        self.goto_table[{}] = {}'.format(gt, to))
        self.print('')

        # Generate a function for each action:
        for rule in self.grammar.productions:
            M = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(M))
            self.print('    def {}(self, {}):'.format(rule.f_name, args))
            if rule.f == None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            for n in range(M):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            self.print('        {}'.format(semantics))
            self.print('')


def make_argument_parser():
    # Parse arguments:
    parser = argparse.ArgumentParser(description='xacc compiler compiler')
    parser.add_argument('source', type=argparse.FileType('r'), \
      help='the parser specification')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), \
        default=sys.stdout)
    return parser


def load_as_module(filename):
    """ Load a parser spec file, generate LR tables and create module """
    ob = io.StringIO()
    args = argparse.Namespace(source=open(filename), output=ob)
    main(args)

    parser_mod = types.ModuleType('generated_parser')
    exec(ob.getvalue(), parser_mod.__dict__)
    return parser_mod

def main(args):
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    # TODO: optionize this: print_grammar(grammar)
    generator.generate(grammar, parser.headers, args.output)


if __name__ == '__main__':
    args = make_argument_parser().parse_args()
    main(args)
author	Windel Bouwman
date	Thu, 30 Jan 2014 19:03:24 +0100
parents	44f336460c2a
children	6f4753202b9a