diff python/yacc.py @ 318:e84047f29c78

Add burg and yacc initial attempts
author Windel Bouwman
date Tue, 31 Dec 2013 12:38:15 +0100
parents
children 8d07a4254f04
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/yacc.py	Tue Dec 31 12:38:15 2013 +0100
@@ -0,0 +1,246 @@
+#!/usr/bin/python
+
+""" Parser generator utility """
+
+import argparse
+import re
+import sys
+import datetime
+from pyyacc import Grammar, print_grammar
+
+
class XaccLexer:
    """ Lexer for xacc grammar description files.

        Produces (type, value) tuples from the source text. The text has
        two sections separated by a '%%' line: a header section (free-form
        HEADER lines and '%tokens' declarations) followed by the rules
        section.
    """
    def __init__(self):
        pass

    def feed(self, txt):
        """ Tokenize txt and prime self.token with the first token """
        # Create a regular expression for the lexing part:
        tok_spec = [
           ('ID', r'[A-Za-z][A-Za-z\d_]*'),
           ('STRING', r"'[^']*'"),
           ('BRACEDCODE', r"\{[^\}]*\}"),
           ('OTHER', r'[:;\|]'),
           ('SKIP', r'[ \t]+')  # skip runs of blanks and tabs, not just single spaces
            ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Quoted literals are yielded as plain IDs without the quotes:
                    yield ('ID', val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation uses the character itself as the token type:
                    yield (val, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass  # whitespace is dropped
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            # Anything the regex could not consume is a lexing error:
            if len(line) != pos:
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            """ Generator over all tokens of the two-section source text """
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        yield from tokenize_line(line[len('%tokens'):])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = next(self.tokens)

    def next_token(self):
        """ Return the current token and advance; sticks at 'eof' """
        t = self.token
        if t[0] != 'eof':
            self.token = next(self.tokens)
        #print(t)
        return t
+
+
class ParseError(Exception):
    """ Raised when the grammar source cannot be lexed or parsed """
    pass
+
+
class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
        We could have made a generated parser, but that would yield a
        chicken and egg issue.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peak to the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak != typ:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))
        return self.next_token()

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        matched = self.Peak == typ
        if matched:
            self.consume(typ)
        return matched

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser """
        header_lines = []
        terminals = []
        # Header section: free-form HEADER lines and %tokens declarations
        while self.Peak in ('HEADER', '%tokens'):
            if self.has_consumed('%tokens'):
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                header_lines.append(self.consume('HEADER')[1])
        self.consume('%%')
        # Rules section runs until end of input:
        self.grammar = Grammar(terminals)
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        """ Parse a single grammar symbol (an identifier) """
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse one alternative: its symbols plus an optional action """
        rhs_symbols = []
        while self.Peak not in (';', 'BRACEDCODE', '|'):
            rhs_symbols.append(self.parse_symbol())
        action = None
        if self.Peak == 'BRACEDCODE':
            code = self.consume('BRACEDCODE')[1]
            # Strip the surrounding braces and whitespace from the action:
            action = code[1:-1].strip()
        return rhs_symbols, action

    def parse_rule(self):
        """ Parse one rule: nonterminal ':' alternatives ';' """
        lhs = self.parse_symbol()
        self.consume(':')
        while True:
            symbols, action = self.parse_rhs()
            self.grammar.add_production(lhs, symbols, action)
            if not self.has_consumed('|'):
                break
        self.consume(';')
+
+
class XaccGenerator:
    """ Generator that writes generated parser to file """
    def __init__(self):
        pass

    def generate(self, grammar, output_file):
        """ Compute the parse tables for grammar and emit a python module
            to output_file.
        """
        print_grammar(grammar)
        self.grammar = grammar
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, f):
        """ Generate python script with the parser table """
        print('#!/usr/bin/python', file=f)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=f)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=f)
        print('from ppci import Token', file=f)
        print(file=f)
        print('class Parser(LRParser):', file=f)
        print('    def __init__(self):', file=f)
        # Generate rules; each production gets a uniquely named action method:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=f)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=f)
        for rule_number, rule in enumerate(self.grammar.productions):
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=f)
        # Fill action table:
        print('        self.action_table = {}', file=f)
        for state, action in self.action_table.items():
            print('        self.action_table[{}] = {}'.format(state, action), file=f)
        print('', file=f)

        # Fill goto table:
        print('        self.goto_table = {}', file=f)
        for gt, to in self.goto_table.items():
            print('        self.goto_table[{}] = {}'.format(gt, to), file=f)
        print('', file=f)

        # Generate a function for each action:
        for rule in self.grammar.productions:
            num_args = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(num_args))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=f)
            # Missing or empty semantic code becomes a 'pass' body:
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            # Substitute the $1..$n placeholders with the argument names:
            for n in range(num_args):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=f)
            print('', file=f)
+
+
def main():
    """ Compile a parser specification file into a python parser module """
    # Parse arguments (named arg_parser so it does not shadow the
    # grammar parser constructed below):
    arg_parser = argparse.ArgumentParser(description='xacc compiler compiler')
    arg_parser.add_argument('source', type=argparse.FileType('r'),
                            help='the parser specification')
    arg_parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                            default=sys.stdout)
    args = arg_parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    generator.generate(grammar, args.output)
    args.output.close()


if __name__ == '__main__':
    main()