Mercurial > lcfOS
diff python/yacc.py @ 318:e84047f29c78
Add burg and yacc initial attempts
author:   Windel Bouwman
date:     Tue, 31 Dec 2013 12:38:15 +0100
parents:  (none)
children: 8d07a4254f04
line wrap: on
line diff
#!/usr/bin/python

""" Parser generator utility.

Reads an xacc grammar specification (a header section and a rule section
separated by '%%' lines) and emits a Python module containing an LR parser
built from the pyyacc tables.
"""

import argparse
import re
import sys
import datetime
from pyyacc import Grammar, print_grammar


class XaccLexer:
    """ Lexer for xacc grammar specification files.

    feed() tokenizes the whole input eagerly into a generator; next_token()
    then hands out (type, value) pairs, sticking at the final 'eof' token.
    """

    def __init__(self):
        pass

    def feed(self, txt):
        """ Tokenize txt and prime self.token with the first token. """
        # Create a regular expression for the lexing part:
        tok_spec = [
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('STRING', r"'[^']*'"),
            # NOTE: this pattern cannot handle nested braces inside
            # semantic actions; the action must be brace-free inside.
            ('BRACEDCODE', r"\{[^\}]*\}"),
            ('OTHER', r'[:;\|]'),
            ('SKIP', r'[ ]')
        ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into (type, value) tokens.

            Raises ParseError when a character matches no token pattern.
            """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Quoted literals become IDs with the quotes stripped.
                    typ = 'ID'
                    yield (typ, val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation uses the character itself as token type.
                    typ = val
                    yield (typ, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            if len(line) != pos:
                # Some trailing characters could not be matched.
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            """ Generator over all tokens in the input; ends with 'eof'. """
            # Sections are separated by '%%' lines:
            # section 0 is the header, section 1 holds the grammar rules.
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        # Lex the terminal names after the directive
                        # (was a magic constant 7 == len('%tokens')):
                        yield from tokenize_line(line[len('%tokens'):])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = next(self.tokens)

    def next_token(self):
        """ Return the current token and advance; keeps returning 'eof'
        once the input is exhausted. """
        t = self.token
        if t[0] != 'eof':
            self.token = next(self.tokens)
        return t


class ParseError(Exception):
    """ Raised on lexical or syntactic errors in the grammar spec. """
    pass


class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.

    We could have made an generated parser, but that would yield a chicken
    egg issue.
    """

    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peak to the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak == typ:
            return self.next_token()
        else:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        if self.Peak == typ:
            self.consume(typ)
            return True
        return False

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser.

        Returns the constructed pyyacc Grammar object.
        """
        # parse header: collect plain header lines and %tokens terminals
        headers = []
        terminals = []
        while self.Peak in ['HEADER', '%tokens']:
            if self.Peak == '%tokens':
                self.consume('%tokens')
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.grammar = Grammar(terminals)
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        """ Parse a single grammar symbol (an identifier). """
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse one alternative of a rule.

        Returns (symbols, action) where action is the text between the
        braces of an optional BRACEDCODE token, or None.
        """
        symbols = []
        while self.Peak not in [';', 'BRACEDCODE', '|']:
            symbols.append(self.parse_symbol())
        if self.Peak == 'BRACEDCODE':
            action = self.consume('BRACEDCODE')[1]
            action = action[1:-1].strip()  # Strip the surrounding braces.
        else:
            action = None
        return symbols, action

    def parse_rule(self):
        """ Parse a rule of the form: name ':' rhs ('|' rhs)* ';'
        and add each alternative as a production to the grammar. """
        p = self.parse_symbol()
        self.consume(':')
        symbols, action = self.parse_rhs()
        self.grammar.add_production(p, symbols, action)
        while self.has_consumed('|'):
            symbols, action = self.parse_rhs()
            self.grammar.add_production(p, symbols, action)
        self.consume(';')


class XaccGenerator:
    """ Generator that writes generated parser to file """

    def __init__(self):
        pass

    def generate(self, grammar, output_file):
        """ Build the LR tables for grammar and write a parser module
        to the open file object output_file. """
        print_grammar(grammar)
        self.grammar = grammar
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, f):
        """ Generate python script with the parser table """
        print('#!/usr/bin/python', file=f)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=f)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=f)
        print('from ppci import Token', file=f)
        print(file=f)
        print('class Parser(LRParser):', file=f)
        print('    def __init__(self):', file=f)
        # Generate rules:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=f)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=f)
        for rule_number, rule in enumerate(self.grammar.productions):
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=f)
        # Fill action table:
        print('        self.action_table = {}', file=f)
        for state in self.action_table:
            action = self.action_table[state]
            print('        self.action_table[{}] = {}'.format(state, action), file=f)
        print('', file=f)

        # Fill goto table:
        print('        self.goto_table = {}', file=f)
        for gt in self.goto_table:
            to = self.goto_table[gt]
            print('        self.goto_table[{}] = {}'.format(gt, to), file=f)
        print('', file=f)

        # Generate a function for each action, substituting $N placeholders
        # in the semantic action text with the matching argument names:
        for rule in self.grammar.productions:
            M = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(M))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=f)
            # Fixed: original compared with '== None' instead of 'is None'.
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            for n in range(M):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=f)
            print('', file=f)


def main():
    """ Command line entry point: read a grammar specification and write
    the generated parser module to the chosen output (default stdout). """
    # Parse arguments:
    parser = argparse.ArgumentParser(description='xacc compiler compiler')
    parser.add_argument('source', type=argparse.FileType('r'),
                        help='the parser specification')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    generator.generate(grammar, args.output)
    args.output.close()


if __name__ == '__main__':
    main()