Mercurial > lcfOS
view python/yacc.py @ 335:582a1aaa3983
Added long branch format
author | Windel Bouwman |
---|---|
date | Mon, 17 Feb 2014 20:41:30 +0100 |
parents | 6f4753202b9a |
children | c7cc54c0dfdf |
line wrap: on
line source
#!/usr/bin/python """ Parser generator utility. This script can generate a python script from a grammar description. Invoke the script on a grammar specification file: .. code:: $ ./yacc.py test.x -o test_parser.py And use the generated parser by deriving a user class: .. code:: import test_parser class MyParser(test_parser.Parser): pass p = MyParser() p.parse() Alternatively you can load the parser on the fly: .. code:: import yacc parser_mod = yacc.load_as_module('mygrammar.x') class MyParser(parser_mod.Parser): pass p = MyParser() p.parse() """ import argparse import re import sys import datetime import types import io import logging from pyyacc import Grammar class XaccLexer: def __init__(self): pass def feed(self, txt): # Create a regular expression for the lexing part: tok_spec = [ ('ID', r'[A-Za-z][A-Za-z\d_]*'), ('STRING', r"'[^']*'"), ('BRACEDCODE', r"\{[^\}]*\}"), ('OTHER', r'[:;\|]'), ('SKIP', r'[ ]') ] tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec) gettok = re.compile(tok_re).match lines = txt.split('\n') def tokenize_line(line): """ Generator that splits up a line into tokens """ mo = gettok(line) pos = 0 while mo: typ = mo.lastgroup val = mo.group(typ) if typ == 'ID': yield (typ, val) elif typ == 'STRING': typ = 'ID' yield (typ, val[1:-1]) elif typ == 'OTHER': typ = val yield (typ, val) elif typ == 'BRACEDCODE': yield (typ, val) elif typ == 'SKIP': pass else: raise NotImplementedError(str(typ)) pos = mo.end() mo = gettok(line, pos) if len(line) != pos: raise ParseError('Lex fault at {}'.format(line)) def tokenize(): section = 0 for line in lines: line = line.strip() if not line: continue # Skip empty lines if line == '%%': section += 1 yield('%%', '%%') continue if section == 0: if line.startswith('%tokens'): yield('%tokens', '%tokens') yield from tokenize_line(line[7:]) else: yield ('HEADER', line) elif section == 1: yield from tokenize_line(line) yield ('eof', 'eof') self.tokens = tokenize() self.token = self.tokens.__next__() def next_token(self): t = self.token if t[0] != 'eof': self.token = self.tokens.__next__() return t class ParseError(Exception): pass class XaccParser: """ Implements a recursive descent parser to parse grammar rules. We could have made an generated parser, but that would yield a chicken egg issue. """ def __init__(self, lexer): self.lexer = lexer @property def Peak(self): """ Sneak peak to the next token in line """ return self.lexer.token[0] def next_token(self): """ Take the next token """ return self.lexer.next_token() def consume(self, typ): """ Eat next token of type typ or raise an exception """ if self.Peak == typ: return self.next_token() else: raise ParseError('Expected {}, but got {}'.format(typ, self.Peak)) def has_consumed(self, typ): """ Consume typ if possible and return true if so """ if self.Peak == typ: self.consume(typ) return True return False def parse_grammar(self): """ Entry parse function into recursive descent parser """ # parse header headers = [] terminals = [] while self.Peak in ['HEADER', '%tokens']: if self.Peak == '%tokens': self.consume('%tokens') while self.Peak == 'ID': terminals.append(self.consume('ID')[1]) else: headers.append(self.consume('HEADER')[1]) self.consume('%%') self.headers = headers self.grammar = Grammar(terminals) while self.Peak != 'eof': self.parse_rule() return self.grammar def parse_symbol(self): return self.consume('ID')[1] def parse_rhs(self): """ Parse the right hand side of a rule definition """ symbols = [] while self.Peak not in [';', 'BRACEDCODE', '|']: symbols.append(self.parse_symbol()) if self.Peak == 'BRACEDCODE': action = self.consume('BRACEDCODE')[1] action = action[1:-1].strip() else: action = None return symbols, action def parse_rule(self): """ Parse a rule definition """ p = self.parse_symbol() self.consume(':') symbols, action = self.parse_rhs() self.grammar.add_production(p, symbols, action) while self.has_consumed('|'): symbols, action = self.parse_rhs() self.grammar.add_production(p, symbols, action) self.consume(';') class XaccGenerator: """ Generator that writes generated parser to file """ def __init__(self): self.logger = logging.getLogger('yacc') def generate(self, grammar, headers, output_file): self.output_file = output_file self.grammar = grammar self.headers = headers self.logger.info('Generating parser for grammar {}'.format(grammar)) self.action_table, self.goto_table = grammar.doGenerate() self.generate_python_script() def print(self, *args): """ Print helper function that prints to output file """ print(*args, file=self.output_file) def generate_python_script(self): """ Generate python script with the parser table """ self.print('#!/usr/bin/python') stamp = datetime.datetime.now().ctime() self.print('""" Automatically generated by xacc on {} """'.format(stamp)) self.print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar') self.print('from ppci import Token') self.print('') for h in self.headers: print(h, file=output_file) self.print('') self.print('class Parser(LRParser):') self.print(' def __init__(self):') # Generate rules: self.print(' self.start_symbol = "{}"'.format(self.grammar.start_symbol)) self.print(' self.grammar = Grammar({})'.format(self.grammar.terminals)) for rule_number, rule in enumerate(self.grammar.productions): rule.f_name = 'action_{}_{}'.format(rule.name, rule_number) self.print(' self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name)) # Fill action table: self.print(' self.action_table = {}') for state in self.action_table: action = self.action_table[state] self.print(' self.action_table[{}] = {}'.format(state, action)) self.print('') # Fill goto table: self.print(' self.goto_table = {}') for state_number in self.goto_table: to = self.goto_table[state_number] self.print(' self.goto_table[{}] = {}'.format(state_number, to)) self.print('') # Generate a function for each action: for rule in self.grammar.productions: num_symbols = len(rule.symbols) args = ', '.join('arg{}'.format(n + 1) for n in range(num_symbols)) self.print(' def {}(self, {}):'.format(rule.f_name, args)) if rule.f == None: semantics = 'pass' else: semantics = str(rule.f) if semantics.strip() == '': semantics = 'pass' for n in range(num_symbols): semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1)) self.print(' {}'.format(semantics)) self.print('') def make_argument_parser(): # Parse arguments: parser = argparse.ArgumentParser(description='xacc compiler compiler') parser.add_argument('source', type=argparse.FileType('r'), \ help='the parser specification') parser.add_argument('-o', '--output', type=argparse.FileType('w'), \ default=sys.stdout) return parser def load_as_module(filename): """ Load a parser spec file, generate LR tables and create module """ ob = io.StringIO() args = argparse.Namespace(source=open(filename), output=ob) main(args) parser_mod = types.ModuleType('generated_parser') exec(ob.getvalue(), parser_mod.__dict__) return parser_mod def main(args): src = args.source.read() args.source.close() # Construction of generator parts: lexer = XaccLexer() parser = XaccParser(lexer) generator = XaccGenerator() # Sequence source through the generator parts: lexer.feed(src) grammar = parser.parse_grammar() generator.generate(grammar, parser.headers, args.output) if __name__ == '__main__': args = make_argument_parser().parse_args() main(args)