Mercurial > lcfOS
view python/yacc.py @ 319:8d07a4254f04
Work on burg
author | Windel Bouwman |
---|---|
date | Sat, 18 Jan 2014 18:58:43 +0100 |
parents | e84047f29c78 |
children | 8c569fbe60e4 |
line wrap: on
line source
#!/usr/bin/python """ Parser generator utility. This script can generate a python script from a grammar description. Invoke the script on a grammar specification file: .. code:: $ ./yacc.py test.x -o test_parser.py And use the generated parser by deriving a user class: .. code:: import test_parser class MyParser(test_parser.Parser): pass p = MyParser() p.parse() """ import argparse import re import sys import datetime from pyyacc import Grammar, print_grammar class XaccLexer: def __init__(self): pass def feed(self, txt): # Create a regular expression for the lexing part: tok_spec = [ ('ID', r'[A-Za-z][A-Za-z\d_]*'), ('STRING', r"'[^']*'"), ('BRACEDCODE', r"\{[^\}]*\}"), ('OTHER', r'[:;\|]'), ('SKIP', r'[ ]') ] tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec) gettok = re.compile(tok_re).match lines = txt.split('\n') def tokenize_line(line): """ Generator that splits up a line into tokens """ mo = gettok(line) pos = 0 while mo: typ = mo.lastgroup val = mo.group(typ) if typ == 'ID': yield (typ, val) elif typ == 'STRING': typ = 'ID' yield (typ, val[1:-1]) elif typ == 'OTHER': typ = val yield (typ, val) elif typ == 'BRACEDCODE': yield (typ, val) elif typ == 'SKIP': pass else: raise NotImplementedError(str(typ)) pos = mo.end() mo = gettok(line, pos) if len(line) != pos: raise ParseError('Lex fault at {}'.format(line)) def tokenize(): section = 0 for line in lines: line = line.strip() if not line: continue # Skip empty lines if line == '%%': section += 1 yield('%%', '%%') continue if section == 0: if line.startswith('%tokens'): yield('%tokens', '%tokens') yield from tokenize_line(line[7:]) else: yield ('HEADER', line) elif section == 1: yield from tokenize_line(line) yield ('eof', 'eof') self.tokens = tokenize() self.token = self.tokens.__next__() def next_token(self): t = self.token if t[0] != 'eof': self.token = self.tokens.__next__() #print(t) return t class ParseError(Exception): pass class XaccParser: """ Implements a recursive descent parser to parse grammar rules. We could have made an generated parser, but that would yield a chicken egg issue. """ def __init__(self, lexer): self.lexer = lexer @property def Peak(self): """ Sneak peak to the next token in line """ return self.lexer.token[0] def next_token(self): """ Take the next token """ return self.lexer.next_token() def consume(self, typ): """ Eat next token of type typ or raise an exception """ if self.Peak == typ: return self.next_token() else: raise ParseError('Expected {}, but got {}'.format(typ, self.Peak)) def has_consumed(self, typ): """ Consume typ if possible and return true if so """ if self.Peak == typ: self.consume(typ) return True return False def parse_grammar(self): """ Entry parse function into recursive descent parser """ # parse header headers = [] terminals = [] while self.Peak in ['HEADER', '%tokens']: if self.Peak == '%tokens': self.consume('%tokens') while self.Peak == 'ID': terminals.append(self.consume('ID')[1]) else: headers.append(self.consume('HEADER')[1]) self.consume('%%') self.headers = headers self.grammar = Grammar(terminals) while self.Peak != 'eof': self.parse_rule() return self.grammar def parse_symbol(self): return self.consume('ID')[1] def parse_rhs(self): """ Parse the right hand side of a rule definition """ symbols = [] while self.Peak not in [';', 'BRACEDCODE', '|']: symbols.append(self.parse_symbol()) if self.Peak == 'BRACEDCODE': action = self.consume('BRACEDCODE')[1] action = action[1:-1].strip() else: action = None return symbols, action def parse_rule(self): """ Parse a rule definition """ p = self.parse_symbol() self.consume(':') symbols, action = self.parse_rhs() self.grammar.add_production(p, symbols, action) while self.has_consumed('|'): symbols, action = self.parse_rhs() self.grammar.add_production(p, symbols, action) self.consume(';') class XaccGenerator: """ Generator that writes generated parser to file """ def __init__(self): pass def generate(self, grammar, headers, output_file): print_grammar(grammar) self.grammar = grammar self.headers = headers self.action_table, self.goto_table = grammar.doGenerate() self.generate_python_script(output_file) def generate_python_script(self, output_file): """ Generate python script with the parser table """ print('#!/usr/bin/python', file=output_file) stamp = datetime.datetime.now().ctime() print('""" Automatically generated by xacc on {} """'.format(stamp), file=output_file) print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=output_file) print('from ppci import Token', file=output_file) print(file=output_file) for h in self.headers: print(h, file=output_file) print(file=output_file) print('class Parser(LRParser):', file=output_file) print(' def __init__(self):', file=output_file) # Generate rules: print(' self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=output_file) print(' self.grammar = Grammar({})'.format(self.grammar.terminals), file=output_file) for rule_number, rule in enumerate(self.grammar.productions): rule.f_name = 'action_{}_{}'.format(rule.name, rule_number) print(' self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=output_file) # Fill action table: print(' self.action_table = {}', file=output_file) for state in self.action_table: action = self.action_table[state] print(' self.action_table[{}] = {}'.format(state, action), file=output_file) print('', file=output_file) # Fill goto table: print(' self.goto_table = {}', file=output_file) for gt in self.goto_table: to = self.goto_table[gt] print(' self.goto_table[{}] = {}'.format(gt, to), file=output_file) print('', file=output_file) # Generate a function for each action: for rule in self.grammar.productions: M = len(rule.symbols) args = ', '.join('arg{}'.format(n + 1) for n in range(M)) print(' def {}(self, {}):'.format(rule.f_name, args), file=output_file) if rule.f == None: semantics = 'pass' else: semantics = str(rule.f) if semantics.strip() == '': semantics = 'pass' for n in range(M): semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1)) print(' {}'.format(semantics), file=output_file) def main(): # Parse arguments: parser = argparse.ArgumentParser(description='xacc compiler compiler') parser.add_argument('source', type=argparse.FileType('r'), \ help='the parser specification') parser.add_argument('-o', '--output', type=argparse.FileType('w'), \ default=sys.stdout) args = parser.parse_args() src = args.source.read() args.source.close() # Construction of generator parts: lexer = XaccLexer() parser = XaccParser(lexer) generator = XaccGenerator() # Sequence source through the generator parts: lexer.feed(src) grammar = parser.parse_grammar() generator.generate(grammar, parser.headers, args.output) args.output.close() if __name__ == '__main__': main()