comparison python/yacc.py @ 318:e84047f29c78

Add burg and yacc initial attempts
author Windel Bouwman
date Tue, 31 Dec 2013 12:38:15 +0100
parents
children 8d07a4254f04
comparison
equal deleted inserted replaced
317:e30a77ae359b 318:e84047f29c78
1 #!/usr/bin/python
2
3 """ Parser generator utility """
4
5 import argparse
6 import re
7 import sys
8 import datetime
9 from pyyacc import Grammar, print_grammar
10
11
class XaccLexer:
    """ Tokenizer for xacc grammar description files.

    Feed the whole grammar text with feed(); tokens are then retrieved
    one at a time with next_token(). The token stream always ends with
    a sticky ('eof', 'eof') token.
    """
    def __init__(self):
        # Token stream state; populated by feed().
        self.tokens = None
        self.token = None

    def feed(self, txt):
        """ Tokenize the given grammar source text.

        Sets up a lazy token generator and primes self.token with the
        first token. Raises ParseError when a line contains characters
        that cannot be tokenized.
        """
        # Create a regular expression for the lexing part:
        tok_spec = [
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('STRING', r"'[^']*'"),
            ('BRACEDCODE', r"\{[^\}]*\}"),
            ('OTHER', r'[:;\|]'),
            ('SKIP', r'[ \t]')   # accept tabs as well as spaces
        ]
        tok_re = '|'.join('(?P<{}>{})'.format(name, pat) for name, pat in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            pos = 0
            mo = gettok(line)
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Quoted literals act as plain IDs; strip the quotes.
                    yield ('ID', val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation uses the character itself as token type.
                    yield (val, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            if len(line) != pos:
                # Something on this line did not match any token pattern.
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            """ Generator producing tokens for the whole input.

            Section 0 (before '%%') holds header lines and %tokens
            declarations; section 1 holds the grammar rules.
            """
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        yield from tokenize_line(line[7:])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = next(self.tokens)

    def next_token(self):
        """ Return the current token and advance; sticks at 'eof'. """
        t = self.token
        if t[0] != 'eof':
            self.token = next(self.tokens)
        return t
84
85
class ParseError(Exception):
    """ Raised when the grammar source cannot be tokenized or parsed """
    pass
88
89
class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
    We could have made an generated parser, but that would yield a chicken
    egg issue.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peak to the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak != typ:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))
        return self.next_token()

    def has_consumed(self, typ):
        """ Consume typ if possible and return true if so """
        if self.Peak != typ:
            return False
        self.consume(typ)
        return True

    def parse_grammar(self):
        """ Entry parse function into recursive descent parser """
        headers = []
        terminals = []
        # Header section: collect headers and terminal declarations
        # until the '%%' separator is reached.
        while self.Peak in ('HEADER', '%tokens'):
            if self.Peak == '%tokens':
                self.consume('%tokens')
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.grammar = Grammar(terminals)
        # The remainder of the input consists of grammar rules:
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar

    def parse_symbol(self):
        """ Parse a single grammar symbol (an identifier) """
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse one alternative: a symbol list plus an optional action """
        symbols = []
        while self.Peak not in (';', 'BRACEDCODE', '|'):
            symbols.append(self.parse_symbol())
        action = None
        if self.Peak == 'BRACEDCODE':
            # Strip the surrounding braces from the semantic action.
            action = self.consume('BRACEDCODE')[1][1:-1].strip()
        return symbols, action

    def parse_rule(self):
        """ Parse one rule: nonterminal ':' alternatives ';' """
        lhs = self.parse_symbol()
        self.consume(':')
        while True:
            symbols, action = self.parse_rhs()
            self.grammar.add_production(lhs, symbols, action)
            if not self.has_consumed('|'):
                break
        self.consume(';')
162
163
class XaccGenerator:
    """ Generator that writes generated parser to file """
    def __init__(self):
        pass

    def generate(self, grammar, output_file):
        """ Build the parser tables for grammar and write a python
        parser module to output_file.
        """
        print_grammar(grammar)
        self.grammar = grammar
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, f):
        """ Generate python script with the parser table.

        Expects self.grammar, self.action_table and self.goto_table to
        be set. Writes a self-contained Parser class to file object f.
        """
        print('#!/usr/bin/python', file=f)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=f)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=f)
        print('from ppci import Token', file=f)
        print(file=f)
        print('class Parser(LRParser):', file=f)
        print('    def __init__(self):', file=f)
        # Generate rules:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=f)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=f)
        for rule_number, rule in enumerate(self.grammar.productions):
            # Each production gets a uniquely named action method:
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=f)
        # Fill action table:
        print('        self.action_table = {}', file=f)
        for state in self.action_table:
            action = self.action_table[state]
            print('        self.action_table[{}] = {}'.format(state, action), file=f)
        print('', file=f)

        # Fill goto table:
        print('        self.goto_table = {}', file=f)
        for gt in self.goto_table:
            to = self.goto_table[gt]
            print('        self.goto_table[{}] = {}'.format(gt, to), file=f)
        print('', file=f)

        # Generate a function for each action:
        for rule in self.grammar.productions:
            num_args = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(num_args))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=f)
            # A missing or empty semantic action becomes 'pass':
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            # Substitute $1, $2, ... with the corresponding arguments:
            for n in range(num_args):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=f)
            print('', file=f)
220
221
def main():
    """ Entry point: read a grammar specification and emit a parser """
    # Parse arguments:
    arg_parser = argparse.ArgumentParser(description='xacc compiler compiler')
    arg_parser.add_argument('source', type=argparse.FileType('r'),
                            help='the parser specification')
    arg_parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                            default=sys.stdout)
    args = arg_parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construction of generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Sequence source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    generator.generate(grammar, args.output)
    args.output.close()
243
244
# Run the generator when invoked as a script.
if __name__ == '__main__':
    main()