#!/usr/bin/python

"""
Parser generator utility. This script can generate a python script from a
grammar description.

Invoke the script on a grammar specification file:

.. code::

    $ ./yacc.py test.x -o test_parser.py

And use the generated parser by deriving a user class:

.. code::

    import test_parser
    class MyParser(test_parser.Parser):
        pass
    p = MyParser()
    p.parse()

"""

import argparse
import re
import sys
import datetime
from pyyacc import Grammar, print_grammar


class XaccLexer:
    def __init__(self):
        pass

    def feed(self, txt):
        # Create a regular expression for the lexing part:
        tok_spec = [
            ('ID', r'[A-Za-z][A-Za-z\d_]*'),
            ('STRING', r"'[^']*'"),
            ('BRACEDCODE', r"\{[^\}]*\}"),
            ('OTHER', r'[:;\|]'),
            ('SKIP', r'[ ]')
        ]
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        gettok = re.compile(tok_re).match

        lines = txt.split('\n')

        def tokenize_line(line):
            """ Generator that splits up a line into tokens """
            mo = gettok(line)
            pos = 0
            while mo:
                typ = mo.lastgroup
                val = mo.group(typ)
                if typ == 'ID':
                    yield (typ, val)
                elif typ == 'STRING':
                    # Quoted terminals are re-tagged as plain IDs with the
                    # quotes stripped:
                    typ = 'ID'
                    yield (typ, val[1:-1])
                elif typ == 'OTHER':
                    # Punctuation uses its own text as token type:
                    typ = val
                    yield (typ, val)
                elif typ == 'BRACEDCODE':
                    yield (typ, val)
                elif typ == 'SKIP':
                    pass
                else:
                    raise NotImplementedError(str(typ))
                pos = mo.end()
                mo = gettok(line, pos)
            if len(line) != pos:
                raise ParseError('Lex fault at {}'.format(line))

        def tokenize():
            section = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                if line == '%%':
                    # A '%%' line separates the header section from the rules:
                    section += 1
                    yield ('%%', '%%')
                    continue
                if section == 0:
                    if line.startswith('%tokens'):
                        yield ('%tokens', '%tokens')
                        yield from tokenize_line(line[7:])
                    else:
                        yield ('HEADER', line)
                elif section == 1:
                    yield from tokenize_line(line)
            yield ('eof', 'eof')
        self.tokens = tokenize()
        self.token = next(self.tokens)

    def next_token(self):
        t = self.token
        if t[0] != 'eof':
            self.token = next(self.tokens)
        return t
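
# A minimal sketch of the resulting token stream (derived from the code
# above): feeding the two-line text "%%\nexpr: 'num';" yields, through
# successive next_token() calls, ('%%', '%%'), ('ID', 'expr'), (':', ':'),
# ('ID', 'num'), (';', ';') and finally ('eof', 'eof').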


class ParseError(Exception):
    pass


class XaccParser:
    """ Implements a recursive descent parser to parse grammar rules.
        We could have used a generated parser here, but that would give a
        chicken-and-egg problem.
    """
    def __init__(self, lexer):
        self.lexer = lexer

    @property
    def Peak(self):
        """ Sneak peek at the next token in line """
        return self.lexer.token[0]

    def next_token(self):
        """ Take the next token """
        return self.lexer.next_token()

    def consume(self, typ):
        """ Eat next token of type typ or raise an exception """
        if self.Peak == typ:
            return self.next_token()
        else:
            raise ParseError('Expected {}, but got {}'.format(typ, self.Peak))

    def has_consumed(self, typ):
        """ Consume typ if possible and return True if so """
        if self.Peak == typ:
            self.consume(typ)
            return True
        return False

    def parse_grammar(self):
        """ Entry function into the recursive descent parser """
        # Parse the header section:
        headers = []
        terminals = []
        while self.Peak in ['HEADER', '%tokens']:
            if self.Peak == '%tokens':
                self.consume('%tokens')
                while self.Peak == 'ID':
                    terminals.append(self.consume('ID')[1])
            else:
                headers.append(self.consume('HEADER')[1])
        self.consume('%%')
        self.headers = headers
        self.grammar = Grammar(terminals)
        # Parse rule definitions until the end of the token stream:
        while self.Peak != 'eof':
            self.parse_rule()
        return self.grammar
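
    # For the grammar sketch near the top of this file, parse_grammar()
    # would collect headers == ['from mymodule import handle_add'] and build
    # a Grammar over the terminals ['num', '+'] before parsing the rules.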

    def parse_symbol(self):
        return self.consume('ID')[1]

    def parse_rhs(self):
        """ Parse the right hand side of a rule definition """
        symbols = []
        while self.Peak not in [';', 'BRACEDCODE', '|']:
            symbols.append(self.parse_symbol())
        if self.Peak == 'BRACEDCODE':
            action = self.consume('BRACEDCODE')[1]
            action = action[1:-1].strip()
        else:
            action = None
        return symbols, action

    def parse_rule(self):
        """ Parse a rule definition """
        p = self.parse_symbol()
        self.consume(':')
        symbols, action = self.parse_rhs()
        self.grammar.add_production(p, symbols, action)
        while self.has_consumed('|'):
            symbols, action = self.parse_rhs()
            self.grammar.add_production(p, symbols, action)
        self.consume(';')
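
# For example (hypothetical rule text, tokenized by XaccLexer above), parsing
#     expr: expr '+' 'num' { handle_add($1, $3) };
# results in grammar.add_production('expr', ['expr', '+', 'num'],
# 'handle_add($1, $3)'): the braces and surrounding whitespace are stripped
# from the action, and the action is None when no braced code is given.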


class XaccGenerator:
    """ Generator that writes the generated parser to a file """
    def __init__(self):
        pass

    def generate(self, grammar, headers, output_file):
        print_grammar(grammar)
        self.grammar = grammar
        self.headers = headers
        self.action_table, self.goto_table = grammar.doGenerate()
        self.generate_python_script(output_file)

    def generate_python_script(self, output_file):
        """ Generate python script with the parser table """
        print('#!/usr/bin/python', file=output_file)
        stamp = datetime.datetime.now().ctime()
        print('""" Automatically generated by xacc on {} """'.format(stamp), file=output_file)
        print('from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar', file=output_file)
        print('from ppci import Token', file=output_file)
        print(file=output_file)
        for h in self.headers:
            print(h, file=output_file)
        print(file=output_file)
        print('class Parser(LRParser):', file=output_file)
        print('    def __init__(self):', file=output_file)
        # Generate rules:
        print('        self.start_symbol = "{}"'.format(self.grammar.start_symbol), file=output_file)
        print('        self.grammar = Grammar({})'.format(self.grammar.terminals), file=output_file)
        for rule_number, rule in enumerate(self.grammar.productions):
            rule.f_name = 'action_{}_{}'.format(rule.name, rule_number)
            print('        self.grammar.add_production("{}", {}, self.{})'.format(rule.name, rule.symbols, rule.f_name), file=output_file)
        # Fill the action table:
        print('        self.action_table = {}', file=output_file)
        for state in self.action_table:
            action = self.action_table[state]
            print('        self.action_table[{}] = {}'.format(state, action), file=output_file)
        print('', file=output_file)

        # Fill the goto table:
        print('        self.goto_table = {}', file=output_file)
        for gt in self.goto_table:
            to = self.goto_table[gt]
            print('        self.goto_table[{}] = {}'.format(gt, to), file=output_file)
        print('', file=output_file)

        # Generate a function for each action:
        for rule in self.grammar.productions:
            M = len(rule.symbols)
            args = ', '.join('arg{}'.format(n + 1) for n in range(M))
            print('    def {}(self, {}):'.format(rule.f_name, args), file=output_file)
            if rule.f is None:
                semantics = 'pass'
            else:
                semantics = str(rule.f)
                if semantics.strip() == '':
                    semantics = 'pass'
            # Map the $1, $2, ... placeholders onto the argument names:
            for n in range(M):
                semantics = semantics.replace('${}'.format(n + 1), 'arg{}'.format(n + 1))
            print('        {}'.format(semantics), file=output_file)
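
# The emitted module then looks roughly like this (a sketch with made-up
# rule names and with the table entries elided):
#
#     #!/usr/bin/python
#     """ Automatically generated by xacc on <date> """
#     from pyyacc import LRParser, Reduce, Shift, Accept, Production, Grammar
#     from ppci import Token
#
#     <header lines>
#
#     class Parser(LRParser):
#         def __init__(self):
#             self.start_symbol = "expr"
#             self.grammar = Grammar(['num', '+'])
#             self.grammar.add_production("expr", ['expr', '+', 'num'], self.action_expr_0)
#             self.action_table = {}
#             ...
#             self.goto_table = {}
#             ...
#         def action_expr_0(self, arg1, arg2, arg3):
#             handle_add(arg1, arg3)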


def main():
    # Parse command line arguments:
    parser = argparse.ArgumentParser(description='xacc compiler compiler')
    parser.add_argument('source', type=argparse.FileType('r'),
                        help='the parser specification')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()
    src = args.source.read()
    args.source.close()

    # Construct the generator parts:
    lexer = XaccLexer()
    parser = XaccParser(lexer)
    generator = XaccGenerator()

    # Run the source through the generator parts:
    lexer.feed(src)
    grammar = parser.parse_grammar()
    generator.generate(grammar, parser.headers, args.output)
    args.output.close()


if __name__ == '__main__':
    main()