annotate python/baselex.py @ 382:0c44e494ef58

Made lexer more generic
author Windel Bouwman
date Sun, 27 Apr 2014 12:24:21 +0200
parents 818be710e13d
children fb3c1f029b30
rev   line source
319
8d07a4254f04 Work on burg
Windel Bouwman
parents:
diff changeset
1
8d07a4254f04 Work on burg
Windel Bouwman
parents:
diff changeset
2 import re
357
818be710e13d Added acceptance function to burg
Windel Bouwman
parents: 319
diff changeset
3 from ppci import Token, CompilerError
382
0c44e494ef58 Made lexer more generic
Windel Bouwman
parents: 357
diff changeset
4 from pyyacc import EOF
319
8d07a4254f04 Work on burg
Windel Bouwman
parents:
diff changeset
5
382
0c44e494ef58 Made lexer more generic
Windel Bouwman
parents: 357
diff changeset
6
0c44e494ef58 Made lexer more generic
Windel Bouwman
parents: 357
diff changeset
class BaseLexer:
    """ Base class for a regular-expression driven lexer.

    A lexer is configured with a token specification and then fed text
    with feed(); tokens are pulled one at a time with next_token().
    """

    def __init__(self, tok_spec):
        """ Build the combined token regex and the handler lookup table.

        tok_spec is a sequence of entries where entry[0] is the token
        name (used as a regex group name), entry[1] is the regex source
        for that token and entry[2] is a handler called as
        handler(typ, val) returning a (typ, val) pair. Entries whose
        handler is false (for example None) are matched but produce no
        token. tok_spec is iterated twice, so it must not be a one-shot
        iterator.
        """
        tok_re = '|'.join(
            '(?P<{}>{})'.format(pair[0], pair[1]) for pair in tok_spec)
        self.gettok = re.compile(tok_re).match
        self.func_map = {pair[0]: pair[2] for pair in tok_spec}
        # Start with an exhausted stream so that next_token() reports EOF
        # instead of failing with AttributeError when feed() was not
        # called yet.
        self.tokens = iter(())

    def feed(self, txt):
        """ Feeds the lexer with input.

        The token stream is replaced by a fresh one over txt; tokens of a
        previous feed() that were not consumed yet are dropped.
        """
        self.tokens = self.tokenize(txt)

    def tokenize(self, txt):
        """ Generator that generates tokens from text.

        Yields Token(typ, val) for every match whose spec entry has a
        handler. Raises CompilerError when the text could not be matched
        up to its end. Since this is a generator, that error surfaces
        while iterating, not when tokenize() is called.
        """
        pos = 0
        mo = self.gettok(txt)
        while mo:
            end = mo.end()
            if end == pos:
                # A zero-width match makes no progress; continuing would
                # find the same empty match forever. Stop here and let
                # the check below decide between "done" and "lex fault".
                break
            typ = mo.lastgroup
            val = mo.group(typ)
            func = self.func_map[typ]
            if func:
                typ, val = func(typ, val)
                yield Token(typ, val)
            pos = end
            mo = self.gettok(txt, pos)
        if len(txt) != pos:
            raise CompilerError('Lex fault at {}'.format(txt[pos:]))

    def next_token(self):
        """ Return the next token, or Token(EOF, EOF) when the current
        token stream is exhausted.
        """
        try:
            return next(self.tokens)
        except StopIteration:
            return Token(EOF, EOF)