annotate python/pylex.py @ 222:c3f1ce8b638f

Fixup of parser
author Windel Bouwman
date Tue, 09 Jul 2013 17:36:31 +0200
parents c694ec551f34
children
rev   line source
178
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
1 import collections
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
2
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
3 """ Simple script to try fsa construction """
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
4 # 0. possible characters:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
5 chars = ['a', 'b', 'c', 'd']
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
6 # 1. pattern definitions:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
7 P1 = "aab"
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
8 P2 = "abbad"
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
9 P3 = "caababbac"
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
10 patterns = [P1, P2, P3]
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
11 # 2. input string:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
12 txt = "ababaabaaabaabaacabbadbababaabbaab"
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
13
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
14 # Generator functions to generate states and transitions:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
15 class Item:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
16 def __init__(self, regex, dotpos):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
17 self.regex = regex
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
18 self.dotpos = dotpos
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
19 def __eq__(self, other):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
20 if type(self) is type(other):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
21 return self.regex == other.regex and self.dotpos == other.dotpos
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
22 return False
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
23 def __repr__(self):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
24 return '{0} _dot_ {1}'.format(self.regex[0:self.dotpos], self.regex[self.dotpos:])
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
25 def __hash__(self):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
26 return (self.regex, self.dotpos).__hash__()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
27 def matches(self, ch):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
28 if self.dotpos < len(self.regex):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
29 if self.regex[self.dotpos] == ch:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
30 return True
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
31 return False
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
32 @property
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
33 def IsReduce(self):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
34 return self.dotpos == len(self.regex)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
35
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
36 def InitialItemSet():
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
37 s = set()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
38 for p in patterns:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
39 s.add(Item(p, 0))
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
40 return frozenset(s)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
41
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
42 def NextItemSet(its, ch):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
43 nis = set()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
44 for item in its:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
45 if item.matches(ch):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
46 nis.add(Item(item.regex, item.dotpos + 1))
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
47 return frozenset(nis)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
48
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
49 def ClassOfToken(its):
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
50 for i in its:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
51 if i.IsReduce:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
52 return i.regex
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
53 return None
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
54
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
55 sw = frozenset() # Empty set
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
56 states = set()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
57 transitions = {}
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
58 s0 = InitialItemSet()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
59
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
60 def preCalc():
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
61 states.add(s0)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
62 worklist = [s0]
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
63 while len(worklist) > 0:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
64 its = worklist.pop(0)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
65 for ch in chars:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
66 nis = NextItemSet(its, ch)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
67 if nis and not nis in states:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
68 states.add(nis)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
69 worklist.append(nis)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
70 transitions[(its, ch)] = nis
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
71
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
72 preCalc()
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
73 print(len(states), states)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
74 print(len(transitions), transitions)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
75
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
76 # Lex function:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
77 Token = collections.namedtuple('Token', 'pos len name')
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
78
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
79 def getNextToken():
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
80 readIndex = 0
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
81 while True:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
82 startTok = readIndex
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
83 endTok = None
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
84 classTok = None
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
85 itemSet = s0
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
86 while itemSet:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
87 ch = txt[readIndex]
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
88 # Lookup next itemset:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
89 if (itemSet, ch) in transitions:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
90 itemSet = transitions[(itemSet, ch)]
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
91 else:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
92 itemSet = sw
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
93 # check what we recognized:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
94 ct = ClassOfToken(itemSet)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
95 if ct:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
96 classTok = ct
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
97 endTok = readIndex
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
98 readIndex += 1
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
99 print('restart')
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
100 if classTok:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
101 token = Token(startTok, endTok - startTok, classTok)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
102 readIndex = endTok + 1
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
103 else:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
104 readIndex = startTok + 1
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
105 token = None
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
106 yield token
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
107
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
108 for tok in getNextToken():
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
109 print(tok)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
110 if tok:
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
111 print(txt)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
112 print(' ' * tok.pos + tok.name)
c694ec551f34 Added lex yacc test scripts
Windel Bouwman
parents:
diff changeset
113