178
|
1 import collections
|
|
2
|
|
"""Simple script to try fsa construction."""

# 0. Alphabet: every character the automaton can consume.
chars = list("abcd")

# 1. Patterns (literal keywords) the automaton should recognize.
P1 = "aab"
P2 = "abbad"
P3 = "caababbac"
patterns = [P1, P2, P3]

# 2. Input text to be scanned.
txt = "ababaabaaabaabaacabbadbababaabbaab"
|
|
14 # Generator functions to generate states and transitions:
|
|
class Item:
    """A dotted item: a pattern string plus a cursor (dot) position.

    The dot marks how much of the pattern has been matched so far;
    ``dotpos == len(regex)`` means the whole pattern was recognized.
    Items are hashable so sets of items can serve as automaton states.
    """

    def __init__(self, regex, dotpos):
        self.regex = regex    # the literal pattern being tracked
        self.dotpos = dotpos  # number of characters already matched

    def __eq__(self, other):
        # Return NotImplemented (not False) for foreign types so Python
        # can try the reflected comparison; final result is unchanged.
        if type(self) is not type(other):
            return NotImplemented
        return self.regex == other.regex and self.dotpos == other.dotpos

    def __repr__(self):
        return '{0} _dot_ {1}'.format(self.regex[:self.dotpos], self.regex[self.dotpos:])

    def __hash__(self):
        # Must agree with __eq__: equal items hash equally.
        return hash((self.regex, self.dotpos))

    def matches(self, ch):
        """Return True if *ch* is the next character the pattern expects."""
        return self.dotpos < len(self.regex) and self.regex[self.dotpos] == ch

    @property
    def IsReduce(self):
        """True when the dot has reached the end of the pattern."""
        return self.dotpos == len(self.regex)
|
|
35
|
|
def InitialItemSet():
    """Build the start state: every pattern with the dot at position 0."""
    return frozenset(Item(p, 0) for p in patterns)
|
|
41
|
|
def NextItemSet(its, ch):
    """Advance every item in *its* that can consume *ch* by one position."""
    advanced = {Item(item.regex, item.dotpos + 1) for item in its if item.matches(ch)}
    return frozenset(advanced)
|
|
48
|
|
def ClassOfToken(its):
    """Return the pattern recognized by item set *its*, or None.

    An item set recognizes a pattern when it contains a reduce item
    (dot at the end); the first such item's pattern is returned.
    """
    return next((item.regex for item in its if item.IsReduce), None)
|
|
54
|
|
sw = frozenset()  # Empty set: the dead/error state (falsy, so scanning loops can stop on it)
states = set()  # all reachable DFA states (frozensets of Items), filled by preCalc()
transitions = {}  # (itemset, char) -> successor itemset, filled by preCalc()
s0 = InitialItemSet()  # start state: every pattern with the dot at position 0
|
|
59
|
|
def preCalc():
    """Explore every state reachable from s0 and record its transitions.

    Standard worklist construction: pop a state, compute its successor
    for every character in `chars`, and queue successors not seen
    before.  Fills the module-level `states` set and `transitions` dict.
    """
    states.add(s0)
    # deque gives O(1) popleft; list.pop(0) shifts the whole list (O(n)).
    worklist = collections.deque([s0])
    while worklist:
        its = worklist.popleft()  # FIFO order, same as the original pop(0)
        for ch in chars:
            nis = NextItemSet(its, ch)
            if nis and nis not in states:
                states.add(nis)
                worklist.append(nis)
            # Note: dead-state transitions (empty nis) are recorded too.
            transitions[(its, ch)] = nis
|
|
71
|
|
# Build the automaton up front, then dump what was generated.
preCalc()
print(len(states), states)
print(len(transitions), transitions)

# Lex function:
# pos = start index in txt, len = recorded length, name = matched pattern.
Token = collections.namedtuple('Token', ['pos', 'len', 'name'])
|
|
78
|
|
def getNextToken():
    """Generator yielding recognized Tokens (or None for a failed scan).

    Runs the precomputed DFA over the module-level `txt` with maximal
    munch: it keeps consuming characters while any transition exists and
    remembers the last position where a complete pattern was recognized.
    On failure it resumes one character past the failed start position.

    Bug fixes vs. the original: terminates cleanly at end of input
    (the original indexed past `txt` and died with IndexError), and the
    Token length is endTok - startTok + 1 (endTok is the inclusive index
    of the last matched character, so the old value was one short).
    """
    readIndex = 0
    end = len(txt)
    while readIndex < end:
        startTok = readIndex
        endTok = None
        classTok = None
        itemSet = s0
        # Scan until the dead state (sw is empty, hence falsy) or end of text.
        while itemSet and readIndex < end:
            ch = txt[readIndex]
            # Lookup next itemset; a missing transition means the dead state.
            if (itemSet, ch) in transitions:
                itemSet = transitions[(itemSet, ch)]
            else:
                itemSet = sw
            # Remember the longest match seen so far:
            ct = ClassOfToken(itemSet)
            if ct:
                classTok = ct
                endTok = readIndex
            readIndex += 1
        print('restart')
        if classTok:
            # endTok is inclusive, so the matched length is endTok - startTok + 1.
            token = Token(startTok, endTok - startTok + 1, classTok)
            readIndex = endTok + 1
        else:
            readIndex = startTok + 1
            token = None
        yield token
|
|
107
|
|
# Drive the lexer over the whole input, visualizing each recognized token
# by printing the text with the pattern name aligned under its start.
for tok in getNextToken():
    print(tok)
    if not tok:
        continue
    print(txt)
    print('{0}{1}'.format(' ' * tok.pos, tok.name))
|
|
113
|