Mercurial > lcfOS
view python/pylex.py @ 407:9eb1fc6aad6c
Minor improvements
author | Windel Bouwman |
---|---|
date | Fri, 20 Feb 2015 15:47:54 +0100 |
parents | c694ec551f34 |
children |
line wrap: on
line source
import collections """ Simple script to try fsa construction """ # 0. possible characters: chars = ['a', 'b', 'c', 'd'] # 1. pattern definitions: P1 = "aab" P2 = "abbad" P3 = "caababbac" patterns = [P1, P2, P3] # 2. input string: txt = "ababaabaaabaabaacabbadbababaabbaab" # Generator functions to generate states and transitions: class Item: def __init__(self, regex, dotpos): self.regex = regex self.dotpos = dotpos def __eq__(self, other): if type(self) is type(other): return self.regex == other.regex and self.dotpos == other.dotpos return False def __repr__(self): return '{0} _dot_ {1}'.format(self.regex[0:self.dotpos], self.regex[self.dotpos:]) def __hash__(self): return (self.regex, self.dotpos).__hash__() def matches(self, ch): if self.dotpos < len(self.regex): if self.regex[self.dotpos] == ch: return True return False @property def IsReduce(self): return self.dotpos == len(self.regex) def InitialItemSet(): s = set() for p in patterns: s.add(Item(p, 0)) return frozenset(s) def NextItemSet(its, ch): nis = set() for item in its: if item.matches(ch): nis.add(Item(item.regex, item.dotpos + 1)) return frozenset(nis) def ClassOfToken(its): for i in its: if i.IsReduce: return i.regex return None sw = frozenset() # Empty set states = set() transitions = {} s0 = InitialItemSet() def preCalc(): states.add(s0) worklist = [s0] while len(worklist) > 0: its = worklist.pop(0) for ch in chars: nis = NextItemSet(its, ch) if nis and not nis in states: states.add(nis) worklist.append(nis) transitions[(its, ch)] = nis preCalc() print(len(states), states) print(len(transitions), transitions) # Lex function: Token = collections.namedtuple('Token', 'pos len name') def getNextToken(): readIndex = 0 while True: startTok = readIndex endTok = None classTok = None itemSet = s0 while itemSet: ch = txt[readIndex] # Lookup next itemset: if (itemSet, ch) in transitions: itemSet = transitions[(itemSet, ch)] else: itemSet = sw # check what we recognized: ct = ClassOfToken(itemSet) if ct: classTok = ct endTok = readIndex readIndex += 1 print('restart') if classTok: token = Token(startTok, endTok - startTok, classTok) readIndex = endTok + 1 else: readIndex = startTok + 1 token = None yield token for tok in getNextToken(): print(tok) if tok: print(txt) print(' ' * tok.pos + tok.name)