diff python/pylex.py @ 178:c694ec551f34

Added lex yacc test scripts
author Windel Bouwman
date Sat, 04 May 2013 12:07:17 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/pylex.py	Sat May 04 12:07:17 2013 +0200
@@ -0,0 +1,113 @@
+import collections
+
+""" Simple script to try fsa construction """
+# 0. possible characters:
+chars = ['a', 'b', 'c', 'd']
+# 1. pattern definitions:
+P1 = "aab"
+P2 = "abbad"
+P3 = "caababbac"
+patterns = [P1, P2, P3]
+# 2. input string:
+txt = "ababaabaaabaabaacabbadbababaabbaab"
+
+# Generator functions to generate states and transitions:
+class Item:
+   def __init__(self, regex, dotpos):
+      self.regex = regex
+      self.dotpos = dotpos
+   def __eq__(self, other):
+      if type(self) is type(other):
+         return self.regex == other.regex and self.dotpos == other.dotpos
+      return False
+   def __repr__(self):
+      return '{0} _dot_ {1}'.format(self.regex[0:self.dotpos], self.regex[self.dotpos:])
+   def __hash__(self):
+      return (self.regex, self.dotpos).__hash__()
+   def matches(self, ch):
+      if self.dotpos < len(self.regex):
+         if self.regex[self.dotpos] == ch:
+            return True
+      return False
+   @property
+   def IsReduce(self):
+      return self.dotpos == len(self.regex)
+
+def InitialItemSet():
+   s = set()
+   for p in patterns:
+      s.add(Item(p, 0))
+   return frozenset(s)
+
+def NextItemSet(its, ch):
+   nis = set()
+   for item in its:
+      if item.matches(ch):
+         nis.add(Item(item.regex, item.dotpos + 1))
+   return frozenset(nis)
+
+def ClassOfToken(its):
+   for i in its:
+      if i.IsReduce:
+         return i.regex
+   return None
+
+sw = frozenset() # Empty set
+states = set()
+transitions = {}
+s0 = InitialItemSet()
+
+def preCalc():
+   states.add(s0)
+   worklist = [s0]
+   while len(worklist) > 0:
+      its = worklist.pop(0)
+      for ch in chars:
+         nis = NextItemSet(its, ch)
+         if nis and not nis in states:
+            states.add(nis)
+            worklist.append(nis)
+            transitions[(its, ch)] = nis
+      
+preCalc()
+print(len(states), states)
+print(len(transitions), transitions)
+
+# Lex function:
+Token = collections.namedtuple('Token', 'pos len name')
+
+def getNextToken():
+   readIndex = 0
+   while True:
+      startTok = readIndex
+      endTok = None
+      classTok = None
+      itemSet = s0
+      while itemSet:
+         ch = txt[readIndex]
+         # Lookup next itemset:
+         if (itemSet, ch) in transitions:
+            itemSet = transitions[(itemSet, ch)]
+         else:
+            itemSet = sw
+         # check what we recognized:
+         ct = ClassOfToken(itemSet)
+         if ct:
+            classTok = ct
+            endTok = readIndex
+         readIndex += 1
+      print('restart')
+      if classTok:
+         token = Token(startTok, endTok - startTok, classTok)
+         readIndex = endTok + 1
+      else:
+         readIndex = startTok + 1
+         token = None
+      yield token
+
+for tok in getNextToken():
+   print(tok)
+   if tok:
+      print(txt)
+      print(' ' * tok.pos + tok.name)
+