Mercurial > pylearn

--- a/sandbox/embeddings/read.py	Tue Oct 07 23:00:10 2008 -0400
+++ b/sandbox/embeddings/read.py	Tue Oct 07 23:07:23 2008 -0400
@@ -38,11 +38,13 @@

 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile(r"\\/")  # matches a literal backslash followed by "/"

 def preprocess(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
     """
     read_embeddings()
     lnew = []
@@ -56,6 +58,7 @@
         else:
             w = origw
             w = string.lower(w)
+            w = slashre.sub("/", w)
             w = numberre.sub("NUMBER", w)
         if w not in WORD_TO_VECTOR:
             sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))