diff sandbox/embeddings/read.py @ 460:fda72e944104

\/ -> /
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:07:23 -0400
parents f400f62e7f9e
children
line wrap: on
line diff
--- a/sandbox/embeddings/read.py	Tue Oct 07 23:00:10 2008 -0400
+++ b/sandbox/embeddings/read.py	Tue Oct 07 23:07:23 2008 -0400
@@ -38,11 +38,13 @@
 
 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile(r"\\/")  # a backslash followed by "/"; preprocess() rewrites each match to "/"
 
 def preprocess(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
     """
     read_embeddings()
     lnew = []
@@ -56,6 +58,7 @@
         else:
             w = origw
             w = string.lower(w)
+            w = slashre.sub("/", w)
             w = numberre.sub("NUMBER", w)
         if w not in WORD_TO_VECTOR:
             sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))