# HG changeset patch
# User Joseph Turian <turian@iro.umontreal.ca>
# Date 1223435243 14400
# Node ID fda72e944104ab6debb73acaa17902b9d47581ab
# Parent  f400f62e7f9e537a95e60704eb27d123cb9795d6
Unescape slashes during preprocessing: \/ -> /

diff -r f400f62e7f9e -r fda72e944104 sandbox/embeddings/read.py
--- a/sandbox/embeddings/read.py	Tue Oct 07 23:00:10 2008 -0400
+++ b/sandbox/embeddings/read.py	Tue Oct 07 23:07:23 2008 -0400
@@ -38,11 +38,13 @@
 
 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile(r"\\/")  # matches a literal backslash followed by "/"
 
 def preprocess(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
     """
     read_embeddings()
     lnew = []
@@ -56,6 +58,7 @@
         else:
             w = origw
             w = string.lower(w)
+            w = slashre.sub("/", w)
             w = numberre.sub("NUMBER", w)
         if w not in WORD_TO_VECTOR:
             sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))