# HG changeset patch
# User James Bergstra <bergstrj@iro.umontreal.ca>
# Date 1238629727 14400
# Node ID a5a41b7ddd266d17ae1b9a6f2408133df30f5566
# Parent  f3b7d69562092a4ff0fbacca594f0602168dd54c
# Parent  8fff4bc26f4c1204dca826872634248c2a21fd66
merge

diff -r f3b7d6956209 -r a5a41b7ddd26 pylearn/datasets/embeddings/process.py
--- a/pylearn/datasets/embeddings/process.py	Wed Apr 01 19:48:32 2009 -0400
+++ b/pylearn/datasets/embeddings/process.py	Wed Apr 01 19:48:47 2009 -0400
@@ -50,17 +50,31 @@
 
 import re
 numberre = re.compile("[0-9]")
-
-def preprocess_word(w):
+slashre = re.compile("\\\/")
+ 
+def preprocess_word(origw):
     """
     Convert a word so that it can be embedded directly.
     Returned the preprocessed sequence.
-    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    #@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
     """
     read_embeddings()
-    if w not in __word_to_embedding:
-        w = string.lower(w)
-        w = numberre.sub("NUMBER", w)
+    if origw == "-LRB-": w = "("
+    elif origw == "-RRB-": w = ")"
+    elif origw == "-LCB-": w = "{"
+    elif origw == "-RCB-": w = "}"
+    elif origw == "-LSB-": w = "["
+    elif origw == "-RSB-": w = "]"
+    else:
+        w = origw
+        if w not in __word_to_embedding:
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+#    if w not in __word_to_embedding:
+#        w = string.lower(w)
+#        w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN