pylearn: embeddings/process.py comparison

Added preprocessing back in

author	Joseph Turian <turian@gmail.com>
date	Thu, 20 Nov 2008 06:38:06 -0500
parents	c6563c629984
children

comparison

equal deleted inserted replaced

-:c6563c629984
+:27b1344a57b1
 for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w])
 sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
-def preprocess_word(w):
+def preprocess_word(origw):
 """
 Convert a word so that it can be embedded directly.
 Returned the preprocessed sequence.
-@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
+@note: Preprocessing is appropriate for Penn Treebank style documents.
+#@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
 """
 read_embeddings()
-if w not in __word_to_embedding:
+if origw == "-LRB-": w = "("
-w = string.lower(w)
+elif origw == "-RRB-": w = ")"
-w = numberre.sub("NUMBER", w)
+elif origw == "-LCB-": w = "{"
+elif origw == "-RCB-": w = "}"
+elif origw == "-LSB-": w = "["
+elif origw == "-RSB-": w = "]"
+else:
+w = origw
+if w not in __word_to_embedding:
+w = string.lower(w)
+w = slashre.sub("/", w)
+w = numberre.sub("NUMBER", w)
+#    if w not in __word_to_embedding:
+#        w = string.lower(w)
+#        w = numberre.sub("NUMBER", w)
 if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
 w = UNKNOWN
 assert w in __word_to_embedding
 return w

Mercurial > pylearn