pylearn: sandbox/embeddings/read.py comparison

\/ -> /

author	Joseph Turian <turian@iro.umontreal.ca>
date	Tue, 07 Oct 2008 23:07:23 -0400
parents	f400f62e7f9e
children

comparison

equal deleted inserted replaced

-:f400f62e7f9e
+:fda72e944104
 WORD_TO_VECTOR[w] = l
 __read = True
 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
 def preprocess(l):
 """
 Convert a sequence so that it can be embedded directly.
 Return the preprocessed sequence.
+@note: Preprocessing is appropriate for Penn Treebank style documents.
 """
 read_embeddings()
 lnew = []
 for origw in l:
 if origw == "-LRB-": w = "("
 elif origw == "-LSB-": w = "["
 elif origw == "-RSB-": w = "]"
 else:
 w = origw
 w = string.lower(w)
+w = slashre.sub("/", w)
 w = numberre.sub("NUMBER", w)
 if w not in WORD_TO_VECTOR:
 sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
 w = UNKNOWN
 assert w in WORD_TO_VECTOR

Mercurial > pylearn