Mercurial > pylearn
diff sandbox/embeddings/read.py @ 460:fda72e944104
\/ -> /
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 23:07:23 -0400 |
parents | f400f62e7f9e |
children |
line wrap: on
line diff
```diff
--- a/sandbox/embeddings/read.py Tue Oct 07 23:00:10 2008 -0400
+++ b/sandbox/embeddings/read.py Tue Oct 07 23:07:23 2008 -0400
@@ -38,11 +38,13 @@
 
 import re
 numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
 
 def preprocess(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
     """
     read_embeddings()
     lnew = []
@@ -56,6 +58,7 @@
         else:
             w = origw
         w = string.lower(w)
+        w = slashre.sub("/", w)
         w = numberre.sub("NUMBER", w)
         if w not in WORD_TO_VECTOR:
             sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
```