# HG changeset patch # User Joseph Turian # Date 1223435243 14400 # Node ID fda72e944104ab6debb73acaa17902b9d47581ab # Parent f400f62e7f9e537a95e60704eb27d123cb9795d6 \/ -> / diff -r f400f62e7f9e -r fda72e944104 sandbox/embeddings/read.py --- a/sandbox/embeddings/read.py Tue Oct 07 23:00:10 2008 -0400 +++ b/sandbox/embeddings/read.py Tue Oct 07 23:07:23 2008 -0400 @@ -38,11 +38,13 @@ import re numberre = re.compile("[0-9]") +slashre = re.compile("\\\/") def preprocess(l): """ Convert a sequence so that it can be embedded directly. Returned the preprocessed sequence. + @note: Preprocessing is appropriate for Penn Treebank style documents. """ read_embeddings() lnew = [] @@ -56,6 +58,7 @@ else: w = origw w = string.lower(w) + w = slashre.sub("/", w) w = numberre.sub("NUMBER", w) if w not in WORD_TO_VECTOR: sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))