Mercurial > pylearn
diff embeddings/process.py @ 469:4335309f4924
Split into preprocess for words and sequences
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 21 Oct 2008 16:32:06 -0400 |
parents | a07948f780b9 |
children | 90a76a8238e8 |
line wrap: on
line diff
```diff
--- a/embeddings/process.py Tue Oct 21 16:24:44 2008 -0400
+++ b/embeddings/process.py Tue Oct 21 16:32:06 2008 -0400
@@ -45,7 +45,31 @@
 numberre = re.compile("[0-9]")
 slashre = re.compile("\\\/")
 
-def preprocess(l):
+def preprocess_word(origw):
+    """
+    Convert a word so that it can be embedded directly.
+    Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    if origw == "-LRB-": w = "("
+    elif origw == "-RRB-": w = ")"
+    elif origw == "-LCB-": w = "{"
+    elif origw == "-RCB-": w = "}"
+    elif origw == "-LSB-": w = "["
+    elif origw == "-RSB-": w = "]"
+    else:
+        w = origw
+        w = string.lower(w)
+        w = slashre.sub("/", w)
+        w = numberre.sub("NUMBER", w)
+    if w not in __word_to_embedding:
+# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+        w = UNKNOWN
+    assert w in __word_to_embedding
+    return w
+
+def preprocess_seq(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
@@ -54,21 +78,7 @@
     read_embeddings()
     lnew = []
     for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
-        if w not in __word_to_embedding:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in __word_to_embedding
+        w = preprocess_word(origw)
         lnew.append(w)
     return lnew
```