Mercurial > pylearn
changeset 536:c6563c629984
Moved word preprocessing out
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Thu, 20 Nov 2008 06:11:52 -0500 |
parents | a6068dedfbd6 |
children | ecbad22bd2f5 27b1344a57b1 |
files | embeddings/process.py |
diffstat | 1 files changed, 5 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/embeddings/process.py Wed Nov 19 18:14:43 2008 -0500
+++ b/embeddings/process.py Thu Nov 20 06:11:52 2008 -0500
@@ -50,27 +50,17 @@
 import re
 numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
-def preprocess_word(origw):
+def preprocess_word(w):
     """
     Convert a word so that it can be embedded directly.
     Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
     """
     read_embeddings()
-    if origw == "-LRB-": w = "("
-    elif origw == "-RRB-": w = ")"
-    elif origw == "-LCB-": w = "{"
-    elif origw == "-RCB-": w = "}"
-    elif origw == "-LSB-": w = "["
-    elif origw == "-RSB-": w = "]"
-    else:
-        w = origw
-        if w not in __word_to_embedding:
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
+    if w not in __word_to_embedding:
+        w = string.lower(w)
+        w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
         # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN
```