Mercurial > pylearn
changeset 675:a5a41b7ddd26
merge
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Wed, 01 Apr 2009 19:48:47 -0400 |
parents | f3b7d6956209 (current diff) 8fff4bc26f4c (diff) |
children | fec0ba6f8c8f |
files | |
diffstat | 1 files changed, 20 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/pylearn/datasets/embeddings/process.py Wed Apr 01 19:48:32 2009 -0400
+++ b/pylearn/datasets/embeddings/process.py Wed Apr 01 19:48:47 2009 -0400
@@ -50,17 +50,31 @@
 
 import re
 numberre = re.compile("[0-9]")
-
-def preprocess_word(w):
+slashre = re.compile("\\\/")
+
+def preprocess_word(origw):
     """
     Convert a word so that it can be embedded directly.
     Returned the preprocessed sequence.
-    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    #@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
     """
     read_embeddings()
-    if w not in __word_to_embedding:
-        w = string.lower(w)
-        w = numberre.sub("NUMBER", w)
+    if origw == "-LRB-": w = "("
+    elif origw == "-RRB-": w = ")"
+    elif origw == "-LCB-": w = "{"
+    elif origw == "-RCB-": w = "}"
+    elif origw == "-LSB-": w = "["
+    elif origw == "-RSB-": w = "]"
+    else:
+        w = origw
+        if w not in __word_to_embedding:
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+#    if w not in __word_to_embedding:
+#        w = string.lower(w)
+#        w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN