Mercurial > pylearn
changeset 532:34ee3aff3e8f
Improved embedding word preprocessing.
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Tue, 18 Nov 2008 02:57:50 -0500 |
parents | 90a76a8238e8 |
children | de974b4fc4ea |
files | embeddings/process.py |
diffstat | 1 files changed, 4 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/embeddings/process.py Tue Nov 18 00:32:39 2008 -0500
+++ b/embeddings/process.py Tue Nov 18 02:57:50 2008 -0500
@@ -67,9 +67,10 @@
     elif origw == "-RSB-": w = "]"
     else:
         w = origw
-        w = string.lower(w)
-        w = slashre.sub("/", w)
-        w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
         # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN
```