changeset 532:34ee3aff3e8f

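Improved embedding word preprocessing: a word is now lowercased, slash-normalized, and number-replaced only when its original form is not already in the embedding vocabulary.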
author Joseph Turian <turian@gmail.com>
date Tue, 18 Nov 2008 02:57:50 -0500
parents 90a76a8238e8
children de974b4fc4ea
files embeddings/process.py
diffstat 1 files changed, 4 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/embeddings/process.py	Tue Nov 18 00:32:39 2008 -0500
+++ b/embeddings/process.py	Tue Nov 18 02:57:50 2008 -0500
@@ -67,9 +67,10 @@
     elif origw == "-RSB-": w = "]"
     else:
         w = origw
-        w = string.lower(w)
-        w = slashre.sub("/", w)
-        w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN