changeset 536:c6563c629984

Moved word preprocessing out
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:11:52 -0500
parents a6068dedfbd6
children ecbad22bd2f5 27b1344a57b1
files embeddings/process.py
diffstat 1 files changed, 5 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/embeddings/process.py	Wed Nov 19 18:14:43 2008 -0500
+++ b/embeddings/process.py	Thu Nov 20 06:11:52 2008 -0500
@@ -50,27 +50,17 @@
 
 import re
 numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
 
-def preprocess_word(origw):
+def preprocess_word(w):
     """
     Convert a word so that it can be embedded directly.
     Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
     """
     read_embeddings()
-    if origw == "-LRB-": w = "("
-    elif origw == "-RRB-": w = ")"
-    elif origw == "-LCB-": w = "{"
-    elif origw == "-RCB-": w = "}"
-    elif origw == "-LSB-": w = "["
-    elif origw == "-RSB-": w = "]"
-    else:
-        w = origw
-        if w not in __word_to_embedding:
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
+    if w not in __word_to_embedding:
+        w = string.lower(w)
+        w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN