changeset 469:4335309f4924

Split into preprocess for words and sequences
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:32:06 -0400
parents a07948f780b9
children bd937e845bbb
files embeddings/process.py
diffstat 1 files changed, 26 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/embeddings/process.py	Tue Oct 21 16:24:44 2008 -0400
+++ b/embeddings/process.py	Tue Oct 21 16:32:06 2008 -0400
@@ -45,7 +45,31 @@
 numberre = re.compile("[0-9]")
 slashre = re.compile("\\\/")
 
-def preprocess(l):
+def preprocess_word(origw):
+    """
+    Convert a word so that it can be embedded directly.
+    Return the preprocessed word (UNKNOWN if the word has no embedding).
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    if origw == "-LRB-": w = "("
+    elif origw == "-RRB-": w = ")"
+    elif origw == "-LCB-": w = "{"
+    elif origw == "-RCB-": w = "}"
+    elif origw == "-LSB-": w = "["
+    elif origw == "-RSB-": w = "]"
+    else:
+        w = origw
+        w = string.lower(w)
+        w = slashre.sub("/", w)
+        w = numberre.sub("NUMBER", w)
+    if w not in __word_to_embedding:
+#        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+        w = UNKNOWN
+    assert w in __word_to_embedding
+    return w
+
+def preprocess_seq(l):
     """
     Convert a sequence so that it can be embedded directly.
     Returned the preprocessed sequence.
@@ -54,21 +78,7 @@
     read_embeddings()
     lnew = []
     for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
-        if w not in __word_to_embedding:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in __word_to_embedding
+        w = preprocess_word(origw)
         lnew.append(w)
     return lnew