changeset 459:f400f62e7f9e

Fixed embedding preprocessing
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:00:10 -0400
parents ed6b0b3be8d2
children fda72e944104
files sandbox/embeddings/__init__.py sandbox/embeddings/files.py sandbox/embeddings/read.py
diffstat 2 files changed, 40 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/sandbox/embeddings/files.py	Tue Oct 07 19:13:53 2008 -0400
+++ b/sandbox/embeddings/files.py	Tue Oct 07 23:00:10 2008 -0400
@@ -5,3 +5,4 @@
 VOCABFILE       = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
 NUMBER_OF_WORDS = 30000
 DIMENSIONS      = 50
+UNKNOWN         = "UNKNOWN"
--- a/sandbox/embeddings/read.py	Tue Oct 07 19:13:53 2008 -0400
+++ b/sandbox/embeddings/read.py	Tue Oct 07 23:00:10 2008 -0400
@@ -38,16 +38,46 @@
 
 import re
 numberre = re.compile("[0-9]")
-def convert_string(s):
+
+def preprocess(l):
     """
-    Convert a string to a sequence of embeddings.
+    Convert a sequence so that it can be embedded directly.
+    Return the preprocessed sequence.
     """
     read_embeddings()
-    e = []
-    for origw in string.split(string.lower(s)):
-        w = numberre.sub("NUMBER", origw)
-        if w in WORD_TO_VECTOR:
-            e.append(WORD_TO_VECTOR[w])
+    lnew = []
+    for origw in l:
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
         else:
-            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
-    return e
+            w = origw
+            w = string.lower(w)
+            w = numberre.sub("NUMBER", w)
+        if w not in WORD_TO_VECTOR:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in WORD_TO_VECTOR
+        lnew.append(w)
+    return lnew
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in WORD_TO_VECTOR:
+#            e.append(WORD_TO_VECTOR[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(WORD_TO_VECTOR[UNKNOWN])
+#    return e