diff sandbox/embeddings/process.py @ 461:1243716ade6a

Rearranged
author Joseph Turian <turian@iro.umontreal.ca>
date Wed, 08 Oct 2008 01:09:23 -0400
parents sandbox/embeddings/read.py@fda72e944104
children 121cc6db4481
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/process.py	Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,107 @@
+"""
+Read in the weights file
+"""
+
+import re
+import sys
+
+from parameters import VOCABFILE, WEIGHTSFILE, NUMBER_OF_WORDS, DIMENSIONS, UNKNOWN
+
+# Module-level cache, populated lazily by read_embeddings().
+__words = None
+__word_to_embedding = None
+__read = False
+
+def word_to_embedding(w):
+    """Return the embedding for word w, loading the embeddings on first use."""
+    read_embeddings()
+    return __word_to_embedding[w]
+
+def read_embeddings():
+    """Load the vocabulary and the embedding weights.  No-op after the first call."""
+    global __words
+    global __word_to_embedding
+    global __read
+    if __read: return
+
+    __words = [w.strip() for w in open(VOCABFILE)]
+    assert len(__words) == NUMBER_OF_WORDS
+
+    __word_to_embedding = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    vals.reverse()
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = __words[i]
+        __word_to_embedding[w] = l
+    __read = True
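+
+# A minimal usage sketch (hypothetical; assumes the token "the" appears in
+# VOCABFILE):
+#
+#   emb = word_to_embedding("the")
+#   assert len(emb) == DIMENSIONS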
+
+numberre = re.compile("[0-9]")
+slashre = re.compile(r"\\/")    # the Penn Treebank escaped slash, "\/"
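+# For example (hypothetical inputs): slashre.sub("/", "1\\/2") yields "1/2",
+# and numberre.sub("NUMBER", "42") yields "NUMBERNUMBER" -- each digit is
+# replaced individually.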
+
+def preprocess(l):
+    """
+    Convert a sequence of words so that it can be embedded directly.
+    Return the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    lnew = []
+    for origw in l:
+        # Map Penn Treebank bracket escapes back to literal brackets.
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
+        else:
+            w = origw.lower()
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in __word_to_embedding
+        lnew.append(w)
+    return lnew
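+
+# A minimal usage sketch for preprocess() (the tokens here are hypothetical;
+# out-of-vocabulary words fall back to UNKNOWN):
+#
+#   for w in preprocess(["-LRB-", "Costs", "rose", "-RRB-"]):
+#       print w, word_to_embedding(w)[:3]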
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in __word_to_embedding:
+#            e.append(__word_to_embedding[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(__word_to_embedding[UNKNOWN])
+#    return e