pylearn: changeset 461:1243716ade6a
Rearranged
author:    Joseph Turian <turian@iro.umontreal.ca>
date:      Wed, 08 Oct 2008 01:09:23 -0400
parents:   fda72e944104
children:  7554bc99096a 8df9179b3239
files:     sandbox/embeddings/README.txt sandbox/embeddings/__init__.py sandbox/embeddings/files.py sandbox/embeddings/parameters.py sandbox/embeddings/process.py sandbox/embeddings/read.py
diffstat:  6 files changed, 101 insertions(+), 95 deletions(-)
--- a/sandbox/embeddings/README.txt	Tue Oct 07 23:07:23 2008 -0400
+++ b/sandbox/embeddings/README.txt	Wed Oct 08 01:09:23 2008 -0400
@@ -1,3 +1,3 @@
 Messy scripts for working with Jason + Ronan's embeddings.
 
-File locations are given in files.py
+Parameters are given in parameters.py
--- a/sandbox/embeddings/__init__.py	Tue Oct 07 23:07:23 2008 -0400
+++ b/sandbox/embeddings/__init__.py	Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,1 @@
+from process import *
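The star re-export means the package namespace itself now carries the process.py API. A minimal sketch of what that enables, assuming the sandbox directory is on sys.path so the package imports as `embeddings` (that name, and the example tokens, are assumptions; the data files named in parameters.py must exist for this to run):

```python
# Sketch only: `from process import *` in __init__.py re-exports the
# public names of process.py at package level.
import embeddings  # assumes sandbox/ is importable under this name

# Both calls resolve through the package, not the submodule:
tokens = embeddings.preprocess(["Buy", "-LRB-", "3", "-RRB-", "shares"])
vector = embeddings.word_to_embedding(tokens[0])
```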
--- a/sandbox/embeddings/files.py	Tue Oct 07 23:07:23 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-"""
-Locations of the embedding data files.
-"""
-WEIGHTSFILE = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
-NUMBER_OF_WORDS = 30000
-DIMENSIONS = 50
-UNKNOWN = "UNKNOWN"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/parameters.py	Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,8 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000
+DIMENSIONS = 50
+UNKNOWN = "UNKNOWN"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/process.py	Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,91 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+from parameters import *
+
+__words = None
+__word_to_embedding = None
+__read = False
+
+def word_to_embedding(w):
+    read_embeddings()
+    return __word_to_embedding[w]
+
+def read_embeddings():
+    global __words
+    global __word_to_embedding
+    global __read
+    if __read: return
+
+    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
+    assert len(__words) == NUMBER_OF_WORDS
+
+    import numpy, math
+    from common.str import percent
+
+    __word_to_embedding = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    vals.reverse()
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = __words[i]
+        __word_to_embedding[w] = l
+    __read = True
+
+import re
+numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
+
+def preprocess(l):
+    """
+    Convert a sequence so that it can be embedded directly.
+    Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    lnew = []
+    for origw in l:
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
+        else:
+            w = origw
+        w = string.lower(w)
+        w = slashre.sub("/", w)
+        w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in __word_to_embedding
+        lnew.append(w)
+    return lnew
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(__word_to_embedding[UNKNOWN])
+#    return e
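Compared with the old read.py, the rewritten module makes the lookup table private (`__word_to_embedding`) and exposes it only through `word_to_embedding()`, with `read_embeddings()` run lazily on first use. A hedged usage sketch; the sentence is invented, and it only runs where the WEIGHTSFILE/VOCABFILE paths from parameters.py exist:

```python
# Sketch only: drives the public surface of the new process.py.
from process import preprocess, word_to_embedding

# Penn Treebank style tokens: bracket escapes, mixed case, digits.
sentence = ["The", "fund", "lost", "-LRB-", "12", "-RRB-", "percent"]

# preprocess() lowercases, maps -LRB-/-RRB- to parentheses, rewrites each
# digit to NUMBER, and substitutes UNKNOWN for out-of-vocabulary words;
# the first call triggers the lazy read of the weights file.
words = preprocess(sentence)

# Each embedding is a plain list of DIMENSIONS (50) floats.
vectors = [word_to_embedding(w) for w in words]
print("%d words x %d dimensions" % (len(vectors), len(vectors[0])))
```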
--- a/sandbox/embeddings/read.py	Tue Oct 07 23:07:23 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-from files import *
-
-WORDS = None
-WORD_TO_VECTOR = None
-__read = False
-def read_embeddings():
-    global WORDS
-    global WORD_TO_VECTOR
-    global __read
-    if __read: return
-
-    WORDS = [string.strip(w) for w in open(VOCABFILE).readlines()]
-    assert len(WORDS) == NUMBER_OF_WORDS
-
-    import numpy, math
-    from common.str import percent
-
-    WORD_TO_VECTOR = {}
-
-    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
-    f = open(WEIGHTSFILE)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
-    vals.reverse()
-    for i in range(NUMBER_OF_WORDS):
-        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
-        w = WORDS[i]
-        WORD_TO_VECTOR[w] = l
-    __read = True
-
-import re
-numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
-
-def preprocess(l):
-    """
-    Convert a sequence so that it can be embedded directly.
-    Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
-    """
-    read_embeddings()
-    lnew = []
-    for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-        w = string.lower(w)
-        w = slashre.sub("/", w)
-        w = numberre.sub("NUMBER", w)
-        if w not in WORD_TO_VECTOR:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in WORD_TO_VECTOR
-        lnew.append(w)
-    return lnew
-
-#def convert_string(s, strict=False):
-#    """
-#    Convert a string to a sequence of embeddings.
-#    @param strict: If strict, then words *must* be in the vocabulary.
-#    @todo: DEPRECATED Remove this function.
-#    """
-#    read_embeddings()
-#    e = []
-#    for origw in string.split(string.lower(s)):
-#        w = numberre.sub("NUMBER", origw)
-#        if w in WORD_TO_VECTOR:
-#            e.append(WORD_TO_VECTOR[w])
-#        else:
-#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-#            assert not strict
-#            e.append(WORD_TO_VECTOR[UNKNOWN])
-#    return e