changeset 468:a07948f780b9

Moved embeddings out of sandbox
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:24:44 -0400
parents f3711bcc467e
children 4335309f4924
files embeddings/README.txt embeddings/__init__.py embeddings/convert.py embeddings/one-per-line.py embeddings/parameters.py embeddings/process.py embeddings/read-original.py sandbox/embeddings/README.txt sandbox/embeddings/__init__.py sandbox/embeddings/convert.py sandbox/embeddings/one-per-line.py sandbox/embeddings/parameters.py sandbox/embeddings/process.py sandbox/embeddings/read-original.py
diffstat 14 files changed, 215 insertions(+), 215 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/README.txt	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,3 @@
+Messy scripts for working with Jason + Ronan's embeddings.
+
+Parameters are given in parameters.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/__init__.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,1 @@
+from process import *
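
The star import above re-exports the helpers defined in process.py, so callers can pull them straight off the package. A minimal usage line, assuming the repository root is on the Python path and a Python 2 interpreter (these scripts rely on print statements and implicit relative imports):

    from embeddings import preprocess, word_to_embedding  # re-exported via process.py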
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/convert.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,15 @@
+#!/usr/bin/python
+"""
+Convert stdin sentences to word embeddings, and output YAML.
+"""
+
+import sys, string
+import read
+import yaml
+
+output = []
+for l in sys.stdin:
+    l = string.strip(l)
+    output.append((l, read.convert_string(l)))
+
+print yaml.dump(output)
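
As committed, convert.py imports a `read` module and calls read.convert_string, but the package ships its conversion code in process.py, where convert_string is commented out as deprecated. Below is a hedged sketch of the same stdin-to-YAML pipeline written against the functions process.py actually exports; the whitespace tokenization is an assumption:

    #!/usr/bin/python
    # Hedged sketch, not part of the changeset: convert stdin sentences to
    # (sentence, list-of-embeddings) pairs and dump them as YAML, using the
    # exported preprocess() and word_to_embedding() helpers.
    import sys
    import yaml
    from embeddings import preprocess, word_to_embedding

    output = []
    for line in sys.stdin:
        line = line.strip()
        tokens = preprocess(line.split())   # whitespace tokenization is assumed
        output.append((line, [word_to_embedding(w) for w in tokens]))

    print yaml.dump(output)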
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/one-per-line.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+#for i in range(size):
+r = range(size)
+r.reverse()
+for i in r:
+    l = vals[dimensions*i:dimensions*(i+1)]
+    print string.join([`s` for s in l], "\t")
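
The reverse-then-iterate-backwards logic above is hard to follow. Below is a hedged numpy sketch of the same one-embedding-per-line dump, under the assumption that the second line of lm-weights.txt holds size * dimensions floats in word-major order; the reversal in the original suggests the on-disk order may differ, so treat the layout as an assumption to verify:

    # Hedged sketch, not part of the changeset.
    import numpy

    def dump_one_per_line(weightsfile="lm-weights.txt", size=30000, dimensions=50):
        f = open(weightsfile)
        f.readline()                                    # skip the header line
        vals = numpy.fromstring(f.readline(), sep=" ")  # one long line of floats
        assert vals.size == size * dimensions
        # Assumed word-major layout: one 50-dimensional vector per output line.
        for row in vals.reshape(size, dimensions):
            print "\t".join(repr(v) for v in row)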
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/parameters.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,8 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE     = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE       = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000
+DIMENSIONS      = 50
+UNKNOWN         = "UNKNOWN"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/process.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,114 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+from parameters import *
+
+__words = None
+__word_to_embedding = None
+__read = False
+
+def word_to_embedding(w):
+    read_embeddings()
+    return __word_to_embedding[w]
+
+def read_embeddings():
+    global __words
+    global __word_to_embedding
+    global __read 
+    if __read: return
+
+    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
+    assert len(__words) == NUMBER_OF_WORDS
+
+    import numpy, math
+    from common.str import percent
+
+    __word_to_embedding = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = __words[i]
+        __word_to_embedding[w] = l
+    __read = True
+    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
+
+import re
+numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
+
+def preprocess(l):
+    """
+    Convert a sequence so that it can be embedded directly.
+    Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    lnew = []
+    for origw in l:
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
+        else:
+            w = origw
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in __word_to_embedding
+        lnew.append(w)
+    return lnew
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in __word_to_embedding:
+#            e.append(__word_to_embedding[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(__word_to_embedding[UNKNOWN])
+#    return e
+
+#def test():
+#    """
+#    Debugging code.
+#    """
+#    read_embeddings()
+#    for w in __word_to_embedding:
+#        assert len(__word_to_embedding[w]) == 50
+#    import numpy
+#    for w1 in __words:
+#        e1 = numpy.asarray(__word_to_embedding[w1])
+#        lst = []
+#        print w1, numpy.dot(e1, e1)
+#        for w2 in __word_to_embedding:
+#            if w1 >= w2: continue
+#            e2 = numpy.asarray(__word_to_embedding[w2])
+#            d = (e1 - e2)
+#            l2 = numpy.dot(d, d)
+#            lst.append((l2, w1, w2))
+#        lst.sort()
+#        print lst[:10]
+#
+#test()
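
For reference, a hedged usage sketch of the module above: embeddings are loaded lazily on first use, Penn Treebank bracket tokens are mapped back to literal brackets, digits are rewritten to NUMBER, and out-of-vocabulary tokens fall back to UNKNOWN. The example sentence is made up for illustration:

    # Hedged sketch, not part of the changeset.
    from embeddings import preprocess, word_to_embedding

    tokens = ["-LRB-", "Prices", "rose", "3", "%", "-RRB-", "."]
    clean = preprocess(tokens)        # e.g. ["(", "prices", "rose", "NUMBER", ...]
    vectors = [word_to_embedding(w) for w in clean]
    assert all(len(v) == 50 for v in vectors)   # 50 = DIMENSIONS in parameters.py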
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/read-original.py	Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+words = [string.strip(w) for w in open(vocabfile).readlines()]
+assert len(words) == 30000
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+for i in range(size):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    w = words[i]
+    word_to_vector[w] = l
+
+#    l2 = numpy.asarray(l)
+#    print math.fabs(50 - numpy.sum(l2*l2)), w
+
+cnt = 0
+for i1 in range(len(words)):
+    for i2 in range(len(words)):
+        w1 = words[i1]
+        w2 = words[i2]
+        cnt += 1
+        if i1 <= i2: continue
+        l1 = numpy.asarray(word_to_vector[w1])
+        l2 = numpy.asarray(word_to_vector[w2])
+        d = l2 - l1
+        dist = numpy.sum(d * d)
+        if dist < 50:
+            print numpy.sum(d * d), w1, w2, i1, i2
+        if cnt % 1000 == 0:
+            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))
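
The nested loop above is an O(n^2) scan in pure Python, roughly 9e8 pairs at size 30000. Below is a hedged numpy sketch of the same close-pair search, vectorizing the inner loop with the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b; words and word_to_vector are the variables built above:

    # Hedged sketch, not part of the changeset.
    E = numpy.asarray([word_to_vector[w] for w in words])   # shape (size, dimensions)
    sq_norms = (E * E).sum(axis=1)
    for i1 in range(len(words)):
        # squared Euclidean distances from word i1 to every word at once
        d2 = sq_norms + sq_norms[i1] - 2 * numpy.dot(E, E[i1])
        for i2 in numpy.nonzero(d2 < 50)[0]:
            if i2 < i1:                                      # same i1 > i2 filter as above
                print d2[i2], words[i1], words[i2], i1, i2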
--- a/sandbox/embeddings/README.txt	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-Messy scripts for working with Jason + Ronan's embeddings.
-
-Parameters are given in parameters.py
--- a/sandbox/embeddings/__init__.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-from process import *
--- a/sandbox/embeddings/convert.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-#!/usr/bin/python
-"""
-Convert stdin sentences to word embeddings, and output YAML.
-"""
-
-import sys, string
-import read
-import yaml
-
-output = []
-for l in sys.stdin:
-    l = string.strip(l)
-    output.append((l, read.convert_string(l)))
-
-print yaml.dump(output)
--- a/sandbox/embeddings/one-per-line.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-
-import string
-#import psyco
-
-weightsfile = "lm-weights.txt"
-vocabfile = "words.asc"
-size = 30000
-dimensions = 50
-
-import numpy, math
-import sys
-from common.str import percent
-
-word_to_vector = {}
-
-f = open(weightsfile)
-f.readline()
-vals = [float(v) for v in string.split(f.readline())]
-assert len(vals) == size * dimensions
-vals.reverse()
-#for i in range(size):
-r = range(size)
-r.reverse()
-for i in r:
-    l = vals[dimensions*i:dimensions*(i+1)]
-    print string.join([`s` for s in l], "\t")
--- a/sandbox/embeddings/parameters.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-"""
-Locations of the embedding data files.
-"""
-WEIGHTSFILE     = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE       = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
-NUMBER_OF_WORDS = 30000
-DIMENSIONS      = 50
-UNKNOWN         = "UNKNOWN"
--- a/sandbox/embeddings/process.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,114 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-from parameters import *
-
-__words = None
-__word_to_embedding = None
-__read = False
-
-def word_to_embedding(w):
-    read_embeddings()
-    return __word_to_embedding[w]
-
-def read_embeddings():
-    global __words
-    global __word_to_embedding
-    global __read 
-    if __read: return
-
-    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
-    assert len(__words) == NUMBER_OF_WORDS
-
-    import numpy, math
-    from common.str import percent
-
-    __word_to_embedding = {}
-
-    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
-    f = open(WEIGHTSFILE)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
-    for i in range(NUMBER_OF_WORDS):
-        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
-        w = __words[i]
-        __word_to_embedding[w] = l
-    __read = True
-    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
-
-import re
-numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
-
-def preprocess(l):
-    """
-    Convert a sequence so that it can be embedded directly.
-    Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
-    """
-    read_embeddings()
-    lnew = []
-    for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
-        if w not in __word_to_embedding:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in __word_to_embedding
-        lnew.append(w)
-    return lnew
-
-#def convert_string(s, strict=False):
-#    """
-#    Convert a string to a sequence of embeddings.
-#    @param strict: If strict, then words *must* be in the vocabulary.
-#    @todo: DEPRECATED Remove this function.
-#    """
-#    read_embeddings()
-#    e = []
-#    for origw in string.split(string.lower(s)):
-#        w = numberre.sub("NUMBER", origw)
-#        if w in __word_to_embedding:
-#            e.append(__word_to_embedding[w])
-#        else:
-#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-#            assert not strict
-#            e.append(__word_to_embedding[UNKNOWN])
-#    return e
-
-#def test():
-#    """
-#    Debugging code.
-#    """
-#    read_embeddings()
-#    for w in __word_to_embedding:
-#        assert len(__word_to_embedding[w]) == 50
-#    import numpy
-#    for w1 in __words:
-#        e1 = numpy.asarray(__word_to_embedding[w1])
-#        lst = []
-#        print w1, numpy.dot(e1, e1)
-#        for w2 in __word_to_embedding:
-#            if w1 >= w2: continue
-#            e2 = numpy.asarray(__word_to_embedding[w2])
-#            d = (e1 - e2)
-#            l2 = numpy.dot(d, d)
-#            lst.append((l2, w1, w2))
-#        lst.sort()
-#        print lst[:10]
-#
-#test()
--- a/sandbox/embeddings/read-original.py	Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-#!/usr/bin/python
-
-import string
-#import psyco
-
-weightsfile = "lm-weights.txt"
-vocabfile = "words.asc"
-size = 30000
-dimensions = 50
-
-words = [string.strip(w) for w in open(vocabfile).readlines()]
-assert len(words) == 30000
-
-import numpy, math
-import sys
-from common.str import percent
-
-word_to_vector = {}
-
-f = open(weightsfile)
-f.readline()
-vals = [float(v) for v in string.split(f.readline())]
-assert len(vals) == size * dimensions
-vals.reverse()
-for i in range(size):
-    l = vals[dimensions*i:dimensions*(i+1)]
-    w = words[i]
-    word_to_vector[w] = l
-
-#    l2 = numpy.asarray(l)
-#    print math.fabs(50 - numpy.sum(l2*l2)), w
-
-cnt = 0
-for i1 in range(len(words)):
-    for i2 in range(len(words)):
-        w1 = words[i1]
-        w2 = words[i2]
-        cnt += 1
-        if i1 <= i2: continue
-        l1 = numpy.asarray(word_to_vector[w1])
-        l2 = numpy.asarray(word_to_vector[w2])
-        d = l2 - l1
-        dist = numpy.sum(d * d)
-        if dist < 50:
-            print numpy.sum(d * d), w1, w2, i1, i2
-        if cnt % 1000 == 0:
-            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))