changeset 456:131e19dfe793

Added sandbox.embeddings
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 17:56:52 -0400
parents fb62f0e4bcfe
children 34acf8db186d
files sandbox/embeddings/README.txt sandbox/embeddings/convert.py sandbox/embeddings/one-per-line.py sandbox/embeddings/original.py sandbox/embeddings/read-original.py
diffstat 5 files changed, 140 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/README.txt	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,1 @@
+Messy scripts for working with Jason + Ronan's embeddings.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/convert.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+import sys, string
+import original
+import yaml
+
+# Pair each stripped input line with its embedding sequence.
+stripped = (string.strip(raw) for raw in sys.stdin)
+pairs = [(text, original.convert_string(text)) for text in stripped]
+
+# Emit the whole list as one YAML dump on stdout.
+print yaml.dump(pairs)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/one-per-line.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"  # weights file; its second line is parsed below
+vocabfile = "words.asc"  # not read by this script
+size = 30000  # number of vectors expected in the weights file
+dimensions = 50  # number of values per vector
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()  # first line is skipped; all values are on the second
+vals = [float(v) for v in string.split(f.readline())]
+f.close()
+assert len(vals) == size * dimensions
+vals.reverse()
+# Emit chunks of the reversed list from last to first, one per line,
+# tab-separated, using repr() of each float.
+for i in range(size - 1, -1, -1):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    print string.join([`s` for s in l], "\t")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/original.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,53 @@
+"""
+Load the vocabulary and weights files at import time; convert text to embeddings.
+"""
+
+import string
+import sys
+
+WORDS = None
+WORD_TO_VECTOR = None
+def read():
+    """
+    Load the vocabulary and weights files into WORDS and WORD_TO_VECTOR.
+
+    The first line of the weights file is skipped; the second must hold
+    size * dimensions floats, which are reversed before being sliced.
+    """
+    global WORDS, WORD_TO_VECTOR
+    weightsfile = "lm-weights.txt"
+    vocabfile = "words.asc"
+    size = 30000
+    dimensions = 50
+
+    vocab = open(vocabfile)
+    WORDS = [string.strip(w) for w in vocab.readlines()]
+    vocab.close()
+    assert len(WORDS) == size
+
+    sys.stderr.write("Reading %s...\n" % weightsfile)
+    f = open(weightsfile)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    f.close()
+    assert len(vals) == size * dimensions
+    vals.reverse()
+    WORD_TO_VECTOR = dict((WORDS[i], vals[dimensions*i:dimensions*(i+1)])
+                          for i in range(size))
+
+import re
+numberre = re.compile("[0-9]")
+def convert_string(s):
+    """
+    Lower-case and split s, then return the vectors of the words found in WORD_TO_VECTOR.
+    """
+    vectors = []
+    for token in string.lower(s).split():
+        key = numberre.sub("NUMBER", token)  # every digit becomes "NUMBER"
+        if key not in WORD_TO_VECTOR:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (key, token))
+            continue
+        vectors.append(WORD_TO_VECTOR[key])
+    return vectors
+
+read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/read-original.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"  # weights file; its second line is parsed below
+vocabfile = "words.asc"  # vocabulary file, one word per line
+size = 30000  # number of vectors expected in the weights file
+dimensions = 50  # number of values per vector
+
+words = [string.strip(w) for w in open(vocabfile).readlines()]
+assert len(words) == size  # one vocabulary entry per vector
+
+import numpy, math
+import sys
+from common.str import percent
+
+f = open(weightsfile)
+f.readline()  # first line is skipped; all values are on the second
+vals = [float(v) for v in string.split(f.readline())]
+f.close()
+assert len(vals) == size * dimensions
+vals.reverse()
+word_to_vector = {}  # word -> its slice of the reversed value list
+for i in range(size):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    w = words[i]
+    word_to_vector[w] = l
+
+#    l2 = numpy.asarray(l)
+#    print math.fabs(50 - numpy.sum(l2*l2)), w
+
+# Print every word pair (i1 > i2) whose squared distance is below 50.
+cnt = 0
+for i1 in range(len(words)):
+    w1 = words[i1]
+    l1 = numpy.asarray(word_to_vector[w1])  # hoisted: invariant over i2
+    for i2 in range(len(words)):
+        cnt += 1  # counts every (i1, i2) pair, including skipped ones
+        if i1 <= i2: continue
+        w2 = words[i2]
+        d = numpy.asarray(word_to_vector[w2]) - l1
+        dist = numpy.sum(d * d)
+        if dist < 50:
+            print dist, w1, w2, i1, i2
+        if cnt % 1000 == 0:
+            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))