pylearn: sandbox/embeddings/read-original.py comparison

Added sandbox.embeddings

author	Joseph Turian <turian@iro.umontreal.ca>
date	Tue, 07 Oct 2008 17:56:52 -0400
parents
children

comparison

equal deleted inserted replaced

-:fb62f0e4bcfe
+:131e19dfe793
#!/usr/bin/python
"""Load word vectors from a weights file and report close word pairs.

The script reads a vocabulary file (one word per line) and a weights file
whose second line holds ``size * dimensions`` whitespace-separated floats.
Each word is mapped to a ``dimensions``-long slice of those values, and
every pair of words whose squared Euclidean distance is below 50 is
printed to stdout.  Progress is reported on stderr.
"""
import string
#import psyco
import math
import sys

import numpy

from common.str import percent

weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
size = 30000        # expected number of vocabulary entries
dimensions = 50     # number of values per word vector

# One word per line; surrounding whitespace (including the newline) is
# stripped.  The file is closed as soon as it has been read.
with open(vocabfile) as vocab:
    words = [w.strip() for w in vocab]
assert len(words) == size

word_to_vector = {}
with open(weightsfile) as f:
    # The first line is skipped; all weights are on the second line.
    f.readline()
    vals = [float(v) for v in f.readline().split()]
assert len(vals) == size * dimensions

# The flat value list is reversed before being cut into per-word slices.
# NOTE(review): the reason is not visible here -- presumably the weights
# file stores the values in reverse order; confirm against its producer.
vals.reverse()
for i in range(size):
    l = vals[dimensions * i:dimensions * (i + 1)]
    w = words[i]
    word_to_vector[w] = l
#    l2 = numpy.asarray(l)
#    print math.fabs(50 - numpy.sum(l2*l2)), w

# Convert every vector to an array once, instead of on each iteration of
# the quadratic loop below.  The lookup goes through word_to_vector so a
# word that appears more than once in the vocabulary keeps resolving to
# the vector stored last for it.
vectors = [numpy.asarray(word_to_vector[w]) for w in words]

cnt = 0
total = len(word_to_vector) * len(word_to_vector)
for i1, w1 in enumerate(words):
    l1 = vectors[i1]
    for i2, w2 in enumerate(words):
        cnt += 1
        # Only look at each unordered pair once (i2 < i1), never at (i, i).
        if i1 <= i2:
            continue
        d = vectors[i2] - l1
        dist = numpy.sum(d * d)
        if dist < 50:
            # A single pre-formatted string keeps the output identical
            # under both the print statement and the print function.
            print("%s %s %s %s %s" % (dist, w1, w2, i1, i2))
        # NOTE(review): progress is only reported for pairs that reach this
        # point (i2 < i1), so some multiples of 1000 are skipped.
        if cnt % 1000 == 0:
            sys.stderr.write("%s done...\n" % percent(cnt, total))

Mercurial > pylearn