comparison embeddings/read-original.py @ 468:a07948f780b9

Moved embeddings out of sandbox
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:24:44 -0400
parents sandbox/embeddings/read-original.py@131e19dfe793
children
comparison
equal deleted inserted replaced
467:f3711bcc467e 468:a07948f780b9
#!/usr/bin/python
"""Read the original embeddings and report pairs of words with close vectors.

The vocabulary is read from ``vocabfile`` (one word per line) and the
weights from ``weightsfile``.  Every ordered pair of vocabulary positions
``(i1, i2)`` with ``i1 > i2`` whose squared Euclidean distance is below 50 is
written to stdout as ``dist w1 w2 i1 i2``.  Progress goes to stderr.
"""

import math
import string
import sys
#import psyco

import numpy

weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
size = 30000
dimensions = 50


def read_vocabulary(path, expected_size):
    """Return the words stored one per line in *path*.

    Surrounding whitespace (including the newline) is stripped from each
    line.  Raises AssertionError when the number of lines is not
    *expected_size*.
    """
    # The context manager closes the file; the original left it open.
    with open(path) as vocab:
        vocabulary = [line.strip() for line in vocab]
    assert len(vocabulary) == expected_size
    return vocabulary


def read_vectors(path, vocabulary, dims):
    """Return a dict mapping each word of *vocabulary* to a list of *dims* floats.

    The first line of *path* is skipped; the second line must hold exactly
    ``len(vocabulary) * dims`` whitespace-separated numbers, otherwise
    AssertionError is raised.  If a word occurs more than once in
    *vocabulary*, the vector of its last occurrence wins.
    """
    with open(path) as weights:
        weights.readline()  # first line is read and discarded
        vals = [float(v) for v in weights.readline().split()]
    assert len(vals) == len(vocabulary) * dims

    # The whole flat list is reversed before slicing, so word i receives
    # values counted from the END of the line, in reverse order.
    # NOTE(review): presumably this matches the layout of the original
    # weights file -- confirm against the program that wrote it.
    vals.reverse()

    vectors_by_word = {}
    for i, w in enumerate(vocabulary):
        l = vals[dims * i:dims * (i + 1)]
        vectors_by_word[w] = l

        # Debugging aid kept from the original: distance of the squared norm
        # of each vector from 50.
        # l2 = numpy.asarray(l)
        # print math.fabs(50 - numpy.sum(l2*l2)), w
    return vectors_by_word


def close_pairs(vocabulary, vectors_by_word, threshold=50, progress=None,
                every=1000):
    """Yield ``(dist, w1, w2, i1, i2)`` for every close pair of words.

    All index pairs of *vocabulary* are counted, but only those with
    ``i1 > i2`` are compared, so each unordered pair is examined once.
    ``dist`` is the squared Euclidean distance between the two vectors; a
    pair is yielded when ``dist < threshold``.

    If *progress* is given, it is called as ``progress(done, total)`` after
    every *every*-th counted pair.  ``total`` is the square of the number of
    entries in *vectors_by_word*, as in the original progress message.
    """
    # Convert each vector once instead of twice per compared pair.
    arrays = [numpy.asarray(vectors_by_word[w]) for w in vocabulary]
    total = len(vectors_by_word) * len(vectors_by_word)

    cnt = 0
    for i1, w1 in enumerate(vocabulary):
        for i2, w2 in enumerate(vocabulary):
            cnt += 1
            if i1 > i2:
                d = arrays[i2] - arrays[i1]
                dist = numpy.sum(d * d)
                if dist < threshold:
                    yield dist, w1, w2, i1, i2
            # Checked for every counted pair; the original skipped this
            # check whenever i1 <= i2, which made progress output uneven.
            if progress is not None and cnt % every == 0:
                progress(cnt, total)


if __name__ == "__main__":
    # Project-local helper, imported here so the functions above can be used
    # without the `common` package being installed.
    from common.str import percent

    words = read_vocabulary(vocabfile, size)
    word_to_vector = read_vectors(weightsfile, words, dimensions)

    def report_progress(done, total):
        sys.stderr.write("%s done...\n" % percent(done, total))

    for dist, w1, w2, i1, i2 in close_pairs(words, word_to_vector,
                                            progress=report_progress):
        # Same text as the original `print dist, w1, w2, i1, i2`.
        sys.stdout.write("%s %s %s %s %s\n" % (dist, w1, w2, i1, i2))