Mercurial > pylearn
comparison embeddings/read-original.py @ 468:a07948f780b9
Moved embeddings out of sandbox
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 21 Oct 2008 16:24:44 -0400 |
parents | sandbox/embeddings/read-original.py@131e19dfe793 |
children |
comparison
equal
deleted
inserted
replaced
467:f3711bcc467e | 468:a07948f780b9 |
---|---|
#!/usr/bin/python
"""Report pairs of vocabulary words whose vectors lie close together.

The vocabulary is read from ``vocabfile`` (one word per line) and a flat list
of weights from the second line of ``weightsfile`` (the first line is
skipped).  The weight list is reversed and cut into ``size`` consecutive
vectors of ``dimensions`` values, one per word.

Every pair of words (i1 > i2) whose squared Euclidean distance is below
``max_sq_dist`` is printed to stdout as ``distance word1 word2 i1 i2``.
Progress messages are written to stderr.
"""

import math
import string
import sys
#import psyco

import numpy

from common.str import percent

weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
size = 30000
dimensions = 50
# Pairs whose squared distance is strictly below this value are reported.
max_sq_dist = 50

# One word per line; surrounding whitespace (including the newline) is dropped.
with open(vocabfile) as vocab:
    words = [w.strip() for w in vocab]
assert len(words) == size

with open(weightsfile) as f:
    f.readline()  # the first line is skipped; the weights are on the second
    vals = [float(v) for v in f.readline().split()]
assert len(vals) == size * dimensions

# The flat list is reversed before slicing, so word i receives the i-th
# chunk of the *reversed* weights.
vals.reverse()
word_to_vector = {}
for i in range(size):
    word_to_vector[words[i]] = vals[dimensions * i:dimensions * (i + 1)]

    # Disabled debugging aid: how far each vector's squared norm is from 50.
    # l2 = numpy.asarray(word_to_vector[words[i]])
    # print math.fabs(50 - numpy.sum(l2*l2)), words[i]

# Convert every vector to an array once instead of once per pair.  The lookup
# goes through word_to_vector so that a word occurring more than once in the
# vocabulary resolves to the last vector stored for it, as the per-pair
# dictionary lookup did.
vectors = [numpy.asarray(word_to_vector[w]) for w in words]

nwords = len(words)
total = len(word_to_vector) * len(word_to_vector)
cnt = 0
for i1 in range(nwords):
    w1 = words[i1]
    l1 = vectors[i1]
    # Only i2 < i1 is visited: each unordered pair is compared once and a
    # word is never compared with itself.
    for i2 in range(i1):
        # 1-based position of (i1, i2) in the full nwords x nwords grid, so
        # the progress counter keeps the values it had when every cell of
        # the grid was enumerated.
        cnt = i1 * nwords + i2 + 1
        w2 = words[i2]
        d = vectors[i2] - l1
        dist = numpy.sum(d * d)
        if dist < max_sq_dist:
            # Single formatted string: same output as the Python 2 print
            # statement with five space-separated values.
            print("%s %s %s %s %s" % (dist, w1, w2, i1, i2))
        if cnt % 1000 == 0:
            sys.stderr.write("%s done...\n" % percent(cnt, total))