annotate embeddings/read-original.py @ 672:27b1344a57b1

Added preprocessing back in
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:38:06 -0500
parents a07948f780b9
children
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 #!/usr/bin/python
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4 #import psyco
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 weightsfile = "lm-weights.txt"
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7 vocabfile = "words.asc"
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
8 size = 30000
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
9 dimensions = 50
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
10
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
11 words = [string.strip(w) for w in open(vocabfile).readlines()]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
12 assert len(words) == 30000
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
13
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
14 import numpy, math
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
15 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
16 from common.str import percent
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
17
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
18 word_to_vector = {}
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
19
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
20 f = open(weightsfile)
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
21 f.readline()
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
22 vals = [float(v) for v in string.split(f.readline())]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
23 assert len(vals) == size * dimensions
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
24 vals.reverse()
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
25 for i in range(size):
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
26 l = vals[dimensions*i:dimensions*(i+1)]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
27 w = words[i]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
28 word_to_vector[w] = l
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
29
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
30 # l2 = numpy.asarray(l)
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
31 # print math.fabs(50 - numpy.sum(l2*l2)), w
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
32
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
33 cnt = 0
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
34 for i1 in range(len(words)):
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
35 for i2 in range(len(words)):
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
36 w1 = words[i1]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
37 w2 = words[i2]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
38 cnt += 1
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
39 if i1 <= i2: continue
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
40 l1 = numpy.asarray(word_to_vector[w1])
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
41 l2 = numpy.asarray(word_to_vector[w2])
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
42 d = l2 - l1
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
43 dist = numpy.sum(d * d)
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
44 if dist < 50:
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
45 print numpy.sum(d * d), w1, w2, i1, i2
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
46 if cnt % 1000 == 0:
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
47 sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))