Mercurial > pylearn
annotate embeddings/read-original.py @ 672:27b1344a57b1
Added preprocessing back in
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Thu, 20 Nov 2008 06:38:06 -0500 |
parents | a07948f780b9 |
children |
rev | line source |
---|---|
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
1 #!/usr/bin/python |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
2 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
3 import string |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
4 #import psyco |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
5 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
6 weightsfile = "lm-weights.txt" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
7 vocabfile = "words.asc" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
8 size = 30000 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
9 dimensions = 50 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
10 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
11 words = [string.strip(w) for w in open(vocabfile).readlines()] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
12 assert len(words) == 30000 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
13 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
14 import numpy, math |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
15 import sys |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
16 from common.str import percent |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
17 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
18 word_to_vector = {} |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
19 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
20 f = open(weightsfile) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
21 f.readline() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
22 vals = [float(v) for v in string.split(f.readline())] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
23 assert len(vals) == size * dimensions |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
24 vals.reverse() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
25 for i in range(size): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
26 l = vals[dimensions*i:dimensions*(i+1)] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
27 w = words[i] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
28 word_to_vector[w] = l |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
29 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
30 # l2 = numpy.asarray(l) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
31 # print math.fabs(50 - numpy.sum(l2*l2)), w |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
32 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
33 cnt = 0 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
34 for i1 in range(len(words)): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
35 for i2 in range(len(words)): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
36 w1 = words[i1] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
37 w2 = words[i2] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
38 cnt += 1 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
39 if i1 <= i2: continue |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
40 l1 = numpy.asarray(word_to_vector[w1]) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
41 l2 = numpy.asarray(word_to_vector[w2]) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
42 d = l2 - l1 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
43 dist = numpy.sum(d * d) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
44 if dist < 50: |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
45 print numpy.sum(d * d), w1, w2, i1, i2 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
46 if cnt % 1000 == 0: |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
47 sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector))) |