Mercurial > pylearn
annotate embeddings/one-per-line.py @ 520:82bafb80ba65
merge
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Fri, 14 Nov 2008 02:09:23 -0500 |
parents | a07948f780b9 |
children |
rev | line source |
---|---|
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
1 #!/usr/bin/python |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
2 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
3 import string |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
4 #import psyco |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
5 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
6 weightsfile = "lm-weights.txt" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
7 vocabfile = "words.asc" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
8 size = 30000 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
9 dimensions = 50 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
10 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
11 import numpy, math |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
12 import sys |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
13 from common.str import percent |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
14 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
15 word_to_vector = {} |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
16 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
17 f = open(weightsfile) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
18 f.readline() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
19 vals = [float(v) for v in string.split(f.readline())] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
20 assert len(vals) == size * dimensions |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
21 vals.reverse() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
22 #for i in range(size): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
23 r = range(size) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
24 r.reverse() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
25 for i in r: |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
26 l = vals[dimensions*i:dimensions*(i+1)] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
27 print string.join([`s` for s in l], "\t") |