annotate embeddings/one-per-line.py @ 672:27b1344a57b1

Added preprocessing back in
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:38:06 -0500
parents a07948f780b9
children
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 #!/usr/bin/python
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4 #import psyco
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 weightsfile = "lm-weights.txt"
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7 vocabfile = "words.asc"
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
8 size = 30000
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
9 dimensions = 50
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
10
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
11 import numpy, math
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
12 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
13 from common.str import percent
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
14
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
15 word_to_vector = {}
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
16
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
17 f = open(weightsfile)
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
18 f.readline()
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
19 vals = [float(v) for v in string.split(f.readline())]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
20 assert len(vals) == size * dimensions
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
21 vals.reverse()
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
22 #for i in range(size):
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
23 r = range(size)
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
24 r.reverse()
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
25 for i in r:
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
26 l = vals[dimensions*i:dimensions*(i+1)]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
27 print string.join([`s` for s in l], "\t")