annotate sandbox/embeddings/original.py @ 456:131e19dfe793

Added sandbox.embeddings
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 17:56:52 -0400
parents
children
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Module-level caches, populated by read().
WORDS = None           # vocabulary words, in the order they appear in the file
WORD_TO_VECTOR = None  # maps each vocabulary word to its list of floats


def read(weightsfile="lm-weights.txt", vocabfile="words.asc",
         size=30000, dimensions=50):
    """
    Read in the vocabulary and weights files and fill the module caches.

    Sets the globals WORDS (the stripped lines of `vocabfile`) and
    WORD_TO_VECTOR (word -> list of `dimensions` floats).

    weightsfile -- path of the weights file. Its first line is skipped and
                   its second line must hold size * dimensions
                   whitespace-separated floats.
    vocabfile   -- path of the vocabulary file, one word per line.
    size        -- expected number of vocabulary words.
    dimensions  -- number of floats per word.

    Raises AssertionError if either file does not have the expected number
    of entries, and whatever open()/float() raise on unreadable input.
    """
    global WORDS
    global WORD_TO_VECTOR

    # Context managers make sure both files are closed even on error.
    with open(vocabfile) as vocab:
        WORDS = [line.strip() for line in vocab]
    assert len(WORDS) == size, \
        "expected %d words in %s, found %d" % (size, vocabfile, len(WORDS))

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % weightsfile)
    with open(weightsfile) as weights:
        weights.readline()  # the first line is not used
        vals = [float(v) for v in weights.readline().split()]
    assert len(vals) == size * dimensions, \
        "expected %d values in %s, found %d" % (
            size * dimensions, weightsfile, len(vals))

    # The flat value list is reversed before it is cut into per-word slices,
    # so word 0 receives the LAST `dimensions` values of the line, in
    # reverse order.
    # NOTE(review): presumably this matches how the weights file was
    # written -- confirm against the code that produced it.
    vals.reverse()
    for i in range(size):
        WORD_TO_VECTOR[WORDS[i]] = vals[dimensions * i:dimensions * (i + 1)]
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
37
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
import re

# Matches one ASCII digit. Each digit is substituted on its own, so a
# multi-digit token such as "42" becomes "NUMBERNUMBER".
numberre = re.compile("[0-9]")


def convert_string(s):
    """
    Convert a string to a sequence of embeddings.

    The string is lowercased and split on whitespace. In every token each
    digit is replaced by the literal "NUMBER", and the result is looked up
    in WORD_TO_VECTOR.

    Returns the list of vectors for the tokens that were found, in order.
    Tokens missing from the vocabulary are reported on stderr and skipped,
    so the result may be shorter than the number of tokens in `s`.
    """
    e = []
    # str methods replace string.split/string.lower, which exist only in
    # Python 2's string module.
    for origw in s.lower().split():
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write(
                "Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
52
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Load the vocabulary and weights as soon as the module is imported, so that
# WORDS and WORD_TO_VECTOR are filled before convert_string() is called.
read()