comparison sandbox/embeddings/process.py @ 467:f3711bcc467e

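Fixed a bug in how embeddings are read: the flat list of weight values is no longer reversed (`vals.reverse()` removed) before it is sliced into per-word embedding vectors.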
author Joseph Turian <turian@iro.umontreal.ca>
date Mon, 20 Oct 2008 19:14:06 -0400
parents 121cc6db4481
children
comparison
equal deleted inserted replaced
466:23221eefb70e 467:f3711bcc467e
32 sys.stderr.write("Reading %s...\n" % WEIGHTSFILE) 32 sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
33 f = open(WEIGHTSFILE) 33 f = open(WEIGHTSFILE)
34 f.readline() 34 f.readline()
35 vals = [float(v) for v in string.split(f.readline())] 35 vals = [float(v) for v in string.split(f.readline())]
36 assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS 36 assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
37 vals.reverse()
38 for i in range(NUMBER_OF_WORDS): 37 for i in range(NUMBER_OF_WORDS):
39 l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)] 38 l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
40 w = __words[i] 39 w = __words[i]
41 __word_to_embedding[w] = l 40 __word_to_embedding[w] = l
42 __read = True 41 __read = True
88 # else: 87 # else:
89 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) 88 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
90 # assert not strict 89 # assert not strict
91 # e.append(__word_to_embedding[UNKNOWN]) 90 # e.append(__word_to_embedding[UNKNOWN])
92 # return e 91 # return e
92
93 #def test():
94 # """
95 # Debugging code.
96 # """
97 # read_embeddings()
98 # for w in __word_to_embedding:
99 # assert len(__word_to_embedding[w]) == 50
100 # import numpy
101 # for w1 in __words:
102 # e1 = numpy.asarray(__word_to_embedding[w1])
103 # lst = []
104 # print w1, numpy.dot(e1, e1)
105 # for w2 in __word_to_embedding:
106 # if w1 >= w2: continue
107 # e2 = numpy.asarray(__word_to_embedding[w2])
108 # d = (e1 - e2)
109 # l2 = numpy.dot(d, d)
110 # lst.append((l2, w1, w2))
111 # lst.sort()
112 # print lst[:10]
113 #
114 #test()