Mercurial > pylearn
diff sandbox/embeddings/read-original.py @ 456:131e19dfe793
Added sandbox.embeddings
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 17:56:52 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/python
"""Load word embeddings and report pairs of words that lie close together.

Originally sandbox/embeddings/read-original.py.

Steps performed when run:

1. Read one word per line from ``vocabfile``; exactly ``size`` words are
   expected.
2. Read ``weightsfile``: the first line is skipped and the second line is
   parsed as ``size * dimensions`` whitespace-separated floats.
3. Reverse the list of floats and give each word, in vocabulary order, the
   next run of ``dimensions`` values as its vector.
4. For every pair of word indices ``i1 > i2``, print the squared Euclidean
   distance, both words and both indices to stdout when that distance is
   below ``max_sq_dist``.  Progress is written to stderr.

The code runs unchanged on Python 2.6+ and Python 3.
"""

import math
import string
import sys

import numpy

from common.str import percent
# import psyco

weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
size = 30000
dimensions = 50
# Pairs whose squared distance is below this value are reported.
max_sq_dist = 50

# Context managers make sure both input files are closed once read.
with open(vocabfile) as f:
    words = [w.strip() for w in f]
if len(words) != size:
    raise ValueError(
        "expected %d words in %s, found %d" % (size, vocabfile, len(words)))

with open(weightsfile) as f:
    f.readline()  # The first line is not part of the weights; skip it.
    vals = [float(v) for v in f.readline().split()]
if len(vals) != size * dimensions:
    raise ValueError(
        "expected %d weights in %s, found %d"
        % (size * dimensions, weightsfile, len(vals)))

# NOTE(review): the values are consumed in reverse file order; presumably the
# weights file stores them back to front -- confirm against the file format.
vals.reverse()

# Convert each vector to an array once here, rather than once per compared
# pair inside the quadratic loop below.
word_to_vector = {}
for i, w in enumerate(words):
    word_to_vector[w] = numpy.asarray(vals[dimensions * i:dimensions * (i + 1)])

#    l2 = numpy.asarray(l)
#    print math.fabs(50 - numpy.sum(l2*l2)), w

# Denominator of the progress report; invariant across the loop.
total = len(word_to_vector) * len(word_to_vector)

cnt = 0
for i1, w1 in enumerate(words):
    v1 = word_to_vector[w1]
    for i2, w2 in enumerate(words):
        # cnt counts every (i1, i2) combination, including skipped ones, so
        # that it is comparable with ``total``.
        cnt += 1
        if i1 <= i2:
            # Each unordered pair is examined once, and never a word with
            # itself.
            continue
        d = word_to_vector[w2] - v1
        dist = numpy.sum(d * d)
        if dist < max_sq_dist:
            # Same space-separated output as the Python 2 print statement.
            print("%s %s %s %s %s" % (dist, w1, w2, i1, i2))
        # NOTE(review): the nesting level of this progress check was
        # reconstructed from a whitespace-collapsed listing -- confirm.
        if cnt % 1000 == 0:
            sys.stderr.write("%s done...\n" % percent(cnt, total))