Mercurial > pylearn
view sandbox/embeddings/read-original.py @ 458:ed6b0b3be8d2
Polished embeddings module
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 19:13:53 -0400 |
parents | 131e19dfe793 |
children |
line wrap: on
line source
#!/usr/bin/python
"""Print pairs of vocabulary words whose embedding vectors are close.

Input files (paths are the module-level constants below):

* ``vocabfile``: one word per line; exactly ``size`` lines are required.
* ``weightsfile``: the first line is skipped; the second line must hold
  ``size * dimensions`` whitespace-separated floats.

For every pair of word indices ``i1 > i2`` whose vectors have a squared
Euclidean distance below ``threshold``, one line of the form
``<distance> <word1> <word2> <i1> <i2>`` is printed to stdout.  Progress
messages are written to stderr.
"""

import math
import string
import sys

import numpy

weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
size = 30000
dimensions = 50
# A pair is reported when its squared distance is strictly below this value.
# NOTE(review): an earlier, disabled sanity check compared each vector's
# squared norm with 50, so vectors presumably have that squared norm --
# confirm before changing this threshold.
threshold = 50
# A progress message is emitted when the pair counter is a multiple of this.
progress_every = 1000


def load_words(path, expected=None):
    """Return the lines of ``path`` with surrounding whitespace stripped.

    Args:
        path: vocabulary file, one word per line.
        expected: if not None, the exact number of lines required.

    Raises:
        ValueError: if ``expected`` is given and the line count differs.
    """
    with open(path) as handle:
        words = [line.strip() for line in handle]
    if expected is not None and len(words) != expected:
        raise ValueError("%s: expected %d words, found %d"
                         % (path, expected, len(words)))
    return words


def load_vectors(path, words, dimensions):
    """Return a dict mapping each word to its list of ``dimensions`` floats.

    The first line of ``path`` is skipped and the second line is parsed as
    whitespace-separated floats.  The values are reversed before being cut
    into consecutive ``dimensions``-sized chunks, so the word at index 0
    receives the last ``dimensions`` values of the line, in reverse order.
    If a word occurs more than once, its last chunk wins.

    Raises:
        ValueError: if the line does not hold exactly
            ``len(words) * dimensions`` values.
    """
    with open(path) as handle:
        handle.readline()  # first line is skipped, never parsed
        vals = [float(v) for v in handle.readline().split()]
    expected = len(words) * dimensions
    if len(vals) != expected:
        raise ValueError("%s: expected %d weights, found %d"
                         % (path, expected, len(vals)))
    vals.reverse()
    word_to_vector = {}
    for i, word in enumerate(words):
        word_to_vector[word] = vals[dimensions * i:dimensions * (i + 1)]
    return word_to_vector


def close_pairs(words, word_to_vector, threshold, progress=None,
                every=1000):
    """Yield ``(dist, w1, w2, i1, i2)`` for each close pair with ``i1 > i2``.

    ``dist`` is the squared Euclidean distance between the vectors of
    ``w1 = words[i1]`` and ``w2 = words[i2]``; a pair is yielded when
    ``dist < threshold``.  Pairs come out ordered by ``i1``, then ``i2``.

    Args:
        words: sequence of words; indices define the pair ordering.
        word_to_vector: mapping from word to a sequence of floats.
        threshold: exclusive upper bound on the squared distance.
        progress: optional callable ``progress(done, total)``.  ``done``
            counts positions in the full ``len(words)`` x ``len(words)``
            grid in row-major order (1-based); ``total`` is
            ``len(word_to_vector) ** 2``.  It is called after a visited
            pair has been handled whenever ``done`` is a multiple of
            ``every``.
        every: spacing of progress calls, in grid positions.
    """
    n = len(words)
    # Convert each vector once instead of once per pair.
    vectors = [numpy.asarray(word_to_vector[w]) for w in words]
    total = len(word_to_vector) * len(word_to_vector)
    for i1 in range(n):
        v1 = vectors[i1]
        # Only i2 < i1 is visited: the diagonal and the mirrored half of the
        # grid produce no output at all.
        for i2 in range(i1):
            d = vectors[i2] - v1
            dist = numpy.sum(d * d)
            if dist < threshold:
                yield dist, words[i1], words[i2], i1, i2
            if progress is not None:
                # Position of (i1, i2) in the full grid, as if every cell
                # (including the skipped ones) had been counted.
                done = i1 * n + i2 + 1
                if done % every == 0:
                    progress(done, total)


def main():
    """Load the vocabulary and weights, then print all close word pairs."""
    # Imported here so the helpers above stay importable without the
    # project's ``common`` package; only the progress text needs it.
    from common.str import percent

    words = load_words(vocabfile, size)
    word_to_vector = load_vectors(weightsfile, words, dimensions)

    def report(done, total):
        sys.stderr.write("%s done...\n" % percent(done, total))

    for dist, w1, w2, i1, i2 in close_pairs(words, word_to_vector, threshold,
                                            report, progress_every):
        # A single formatted string keeps the output identical under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s %s %s %s" % (dist, w1, w2, i1, i2))


if __name__ == "__main__":
    main()