view embeddings/read-original.py @ 468:a07948f780b9

Moved embeddings out of sandbox
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:24:44 -0400
parents sandbox/embeddings/read-original.py@131e19dfe793
children
line wrap: on
line source

#!/usr/bin/python

import string
#import psyco

# Input files: the weights file and the vocabulary file (one word per line).
weightsfile = "lm-weights.txt"
vocabfile = "words.asc"
# Expected number of vocabulary entries, and the number of weights per word.
size = 30000
dimensions = 50

# Read the vocabulary, stripping surrounding whitespace from every line.
# The context manager closes the file as soon as it has been read.
with open(vocabfile) as vocab:
    words = [line.strip() for line in vocab]
# Checked against `size` rather than a second hard-coded literal, so the
# expected vocabulary length is defined in exactly one place.
assert len(words) == size

import numpy, math
import sys
from common.str import percent

# Maps each vocabulary word to its list of `dimensions` float weights.
word_to_vector = {}

# The first line of the weights file is skipped; the second line holds all
# weights as a single whitespace-separated run of numbers.
with open(weightsfile) as f:
    f.readline()
    vals = [float(v) for v in f.readline().split()]
assert len(vals) == size * dimensions
# The flat list is reversed before slicing, so word i receives the i-th
# `dimensions`-sized chunk counted from the end of the line, itself reversed.
vals.reverse()
for i in range(size):
    l = vals[dimensions*i:dimensions*(i+1)]
    w = words[i]
    # A word that appears more than once keeps the vector of its last position.
    word_to_vector[w] = l

#    l2 = numpy.asarray(l)
#    print math.fabs(50 - numpy.sum(l2*l2)), w

# Compare every pair of vocabulary positions (i1, i2) with i2 < i1 and print
# the pairs whose squared Euclidean distance is below 50.
cnt = 0
num_words = len(words)
# Denominator of the progress report; it does not change during the scan.
total_pairs = len(word_to_vector) * len(word_to_vector)
# Convert each word's vector to an array once, instead of once per pair.
vectors = [numpy.asarray(word_to_vector[w]) for w in words]
for i1 in range(num_words):
    w1 = words[i1]
    l1 = vectors[i1]
    # cnt keeps numbering the cells of the full num_words x num_words grid
    # (including the skipped i1 <= i2 cells), so progress lines are emitted at
    # exactly the same counts as a scan over the whole grid would produce.
    row_start = cnt
    for i2 in range(i1):
        cnt = row_start + i2 + 1
        w2 = words[i2]
        l2 = vectors[i2]
        d = l2 - l1
        dist = numpy.sum(d * d)
        if dist < 50:
            # Same space-separated line the print statement produced, written
            # in a form that parses under both Python 2 and Python 3.
            sys.stdout.write("%s %s %s %s %s\n" % (dist, w1, w2, i1, i2))
        if cnt % 1000 == 0:
            sys.stderr.write("%s done...\n" % percent(cnt, total_pairs))
    # Account for the remaining (skipped) cells of this row.
    cnt = row_start + num_words