# HG changeset patch
# User Joseph Turian
# Date 1223416612 14400
# Node ID 131e19dfe793dbe92482412926a748ab3db8f1ed
# Parent fb62f0e4bcfea3d3e0c3d5cb16e9de50402e743d
Added sandbox.embeddings

diff -r fb62f0e4bcfe -r 131e19dfe793 sandbox/embeddings/README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/README.txt	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,1 @@
+Messy scripts for working with Jason + Ronan's embeddings.
diff -r fb62f0e4bcfe -r 131e19dfe793 sandbox/embeddings/convert.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/convert.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+import sys, string
+import original
+import yaml
+
+output = []
+for l in sys.stdin:
+    l = string.strip(l)
+    output.append((l, original.convert_string(l)))
+
+print yaml.dump(output)
diff -r fb62f0e4bcfe -r 131e19dfe793 sandbox/embeddings/one-per-line.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/one-per-line.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+#for i in range(size):
+r = range(size)
+r.reverse()
+for i in r:
+    l = vals[dimensions*i:dimensions*(i+1)]
+    print string.join([`s` for s in l], "\t")
diff -r fb62f0e4bcfe -r 131e19dfe793 sandbox/embeddings/original.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/original.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,53 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+WORDS = None
+WORD_TO_VECTOR = None
+def read():
+    global WORDS
+    global WORD_TO_VECTOR
+
+    weightsfile = "lm-weights.txt"
+    vocabfile = "words.asc"
+    size = 30000
+    dimensions = 50
+
+    WORDS = [string.strip(w) for w in open(vocabfile).readlines()]
+    assert len(WORDS) == 30000
+
+    import numpy, math
+    from common.str import percent
+
+    WORD_TO_VECTOR = {}
+
+    sys.stderr.write("Reading %s...\n" % weightsfile)
+    f = open(weightsfile)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == size * dimensions
+    vals.reverse()
+    for i in range(size):
+        l = vals[dimensions*i:dimensions*(i+1)]
+        w = WORDS[i]
+        WORD_TO_VECTOR[w] = l
+
+import re
+numberre = re.compile("[0-9]")
+def convert_string(s):
+    """
+    Convert a string to a sequence of embeddings.
+    """
+    e = []
+    for origw in string.split(string.lower(s)):
+        w = numberre.sub("NUMBER", origw)
+        if w in WORD_TO_VECTOR:
+            e.append(WORD_TO_VECTOR[w])
+        else:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
+    return e
+
+read()
diff -r fb62f0e4bcfe -r 131e19dfe793 sandbox/embeddings/read-original.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/read-original.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+words = [string.strip(w) for w in open(vocabfile).readlines()]
+assert len(words) == 30000
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+for i in range(size):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    w = words[i]
+    word_to_vector[w] = l
+
+#    l2 = numpy.asarray(l)
+#    print math.fabs(50 - numpy.sum(l2*l2)), w
+
+cnt = 0
+for i1 in range(len(words)):
+    for i2 in range(len(words)):
+        w1 = words[i1]
+        w2 = words[i2]
+        cnt += 1
+        if i1 <= i2: continue
+        l1 = numpy.asarray(word_to_vector[w1])
+        l2 = numpy.asarray(word_to_vector[w2])
+        d = l2 - l1
+        dist = numpy.sum(d * d)
+        if dist < 50:
+            print numpy.sum(d * d), w1, w2, i1, i2
+        if cnt % 1000 == 0:
+            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))