pylearn changeset 456:131e19dfe793
Added sandbox.embeddings
author    Joseph Turian <turian@iro.umontreal.ca>
date      Tue, 07 Oct 2008 17:56:52 -0400
parents   fb62f0e4bcfe
children  34acf8db186d
files     sandbox/embeddings/README.txt sandbox/embeddings/convert.py sandbox/embeddings/one-per-line.py sandbox/embeddings/original.py sandbox/embeddings/read-original.py
diffstat  5 files changed, 140 insertions(+), 0 deletions(-)
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/README.txt  Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,1 @@
+Messy scripts for working with Jason + Ronan's embeddings.
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/convert.py  Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+import sys, string
+import original
+import yaml
+
+output = []
+for l in sys.stdin:
+    l = string.strip(l)
+    output.append((l, original.convert_string(l)))
+
+print yaml.dump(output)
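convert.py is a stdin-to-stdout filter: it pairs each input line with the embedding sequence returned by original.convert_string() and dumps the pairs as YAML. A minimal Python 3 sketch of the same pipeline, assuming an original module that exposes convert_string() as in the diff above:

    import sys
    import yaml      # PyYAML
    import original  # the module added later in this changeset

    output = []
    for line in sys.stdin:
        line = line.strip()
        # Pair the raw line with its list of embedding vectors.
        output.append((line, original.convert_string(line)))

    print(yaml.dump(output))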
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/one-per-line.py  Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+#for i in range(size):
+r = range(size)
+r.reverse()
+for i in r:
+    l = vals[dimensions*i:dimensions*(i+1)]
+    print string.join([`s` for s in l], "\t")
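one-per-line.py prints the 30000x50 weight matrix one vector per line, tab-separated. Note the double reversal: the flat value array is reversed and the chunk index also runs backwards, so vectors come out in file order but with each vector's components reversed. A hypothetical numpy sketch of the same reshaping, assuming the same lm-weights.txt layout (one header line, then a single line of size*dimensions floats):

    import numpy as np

    size, dimensions = 30000, 50
    with open("lm-weights.txt") as f:
        f.readline()  # skip the header line
        vals = np.array(f.readline().split(), dtype=float)
    assert vals.size == size * dimensions

    # Mirror the script's double reversal: reverse the flat array,
    # then emit the resulting chunks from last to first.
    vecs = vals[::-1].reshape(size, dimensions)[::-1]
    for v in vecs:
        print("\t".join(str(x) for x in v))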
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/original.py  Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,53 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+WORDS = None
+WORD_TO_VECTOR = None
+def read():
+    global WORDS
+    global WORD_TO_VECTOR
+
+    weightsfile = "lm-weights.txt"
+    vocabfile = "words.asc"
+    size = 30000
+    dimensions = 50
+
+    WORDS = [string.strip(w) for w in open(vocabfile).readlines()]
+    assert len(WORDS) == 30000
+
+    import numpy, math
+    from common.str import percent
+
+    WORD_TO_VECTOR = {}
+
+    sys.stderr.write("Reading %s...\n" % weightsfile)
+    f = open(weightsfile)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == size * dimensions
+    vals.reverse()
+    for i in range(size):
+        l = vals[dimensions*i:dimensions*(i+1)]
+        w = WORDS[i]
+        WORD_TO_VECTOR[w] = l
+
+import re
+numberre = re.compile("[0-9]")
+def convert_string(s):
+    """
+    Convert a string to a sequence of embeddings.
+    """
+    e = []
+    for origw in string.split(string.lower(s)):
+        w = numberre.sub("NUMBER", origw)
+        if w in WORD_TO_VECTOR:
+            e.append(WORD_TO_VECTOR[w])
+        else:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
+    return e
+
+read()
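original.py is the reusable piece: importing it runs read(), which loads words.asc and lm-weights.txt into the module-level WORDS and WORD_TO_VECTOR, and convert_string() then lowercases the input, normalizes digits, and maps each in-vocabulary token to its vector, warning on stderr otherwise. (Since the regex [0-9] substitutes per digit, "2008" becomes "NUMBERNUMBERNUMBERNUMBER"; presumably that matches the vocabulary's convention.) A hypothetical usage sketch in Python 3 syntax, assuming both data files sit in the working directory:

    import original  # triggers read() at import time

    vectors = original.convert_string("The year 2008")
    # Each entry is a 50-dimensional list; out-of-vocabulary tokens are
    # dropped, so there may be fewer vectors than input tokens.
    for v in vectors:
        assert len(v) == 50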
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/read-original.py  Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+words = [string.strip(w) for w in open(vocabfile).readlines()]
+assert len(words) == 30000
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+for i in range(size):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    w = words[i]
+    word_to_vector[w] = l
+
+#    l2 = numpy.asarray(l)
+#    print math.fabs(50 - numpy.sum(l2*l2)), w
+
+cnt = 0
+for i1 in range(len(words)):
+    for i2 in range(len(words)):
+        w1 = words[i1]
+        w2 = words[i2]
+        cnt += 1
+        if i1 <= i2: continue
+        l1 = numpy.asarray(word_to_vector[w1])
+        l2 = numpy.asarray(word_to_vector[w2])
+        d = l2 - l1
+        dist = numpy.sum(d * d)
+        if dist < 50:
+            print numpy.sum(d * d), w1, w2, i1, i2
+        if cnt % 1000 == 0:
+            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))
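read-original.py loops over all 30000 x 30000 = 9e8 ordered word pairs in pure Python, computing squared Euclidean distances for the ~4.5e8 pairs with i1 > i2 and printing those below 50. A vectorized numpy sketch of the same neighbor search (hypothetical, not from this changeset), assuming an embeddings matrix E of shape (size, dimensions) and the words list built as above:

    import numpy as np

    def close_pairs(E, words, threshold=50.0, chunk=512):
        """Yield (squared distance, w1, w2) for every pair closer than threshold."""
        sq = (E * E).sum(axis=1)
        for start in range(0, len(words), chunk):
            block = E[start:start + chunk]
            # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b, for a whole block at once.
            d2 = sq[start:start + chunk, None] + sq[None, :] - 2.0 * (block @ E.T)
            for r, c in zip(*np.nonzero(d2 < threshold)):
                if c < start + r:  # keep i1 > i2, as the script's continue does
                    yield d2[r, c], words[start + r], words[c]

Chunking keeps memory at chunk x size floats rather than materializing the full 30000 x 30000 distance matrix.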