pylearn: changeset 468:a07948f780b9
Moved embeddings out of sandbox
| author | Joseph Turian <turian@iro.umontreal.ca> |
|---|---|
| date | Tue, 21 Oct 2008 16:24:44 -0400 |
| parents | f3711bcc467e |
| children | 4335309f4924 |
| files | embeddings/README.txt embeddings/__init__.py embeddings/convert.py embeddings/one-per-line.py embeddings/parameters.py embeddings/process.py embeddings/read-original.py sandbox/embeddings/README.txt sandbox/embeddings/__init__.py sandbox/embeddings/convert.py sandbox/embeddings/one-per-line.py sandbox/embeddings/parameters.py sandbox/embeddings/process.py sandbox/embeddings/read-original.py |
| diffstat | 14 files changed, 215 insertions(+), 215 deletions(-) |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/README.txt Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,3 @@
+Messy scripts for working with Jason + Ronan's embeddings.
+
+Parameters are given in parameters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/__init__.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,1 @@
+from process import *
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/convert.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,15 @@
+#!/usr/bin/python
+"""
+Convert stdin sentences to word embeddings, and output YAML.
+"""
+
+import sys, string
+import read
+import yaml
+
+output = []
+for l in sys.stdin:
+    l = string.strip(l)
+    output.append((l, read.convert_string(l)))
+
+print yaml.dump(output)
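For orientation, a minimal stand-in for convert.py's stdin-to-YAML loop. The `read` module it imports is not part of this changeset; its `convert_string` corresponds to the commented-out function in process.py below, so the stub here is hypothetical and only mimics the output shape:

```python
# Sketch of convert.py's pipeline with convert_string stubbed out.
# The real convert_string (commented out in process.py below) maps a
# sentence to one embedding per word; this stub only mimics its shape.
import sys
import yaml

def convert_string(s):
    # Hypothetical stub: one 50-dimensional zero vector per whitespace token.
    return [[0.0] * 50 for _ in s.split()]

output = []
for l in sys.stdin:
    l = l.strip()
    output.append((l, convert_string(l)))

print(yaml.dump(output))
```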
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/one-per-line.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+#for i in range(size):
+r = range(size)
+r.reverse()
+for i in r:
+    l = vals[dimensions*i:dimensions*(i+1)]
+    print string.join([`s` for s in l], "\t")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/parameters.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,8 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000
+DIMENSIONS = 50
+UNKNOWN = "UNKNOWN"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/process.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,114 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+from parameters import *
+
+__words = None
+__word_to_embedding = None
+__read = False
+
+def word_to_embedding(w):
+    read_embeddings()
+    return __word_to_embedding[w]
+
+def read_embeddings():
+    global __words
+    global __word_to_embedding
+    global __read
+    if __read: return
+
+    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
+    assert len(__words) == NUMBER_OF_WORDS
+
+    import numpy, math
+    from common.str import percent
+
+    __word_to_embedding = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = __words[i]
+        __word_to_embedding[w] = l
+    __read = True
+    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
+
+import re
+numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
+
+def preprocess(l):
+    """
+    Convert a sequence so that it can be embedded directly.
+    Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    lnew = []
+    for origw in l:
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
+        else:
+            w = origw
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in __word_to_embedding
+        lnew.append(w)
+    return lnew
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in __word_to_embedding:
+#            e.append(__word_to_embedding[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(__word_to_embedding[UNKNOWN])
+#    return e
+
+#def test():
+#    """
+#    Debugging code.
+#    """
+#    read_embeddings()
+#    for w in __word_to_embedding:
+#        assert len(__word_to_embedding[w]) == 50
+#    import numpy
+#    for w1 in __words:
+#        e1 = numpy.asarray(__word_to_embedding[w1])
+#        lst = []
+#        print w1, numpy.dot(e1, e1)
+#        for w2 in __word_to_embedding:
+#            if w1 >= w2: continue
+#            e2 = numpy.asarray(__word_to_embedding[w2])
+#            d = (e1 - e2)
+#            l2 = numpy.dot(d, d)
+#            lst.append((l2, w1, w2))
+#        lst.sort()
+#        print lst[:10]
+#
+#test()
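A minimal usage sketch for the module above, assuming the data files named in parameters.py are readable (the package re-exports process via `from process import *` in `__init__.py`):

```python
# Usage sketch for embeddings/process.py, assuming the data files listed
# in parameters.py exist. preprocess() and word_to_embedding() are the
# functions defined in the diff above.
from embeddings import preprocess, word_to_embedding

# Penn Treebank bracket tokens are rewritten, digits collapse to "NUMBER",
# and out-of-vocabulary words fall back to the UNKNOWN token.
tokens = preprocess(["The", "-LRB-", "3", "-RRB-", "dogs"])

# Each preprocessed token maps to a DIMENSIONS-long (50) list of floats.
vectors = [word_to_embedding(w) for w in tokens]
assert all(len(v) == 50 for v in vectors)
```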
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/embeddings/read-original.py Tue Oct 21 16:24:44 2008 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import string
+#import psyco
+
+weightsfile = "lm-weights.txt"
+vocabfile = "words.asc"
+size = 30000
+dimensions = 50
+
+words = [string.strip(w) for w in open(vocabfile).readlines()]
+assert len(words) == 30000
+
+import numpy, math
+import sys
+from common.str import percent
+
+word_to_vector = {}
+
+f = open(weightsfile)
+f.readline()
+vals = [float(v) for v in string.split(f.readline())]
+assert len(vals) == size * dimensions
+vals.reverse()
+for i in range(size):
+    l = vals[dimensions*i:dimensions*(i+1)]
+    w = words[i]
+    word_to_vector[w] = l
+
+#    l2 = numpy.asarray(l)
+#    print math.fabs(50 - numpy.sum(l2*l2)), w
+
+cnt = 0
+for i1 in range(len(words)):
+    for i2 in range(len(words)):
+        w1 = words[i1]
+        w2 = words[i2]
+        cnt += 1
+        if i1 <= i2: continue
+        l1 = numpy.asarray(word_to_vector[w1])
+        l2 = numpy.asarray(word_to_vector[w2])
+        d = l2 - l1
+        dist = numpy.sum(d * d)
+        if dist < 50:
+            print numpy.sum(d * d), w1, w2, i1, i2
+        if cnt % 1000 == 0:
+            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))
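The pair scan above is quadratic in the vocabulary and computes every distance in pure Python. A hedged sketch of the same nearest-pair search with the inner loop vectorized in NumPy (`close_pairs` is an illustrative name, not part of the changeset):

```python
import numpy

def close_pairs(word_to_vector, words, threshold=50.0):
    # Stack the vocabulary into one (30000 x 50) embedding matrix.
    E = numpy.asarray([word_to_vector[w] for w in words])
    sq = (E * E).sum(axis=1)  # squared norm of every row
    for i1 in range(len(words)):
        # Squared L2 distance from word i1 to every word, one row at a time;
        # a full 30000 x 30000 distance matrix would not fit in memory.
        d2 = sq + sq[i1] - 2.0 * E.dot(E[i1])
        for i2 in numpy.flatnonzero(d2 < threshold):
            if i2 < i1:  # report each pair once, as the original loop does
                yield d2[i2], words[i1], words[i2]
```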
--- a/sandbox/embeddings/README.txt Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-Messy scripts for working with Jason + Ronan's embeddings.
-
-Parameters are given in parameters.py
--- a/sandbox/embeddings/__init__.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-from process import *
--- a/sandbox/embeddings/convert.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-#!/usr/bin/python
-"""
-Convert stdin sentences to word embeddings, and output YAML.
-"""
-
-import sys, string
-import read
-import yaml
-
-output = []
-for l in sys.stdin:
-    l = string.strip(l)
-    output.append((l, read.convert_string(l)))
-
-print yaml.dump(output)
--- a/sandbox/embeddings/one-per-line.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-
-import string
-#import psyco
-
-weightsfile = "lm-weights.txt"
-vocabfile = "words.asc"
-size = 30000
-dimensions = 50
-
-import numpy, math
-import sys
-from common.str import percent
-
-word_to_vector = {}
-
-f = open(weightsfile)
-f.readline()
-vals = [float(v) for v in string.split(f.readline())]
-assert len(vals) == size * dimensions
-vals.reverse()
-#for i in range(size):
-r = range(size)
-r.reverse()
-for i in r:
-    l = vals[dimensions*i:dimensions*(i+1)]
-    print string.join([`s` for s in l], "\t")
--- a/sandbox/embeddings/parameters.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-"""
-Locations of the embedding data files.
-"""
-WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
-NUMBER_OF_WORDS = 30000
-DIMENSIONS = 50
-UNKNOWN = "UNKNOWN"
--- a/sandbox/embeddings/process.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,114 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-from parameters import *
-
-__words = None
-__word_to_embedding = None
-__read = False
-
-def word_to_embedding(w):
-    read_embeddings()
-    return __word_to_embedding[w]
-
-def read_embeddings():
-    global __words
-    global __word_to_embedding
-    global __read
-    if __read: return
-
-    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
-    assert len(__words) == NUMBER_OF_WORDS
-
-    import numpy, math
-    from common.str import percent
-
-    __word_to_embedding = {}
-
-    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
-    f = open(WEIGHTSFILE)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
-    for i in range(NUMBER_OF_WORDS):
-        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
-        w = __words[i]
-        __word_to_embedding[w] = l
-    __read = True
-    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
-
-import re
-numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
-
-def preprocess(l):
-    """
-    Convert a sequence so that it can be embedded directly.
-    Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
-    """
-    read_embeddings()
-    lnew = []
-    for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
-        if w not in __word_to_embedding:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in __word_to_embedding
-        lnew.append(w)
-    return lnew
-
-#def convert_string(s, strict=False):
-#    """
-#    Convert a string to a sequence of embeddings.
-#    @param strict: If strict, then words *must* be in the vocabulary.
-#    @todo: DEPRECATED Remove this function.
-#    """
-#    read_embeddings()
-#    e = []
-#    for origw in string.split(string.lower(s)):
-#        w = numberre.sub("NUMBER", origw)
-#        if w in __word_to_embedding:
-#            e.append(__word_to_embedding[w])
-#        else:
-#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-#            assert not strict
-#            e.append(__word_to_embedding[UNKNOWN])
-#    return e
-
-#def test():
-#    """
-#    Debugging code.
-#    """
-#    read_embeddings()
-#    for w in __word_to_embedding:
-#        assert len(__word_to_embedding[w]) == 50
-#    import numpy
-#    for w1 in __words:
-#        e1 = numpy.asarray(__word_to_embedding[w1])
-#        lst = []
-#        print w1, numpy.dot(e1, e1)
-#        for w2 in __word_to_embedding:
-#            if w1 >= w2: continue
-#            e2 = numpy.asarray(__word_to_embedding[w2])
-#            d = (e1 - e2)
-#            l2 = numpy.dot(d, d)
-#            lst.append((l2, w1, w2))
-#        lst.sort()
-#        print lst[:10]
-#
-#test()
--- a/sandbox/embeddings/read-original.py Mon Oct 20 19:14:06 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-#!/usr/bin/python
-
-import string
-#import psyco
-
-weightsfile = "lm-weights.txt"
-vocabfile = "words.asc"
-size = 30000
-dimensions = 50
-
-words = [string.strip(w) for w in open(vocabfile).readlines()]
-assert len(words) == 30000
-
-import numpy, math
-import sys
-from common.str import percent
-
-word_to_vector = {}
-
-f = open(weightsfile)
-f.readline()
-vals = [float(v) for v in string.split(f.readline())]
-assert len(vals) == size * dimensions
-vals.reverse()
-for i in range(size):
-    l = vals[dimensions*i:dimensions*(i+1)]
-    w = words[i]
-    word_to_vector[w] = l
-
-#    l2 = numpy.asarray(l)
-#    print math.fabs(50 - numpy.sum(l2*l2)), w
-
-cnt = 0
-for i1 in range(len(words)):
-    for i2 in range(len(words)):
-        w1 = words[i1]
-        w2 = words[i2]
-        cnt += 1
-        if i1 <= i2: continue
-        l1 = numpy.asarray(word_to_vector[w1])
-        l2 = numpy.asarray(word_to_vector[w2])
-        d = l2 - l1
-        dist = numpy.sum(d * d)
-        if dist < 50:
-            print numpy.sum(d * d), w1, w2, i1, i2
-        if cnt % 1000 == 0:
-            sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector)))