# HG changeset patch
# User Joseph Turian
# Date 1223421233 14400
# Node ID ed6b0b3be8d294d1e470dc265ceccdfc02b4ce07
# Parent  34acf8db186dcbe05115abd0f42f8153619553a7
Polished embeddings module

diff -r 34acf8db186d -r ed6b0b3be8d2 sandbox/embeddings/README.txt
--- a/sandbox/embeddings/README.txt	Tue Oct 07 18:21:32 2008 -0400
+++ b/sandbox/embeddings/README.txt	Tue Oct 07 19:13:53 2008 -0400
@@ -1,1 +1,3 @@
 Messy scripts for working with Jason + Ronan's embeddings.
+
+File locations are given in files.py
diff -r 34acf8db186d -r ed6b0b3be8d2 sandbox/embeddings/convert.py
--- a/sandbox/embeddings/convert.py	Tue Oct 07 18:21:32 2008 -0400
+++ b/sandbox/embeddings/convert.py	Tue Oct 07 19:13:53 2008 -0400
@@ -1,12 +1,15 @@
 #!/usr/bin/python
+"""
+Convert stdin sentences to word embeddings, and output YAML.
+"""
 
 import sys, string
-import original
+import read
 import yaml
 
 output = []
 for l in sys.stdin:
     l = string.strip(l)
-    output.append((l, original.convert_string(l)))
+    output.append((l, read.convert_string(l)))
 
 print yaml.dump(output)
diff -r 34acf8db186d -r ed6b0b3be8d2 sandbox/embeddings/files.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/files.py	Tue Oct 07 19:13:53 2008 -0400
@@ -0,0 +1,7 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000  # expected number of entries in VOCABFILE
+DIMENSIONS = 50  # expected number of weights per word in WEIGHTSFILE
diff -r 34acf8db186d -r ed6b0b3be8d2 sandbox/embeddings/original.py
--- a/sandbox/embeddings/original.py	Tue Oct 07 18:21:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-WORDS = None
-WORD_TO_VECTOR = None
-def read():
-    global WORDS
-    global WORD_TO_VECTOR
-
-    weightsfile = "lm-weights.txt"
-    vocabfile = "words.asc"
-    size = 30000
-    dimensions = 50
-
-    WORDS = [string.strip(w) for w in open(vocabfile).readlines()]
-    assert len(WORDS) == 30000
-
-    import numpy, math
-    from common.str import percent
-
-    WORD_TO_VECTOR = {}
-
-    sys.stderr.write("Reading %s...\n" % weightsfile)
-    f = open(weightsfile)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == size * dimensions
-    vals.reverse()
-    for i in range(size):
-        l = vals[dimensions*i:dimensions*(i+1)]
-        w = WORDS[i]
-        WORD_TO_VECTOR[w] = l
-
-import re
-numberre = re.compile("[0-9]")
-def convert_string(s):
-    """
-    Convert a string to a sequence of embeddings.
-    """
-    e = []
-    for origw in string.split(string.lower(s)):
-        w = numberre.sub("NUMBER", origw)
-        if w in WORD_TO_VECTOR:
-            e.append(WORD_TO_VECTOR[w])
-        else:
-            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
-    return e
-
-read()
diff -r 34acf8db186d -r ed6b0b3be8d2 sandbox/embeddings/read.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/read.py	Tue Oct 07 19:13:53 2008 -0400
@@ -0,0 +1,76 @@
+"""
+Read in the word-embedding weights file.
+
+The data files named in files.py are loaded lazily, on the first call to
+read_embeddings() or convert_string().
+"""
+
+import string
+import sys
+
+from files import *
+
+# Vocabulary, one entry per line of VOCABFILE (None until loaded).
+WORDS = None
+# Maps each vocabulary word to its list of DIMENSIONS floats (None until loaded).
+WORD_TO_VECTOR = None
+# True once both data files have been read successfully.
+__read = False
+def read_embeddings():
+    """
+    Load VOCABFILE and WEIGHTSFILE into WORDS and WORD_TO_VECTOR.
+
+    Only the first successful call reads the files; later calls return
+    immediately.  The asserts check that both files have the sizes given
+    by NUMBER_OF_WORDS and DIMENSIONS.
+    """
+    global WORDS
+    global WORD_TO_VECTOR
+    global __read
+    if __read: return
+
+    # try/finally closes the handles even when a read fails.
+    f = open(VOCABFILE)
+    try:
+        WORDS = [string.strip(w) for w in f.readlines()]
+    finally:
+        f.close()
+    assert len(WORDS) == NUMBER_OF_WORDS
+
+    WORD_TO_VECTOR = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    try:
+        # The first line is skipped; every weight is on the second line.
+        f.readline()
+        vals = [float(v) for v in string.split(f.readline())]
+    finally:
+        f.close()
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    # NOTE(review): the values are reversed before slicing, presumably
+    # because the file stores them back-to-front -- confirm.
+    vals.reverse()
+    for i in range(NUMBER_OF_WORDS):
+        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+    __read = True
+
+import re
+numberre = re.compile("[0-9]")
+def convert_string(s):
+    """
+    Convert a string to a sequence of embeddings.
+
+    The string is lowercased and split on whitespace, and each digit is
+    replaced by "NUMBER" before the lookup.  Words that are not in the
+    vocabulary are reported on stderr and skipped.
+    """
+    read_embeddings()
+    e = []
+    for origw in string.split(string.lower(s)):
+        w = numberre.sub("NUMBER", origw)
+        if w in WORD_TO_VECTOR:
+            e.append(WORD_TO_VECTOR[w])
+        else:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
+    return e