# HG changeset patch
# User Joseph Turian
# Date 1223442563 14400
# Node ID 1243716ade6a6478db9dfdc579e27bd6cc75d22d
# Parent fda72e944104ab6debb73acaa17902b9d47581ab
Rearranged

diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/README.txt
--- a/sandbox/embeddings/README.txt Tue Oct 07 23:07:23 2008 -0400
+++ b/sandbox/embeddings/README.txt Wed Oct 08 01:09:23 2008 -0400
@@ -1,3 +1,3 @@
 Messy scripts for working with Jason + Ronan's embeddings.
-File locations are given in files.py
+Parameters are given in parameters.py
 
diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/__init__.py
--- a/sandbox/embeddings/__init__.py Tue Oct 07 23:07:23 2008 -0400
+++ b/sandbox/embeddings/__init__.py Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,1 @@
+from process import *
diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/files.py
--- a/sandbox/embeddings/files.py Tue Oct 07 23:07:23 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-"""
-Locations of the embedding data files.
-"""
-WEIGHTSFILE = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
-NUMBER_OF_WORDS = 30000
-DIMENSIONS = 50
-UNKNOWN = "UNKNOWN"
diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/parameters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/parameters.py Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,8 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000
+DIMENSIONS = 50
+UNKNOWN = "UNKNOWN"
diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/process.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/process.py Wed Oct 08 01:09:23 2008 -0400
@@ -0,0 +1,91 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+from parameters import *
+
+__words = None
+__word_to_embedding = None
+__read = False
+
+def word_to_embedding(w):
+    read_embeddings()
+    return __word_to_embedding[w]
+
+def read_embeddings():
+    global __words
+    global __word_to_embedding
+    global __read
+    if __read: return
+
+    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
+    assert len(__words) == NUMBER_OF_WORDS
+
+    import numpy, math
+    from common.str import percent
+
+    __word_to_embedding = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    vals.reverse()
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = __words[i]
+        __word_to_embedding[w] = l
+    __read = True
+
+import re
+numberre = re.compile("[0-9]")
+slashre = re.compile("\\\/")
+
+def preprocess(l):
+    """
+    Convert a sequence so that it can be embedded directly.
+    Returned the preprocessed sequence.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    """
+    read_embeddings()
+    lnew = []
+    for origw in l:
+        if origw == "-LRB-": w = "("
+        elif origw == "-RRB-": w = ")"
+        elif origw == "-LCB-": w = "{"
+        elif origw == "-RCB-": w = "}"
+        elif origw == "-LSB-": w = "["
+        elif origw == "-RSB-": w = "]"
+        else:
+            w = origw
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+        if w not in __word_to_embedding:
+            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+            w = UNKNOWN
+        assert w in __word_to_embedding
+        lnew.append(w)
+    return lnew
+
+#def convert_string(s, strict=False):
+#    """
+#    Convert a string to a sequence of embeddings.
+#    @param strict: If strict, then words *must* be in the vocabulary.
+#    @todo: DEPRECATED Remove this function.
+#    """
+#    read_embeddings()
+#    e = []
+#    for origw in string.split(string.lower(s)):
+#        w = numberre.sub("NUMBER", origw)
+#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
+#        else:
+#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
+#            assert not strict
+#            e.append(__word_to_embedding[UNKNOWN])
+#    return e
diff -r fda72e944104 -r 1243716ade6a sandbox/embeddings/read.py
--- a/sandbox/embeddings/read.py Tue Oct 07 23:07:23 2008 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-from files import *
-
-WORDS = None
-WORD_TO_VECTOR = None
-__read = False
-def read_embeddings():
-    global WORDS
-    global WORD_TO_VECTOR
-    global __read
-    if __read: return
-
-    WORDS = [string.strip(w) for w in open(VOCABFILE).readlines()]
-    assert len(WORDS) == NUMBER_OF_WORDS
-
-    import numpy, math
-    from common.str import percent
-
-    WORD_TO_VECTOR = {}
-
-    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
-    f = open(WEIGHTSFILE)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
-    vals.reverse()
-    for i in range(NUMBER_OF_WORDS):
-        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
-        w = WORDS[i]
-        WORD_TO_VECTOR[w] = l
-    __read = True
-
-import re
-numberre = re.compile("[0-9]")
-slashre = re.compile("\\\/")
-
-def preprocess(l):
-    """
-    Convert a sequence so that it can be embedded directly.
-    Returned the preprocessed sequence.
-    @note: Preprocessing is appropriate for Penn Treebank style documents.
-    """
-    read_embeddings()
-    lnew = []
-    for origw in l:
-        if origw == "-LRB-": w = "("
-        elif origw == "-RRB-": w = ")"
-        elif origw == "-LCB-": w = "{"
-        elif origw == "-RCB-": w = "}"
-        elif origw == "-LSB-": w = "["
-        elif origw == "-RSB-": w = "]"
-        else:
-            w = origw
-            w = string.lower(w)
-            w = slashre.sub("/", w)
-            w = numberre.sub("NUMBER", w)
-        if w not in WORD_TO_VECTOR:
-            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-            w = UNKNOWN
-        assert w in WORD_TO_VECTOR
-        lnew.append(w)
-    return lnew
-
-#def convert_string(s, strict=False):
-#    """
-#    Convert a string to a sequence of embeddings.
-#    @param strict: If strict, then words *must* be in the vocabulary.
-#    @todo: DEPRECATED Remove this function.
-#    """
-#    read_embeddings()
-#    e = []
-#    for origw in string.split(string.lower(s)):
-#        w = numberre.sub("NUMBER", origw)
-#        if w in WORD_TO_VECTOR:
-#            e.append(WORD_TO_VECTOR[w])
-#        else:
-#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
-#            assert not strict
-#            e.append(WORD_TO_VECTOR[UNKNOWN])
-#    return e