# HG changeset patch # User Joseph Turian # Date 1223434810 14400 # Node ID f400f62e7f9e537a95e60704eb27d123cb9795d6 # Parent ed6b0b3be8d294d1e470dc265ceccdfc02b4ce07 Fixed embedding preprocessing diff -r ed6b0b3be8d2 -r f400f62e7f9e sandbox/embeddings/__init__.py diff -r ed6b0b3be8d2 -r f400f62e7f9e sandbox/embeddings/files.py --- a/sandbox/embeddings/files.py Tue Oct 07 19:13:53 2008 -0400 +++ b/sandbox/embeddings/files.py Tue Oct 07 23:00:10 2008 -0400 @@ -5,3 +5,4 @@ VOCABFILE = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc" NUMBER_OF_WORDS = 30000 DIMENSIONS = 50 +UNKNOWN = "UNKNOWN" diff -r ed6b0b3be8d2 -r f400f62e7f9e sandbox/embeddings/read.py --- a/sandbox/embeddings/read.py Tue Oct 07 19:13:53 2008 -0400 +++ b/sandbox/embeddings/read.py Tue Oct 07 23:00:10 2008 -0400 @@ -38,16 +38,46 @@ import re numberre = re.compile("[0-9]") -def convert_string(s): + +def preprocess(l): """ - Convert a string to a sequence of embeddings. + Convert a sequence so that it can be embedded directly. + Returned the preprocessed sequence. """ read_embeddings() - e = [] - for origw in string.split(string.lower(s)): - w = numberre.sub("NUMBER", origw) - if w in WORD_TO_VECTOR: - e.append(WORD_TO_VECTOR[w]) + lnew = [] + for origw in l: + if origw == "-LRB-": w = "(" + elif origw == "-RRB-": w = ")" + elif origw == "-LCB-": w = "{" + elif origw == "-RCB-": w = "}" + elif origw == "-LSB-": w = "[" + elif origw == "-RSB-": w = "]" else: - sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw)) - return e + w = origw + w = string.lower(w) + w = numberre.sub("NUMBER", w) + if w not in WORD_TO_VECTOR: + sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) + w = UNKNOWN + assert w in WORD_TO_VECTOR + lnew.append(w) + return lnew + +#def convert_string(s, strict=False): +# """ +# Convert a string to a sequence of embeddings. +# @param strict: If strict, then words *must* be in the vocabulary. +# @todo: DEPRECATED Remove this function. +# """ +# read_embeddings() +# e = [] +# for origw in string.split(string.lower(s)): +# w = numberre.sub("NUMBER", origw) +# if w in WORD_TO_VECTOR: +# e.append(WORD_TO_VECTOR[w]) +# else: +# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) +# assert not strict +# e.append(WORD_TO_VECTOR[UNKNOWN]) +# return e