Mercurial > pylearn
diff sandbox/embeddings/read.py @ 458:ed6b0b3be8d2
Polished embeddings module
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 19:13:53 -0400 |
parents | sandbox/embeddings/original.py@131e19dfe793 |
children | f400f62e7f9e |
line wrap: on
line diff
"""
Read in the embedding weights file and map words to embedding vectors.

Relies on the sibling ``files`` module to define VOCABFILE, WEIGHTSFILE,
NUMBER_OF_WORDS, and DIMENSIONS.
"""

import re
import sys

from files import *

WORDS = None            # vocabulary words, index-aligned with the weight rows
WORD_TO_VECTOR = None   # word -> list of DIMENSIONS floats
__read = False


def read_embeddings():
    """
    Lazily load the vocabulary and weight vectors into the module globals.

    Idempotent: after the first successful call, subsequent calls return
    immediately. Populates WORDS and WORD_TO_VECTOR.

    Raises AssertionError if the vocabulary or weights file does not match
    the sizes declared in the ``files`` module.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # Use str methods and context managers: the Python-2 string-module
    # functions (string.strip etc.) are deprecated/removed, and the
    # original left both file handles unclosed.
    with open(VOCABFILE) as f:
        WORDS = [line.strip() for line in f]
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as f:
        f.readline()  # skip the header line
        vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # NOTE(review): the weights appear to be stored back-to-front in the
    # file — the reverse makes word i's vector occupy
    # vals[DIMENSIONS*i : DIMENSIONS*(i+1)]. Confirm against the writer.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]
    __read = True


# Matches individual digit characters; each digit is replaced by "NUMBER".
numberre = re.compile("[0-9]")


def convert_string(s):
    """
    Convert a string to a sequence of embeddings, one vector per known word.

    The string is lowercased and whitespace-split; each digit in a token is
    replaced by the literal "NUMBER" before lookup. Tokens not in the
    vocabulary are skipped with a warning on stderr.

    @param s: the input string.
    @return: a list of embedding vectors (lists of floats).
    """
    read_embeddings()
    e = []
    for origw in s.lower().split():
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e