diff sandbox/embeddings/original.py @ 456:131e19dfe793
Added sandbox.embeddings
author:    Joseph Turian <turian@iro.umontreal.ca>
date:      Tue, 07 Oct 2008 17:56:52 -0400
parents:
children:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/original.py	Tue Oct 07 17:56:52 2008 -0400
@@ -0,0 +1,53 @@
+"""
+Read in the weights file
+"""
+
+import string
+import sys
+
+WORDS = None
+WORD_TO_VECTOR = None
+def read():
+    global WORDS
+    global WORD_TO_VECTOR
+
+    weightsfile = "lm-weights.txt"
+    vocabfile = "words.asc"
+    size = 30000
+    dimensions = 50
+
+    WORDS = [string.strip(w) for w in open(vocabfile).readlines()]
+    assert len(WORDS) == 30000
+
+    import numpy, math
+    from common.str import percent
+
+    WORD_TO_VECTOR = {}
+
+    sys.stderr.write("Reading %s...\n" % weightsfile)
+    f = open(weightsfile)
+    f.readline()
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == size * dimensions
+    vals.reverse()
+    for i in range(size):
+        l = vals[dimensions*i:dimensions*(i+1)]
+        w = WORDS[i]
+        WORD_TO_VECTOR[w] = l
+
+import re
+numberre = re.compile("[0-9]")
+def convert_string(s):
+    """
+    Convert a string to a sequence of embeddings.
+    """
+    e = []
+    for origw in string.split(string.lower(s)):
+        w = numberre.sub("NUMBER", origw)
+        if w in WORD_TO_VECTOR:
+            e.append(WORD_TO_VECTOR[w])
+        else:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
+    return e
+
+read()
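
The two side files that read() opens are not part of this changeset, so the following is a hedged sketch of the layout the code above implies: words.asc holds one vocabulary word per line, and lm-weights.txt holds a header line (which read() skips) followed by a single line of size * dimensions floats that read() reverses before slicing into per-word vectors. The toy sizes and file contents below are illustrative assumptions; read() itself hard-codes size = 30000 and dimensions = 50 and asserts on them, so this only shows the format, not a drop-in test fixture.

    # Sketch of the file layout read() expects, using toy sizes.
    import random

    size, dimensions = 4, 3   # original.py uses 30000 and 50

    # words.asc: one vocabulary word per line (word%d names are made up).
    with open("words.asc", "w") as f:
        for i in range(size):
            f.write("word%d\n" % i)

    # lm-weights.txt: a header line that read() skips, then one line of
    # size*dimensions floats. Because read() reverses the value list
    # before slicing, the flat weight matrix is written back-to-front.
    vectors = [[random.random() for _ in range(dimensions)]
               for _ in range(size)]
    flat = [x for vec in vectors for x in vec]
    flat.reverse()
    with open("lm-weights.txt", "w") as f:
        f.write("header line (ignored by read())\n")
        f.write(" ".join("%f" % x for x in flat) + "\n")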
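Since read() runs at module import time, importing the module is enough to populate WORD_TO_VECTOR, provided lm-weights.txt and words.asc exist in the working directory. A minimal usage sketch (the sandbox.embeddings import path comes from this changeset; the sample sentence is made up):

    # Importing the module triggers read().
    from sandbox.embeddings import original

    # convert_string() lowercases, whitespace-tokenizes, substitutes each
    # digit with "NUMBER", and returns one 50-dimensional vector per
    # in-vocabulary token; out-of-vocabulary tokens are skipped with a
    # warning on stderr.
    vecs = original.convert_string("The cat sat on the mat in 1998")
    print(len(vecs))      # number of tokens found in the vocabulary
    print(len(vecs[0]))   # 50, the embedding dimensionality

Note that the digit substitution is per character, so "1998" is looked up as "NUMBERNUMBERNUMBERNUMBER"; presumably that matches the convention used when the vocabulary was built.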