Mercurial > pylearn
view sandbox/embeddings/original.py @ 456:131e19dfe793
Added sandbox.embeddings
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 17:56:52 -0400 |
parents | |
children |
line wrap: on
line source
""" Read in the weights file """ import string import sys WORDS = None WORD_TO_VECTOR = None def read(): global WORDS global WORD_TO_VECTOR weightsfile = "lm-weights.txt" vocabfile = "words.asc" size = 30000 dimensions = 50 WORDS = [string.strip(w) for w in open(vocabfile).readlines()] assert len(WORDS) == 30000 import numpy, math from common.str import percent WORD_TO_VECTOR = {} sys.stderr.write("Reading %s...\n" % weightsfile) f = open(weightsfile) f.readline() vals = [float(v) for v in string.split(f.readline())] assert len(vals) == size * dimensions vals.reverse() for i in range(size): l = vals[dimensions*i:dimensions*(i+1)] w = WORDS[i] WORD_TO_VECTOR[w] = l import re numberre = re.compile("[0-9]") def convert_string(s): """ Convert a string to a sequence of embeddings. """ e = [] for origw in string.split(string.lower(s)): w = numberre.sub("NUMBER", origw) if w in WORD_TO_VECTOR: e.append(WORD_TO_VECTOR[w]) else: sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw)) return e read()