Mercurial > pylearn
view sandbox/embeddings/read.py @ 458:ed6b0b3be8d2
Polished embeddings module
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 19:13:53 -0400 |
parents | sandbox/embeddings/original.py@131e19dfe793 |
children | f400f62e7f9e |
line wrap: on
line source
"""Read in the weights file and convert text to sequences of embeddings.

The vocabulary/weights file locations and the expected sizes
(``VOCABFILE``, ``WEIGHTSFILE``, ``NUMBER_OF_WORDS``, ``DIMENSIONS``) are not
defined in this module; they are expected to be supplied by the star-import
of ``files`` below.
"""

import re
import string  # no longer used here; kept so code relying on this name still works
import sys

from files import *

# Vocabulary words, in the order they appear in VOCABFILE (None until loaded).
WORDS = None
# Maps each vocabulary word to its embedding, a list of DIMENSIONS floats
# (None until loaded).
WORD_TO_VECTOR = None
# Set to True once read_embeddings() has completed successfully.
__read = False


def read_embeddings():
    """Load the vocabulary and the embedding weights into module globals.

    Populates ``WORDS`` and ``WORD_TO_VECTOR``. The work is done at most
    once: after a successful load, later calls return immediately.

    The first line of ``WEIGHTSFILE`` is skipped; the second line must hold
    ``NUMBER_OF_WORDS * DIMENSIONS`` whitespace-separated floats.

    Raises:
        AssertionError: if the vocabulary size or the number of weights
            does not match the expected constants.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # One word per line; surrounding whitespace (including the newline) is
    # stripped. The context manager guarantees the handle is closed.
    with open(VOCABFILE) as vocab_file:
        words = [line.strip() for line in vocab_file]
    assert len(words) == NUMBER_OF_WORDS, (
        "expected %d words in %s, found %d"
        % (NUMBER_OF_WORDS, VOCABFILE, len(words)))

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weights_file:
        weights_file.readline()  # the first line is not part of the weights
        vals = [float(v) for v in weights_file.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, (
        "expected %d weights in %s, found %d"
        % (NUMBER_OF_WORDS * DIMENSIONS, WEIGHTSFILE, len(vals)))

    # NOTE(review): the values are reversed before slicing, so word 0 gets
    # the *last* DIMENSIONS values of the line, in reverse order. Presumably
    # this matches how the weights file was written -- confirm against the
    # code that produced it.
    vals.reverse()

    vectors = {}
    for i, word in enumerate(words):
        vectors[word] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Publish only after everything validated, so a failed load never leaves
    # the globals half-initialised.
    WORDS = words
    WORD_TO_VECTOR = vectors
    __read = True


# Matches a single decimal digit; every match is replaced individually.
numberre = re.compile("[0-9]")


def convert_string(s):
    """Convert a string to a sequence of embeddings.

    The string is lower-cased and split on whitespace, and every digit in a
    token is replaced by the literal text ``NUMBER`` before the lookup.
    Tokens that are not in the vocabulary are reported on stderr and
    skipped, so the result may have fewer entries than ``s`` has tokens.

    Args:
        s: the text to convert.

    Returns:
        A list with one embedding (the list stored in ``WORD_TO_VECTOR``)
        per token found in the vocabulary, in order of appearance.
    """
    read_embeddings()
    e = []
    for origw in s.lower().split():
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e