Mercurial > pylearn
view sandbox/embeddings/read.py @ 460:fda72e944104
\/ -> /
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 23:07:23 -0400 |
parents | f400f62e7f9e |
children |
line wrap: on
line source
""" Read in the weights file """ import string import sys from files import * WORDS = None WORD_TO_VECTOR = None __read = False def read_embeddings(): global WORDS global WORD_TO_VECTOR global __read if __read: return WORDS = [string.strip(w) for w in open(VOCABFILE).readlines()] assert len(WORDS) == NUMBER_OF_WORDS import numpy, math from common.str import percent WORD_TO_VECTOR = {} sys.stderr.write("Reading %s...\n" % WEIGHTSFILE) f = open(WEIGHTSFILE) f.readline() vals = [float(v) for v in string.split(f.readline())] assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS vals.reverse() for i in range(NUMBER_OF_WORDS): l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)] w = WORDS[i] WORD_TO_VECTOR[w] = l __read = True import re numberre = re.compile("[0-9]") slashre = re.compile("\\\/") def preprocess(l): """ Convert a sequence so that it can be embedded directly. Returned the preprocessed sequence. @note: Preprocessing is appropriate for Penn Treebank style documents. """ read_embeddings() lnew = [] for origw in l: if origw == "-LRB-": w = "(" elif origw == "-RRB-": w = ")" elif origw == "-LCB-": w = "{" elif origw == "-RCB-": w = "}" elif origw == "-LSB-": w = "[" elif origw == "-RSB-": w = "]" else: w = origw w = string.lower(w) w = slashre.sub("/", w) w = numberre.sub("NUMBER", w) if w not in WORD_TO_VECTOR: sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) w = UNKNOWN assert w in WORD_TO_VECTOR lnew.append(w) return lnew #def convert_string(s, strict=False): # """ # Convert a string to a sequence of embeddings. # @param strict: If strict, then words *must* be in the vocabulary. # @todo: DEPRECATED Remove this function. # """ # read_embeddings() # e = [] # for origw in string.split(string.lower(s)): # w = numberre.sub("NUMBER", origw) # if w in WORD_TO_VECTOR: # e.append(WORD_TO_VECTOR[w]) # else: # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) # assert not strict # e.append(WORD_TO_VECTOR[UNKNOWN]) # return e