embeddings/process.py @ 469:4335309f4924

Split into preprocess for words and sequences

author    Joseph Turian <turian@iro.umontreal.ca>
date      Tue, 21 Oct 2008 16:32:06 -0400
parents   a07948f780b9
children  90a76a8238e8
""" Read in the weights file """ import string import sys from parameters import * __words = None __word_to_embedding = None __read = False def word_to_embedding(w): read_embeddings() return __word_to_embedding[w] def read_embeddings(): global __words global __word_to_embedding global __read if __read: return __words = [string.strip(w) for w in open(VOCABFILE).readlines()] assert len(__words) == NUMBER_OF_WORDS import numpy, math from common.str import percent __word_to_embedding = {} sys.stderr.write("Reading %s...\n" % WEIGHTSFILE) f = open(WEIGHTSFILE) f.readline() vals = [float(v) for v in string.split(f.readline())] assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS for i in range(NUMBER_OF_WORDS): l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)] w = __words[i] __word_to_embedding[w] = l __read = True sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) import re numberre = re.compile("[0-9]") slashre = re.compile("\\\/") def preprocess_word(origw): """ Convert a word so that it can be embedded directly. Returned the preprocessed sequence. @note: Preprocessing is appropriate for Penn Treebank style documents. """ read_embeddings() if origw == "-LRB-": w = "(" elif origw == "-RRB-": w = ")" elif origw == "-LCB-": w = "{" elif origw == "-RCB-": w = "}" elif origw == "-LSB-": w = "[" elif origw == "-RSB-": w = "]" else: w = origw w = string.lower(w) w = slashre.sub("/", w) w = numberre.sub("NUMBER", w) if w not in __word_to_embedding: # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) w = UNKNOWN assert w in __word_to_embedding return w def preprocess_seq(l): """ Convert a sequence so that it can be embedded directly. Returned the preprocessed sequence. @note: Preprocessing is appropriate for Penn Treebank style documents. """ read_embeddings() lnew = [] for origw in l: w = preprocess_word(origw) lnew.append(w) return lnew #def convert_string(s, strict=False): # """ # Convert a string to a sequence of embeddings. # @param strict: If strict, then words *must* be in the vocabulary. # @todo: DEPRECATED Remove this function. # """ # read_embeddings() # e = [] # for origw in string.split(string.lower(s)): # w = numberre.sub("NUMBER", origw) # if w in __word_to_embedding: # e.append(__word_to_embedding[w]) # else: # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) # assert not strict # e.append(__word_to_embedding[UNKNOWN]) # return e #def test(): # """ # Debugging code. # """ # read_embeddings() # for w in __word_to_embedding: # assert len(__word_to_embedding[w]) == 50 # import numpy # for w1 in __words: # e1 = numpy.asarray(__word_to_embedding[w1]) # lst = [] # print w1, numpy.dot(e1, e1) # for w2 in __word_to_embedding: # if w1 >= w2: continue # e2 = numpy.asarray(__word_to_embedding[w2]) # d = (e1 - e2) # l2 = numpy.dot(d, d) # lst.append((l2, w1, w2)) # lst.sort() # print lst[:10] # #test()