ift6266: deep/autoencoder/DA_training.py comparison

comparison deep/autoencoder/DA_training.py @ 190:70a9df1cd20e

initial commit for autoencoder training

author	youssouf
date	Tue, 02 Mar 2010 09:52:27 -0500
parents
children	e12702b88a2d

comparison

equal deleted inserted replaced

-:0d0677773533
+:70a9df1cd20e
+"""
+This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
+Denoising autoencoders are the building blocks for SDAE.
+They are based on auto-encoders as the ones used in Bengio et al. 2007.
+An autoencoder takes an input x and first maps it to a hidden representation
+y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
+latent representation y is then mapped back to a "reconstructed" vector
+z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight
+matrix W' can optionally be constrained such that W' = W^T, in which case
+the autoencoder is said to have tied weights. The network is trained such
+that to minimize the reconstruction error (the error between x and z).
+For the denosing autoencoder, during training, first x is corrupted into
+\tilde{x}, where \tilde{x} is a partially destroyed version of x by means
+of a stochastic mapping. Afterwards y is computed as before (using
+\tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
+error is now measured between z and the uncorrupted input x, which is
+computed as the cross-entropy :
+- \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
+For X iteration of the main program loop it takes *** minutes on an
+Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
+References :
+- P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
+Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+2008
+- Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
+Training of Deep Networks, Advances in Neural Information Processing
+Systems 19, 2007
+"""
+import numpy
+import theano
+import time
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+import gzip
+import cPickle
+from pylearn.io import filetensor as ft
+class dA():
+"""Denoising Auto-Encoder class (dA)
+A denoising autoencoders tries to reconstruct the input from a corrupted
+version of it by projecting it first in a latent space and reprojecting
+it afterwards back in the input space. Please refer to Vincent et al.,2008
+for more details. If x is the input then equation (1) computes a partially
+destroyed version of x by means of a stochastic mapping q_D. Equation (2)
+computes the projection of the input into the latent space. Equation (3)
+computes the reconstruction of the input, while equation (4) computes the
+reconstruction error.
+.. math::
+\tilde{x} ~ q_D(\tilde{x}|x)                                         (1)
+y = s(W \tilde{x} + b)                                               (2)
+z = s(W' y  + b')                                                    (3)
+L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]          (4)
+"""
+def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None):
+"""
+Initialize the DAE class by specifying the number of visible units (the
+dimension d of the input ), the number of hidden units ( the dimension
+d' of the latent or hidden space ) and by giving a symbolic variable
+for the input. Such a symbolic variable is useful when the input is
+the result of some computations. For example when dealing with SDAEs,
+the dA on layer 2 gets as input the output of the DAE on layer 1.
+This output can be written as a function of the input to the entire
+model, and as such can be computed by theano whenever needed.
+:param n_visible: number of visible units
+:param n_hidden:  number of hidden units
+:param input:     a symbolic description of the input or None
+"""
+self.n_visible = n_visible
+self.n_hidden  = n_hidden
+# create a Theano random generator that gives symbolic random values
+theano_rng = RandomStreams()
+# create a numpy random generator
+numpy_rng = numpy.random.RandomState()
+# initial values for weights and biases
+# note : W' was written as `W_prime` and b' as `b_prime`
+# W is initialized with `initial_W` which is uniformely sampled
+# from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+# the output of uniform if converted using asarray to dtype
+# theano.config.floatX so that the code is runable on GPU
+initial_W = numpy.asarray( numpy.random.uniform( \
+low = -numpy.sqrt(6./(n_visible+n_hidden)), \
+high = numpy.sqrt(6./(n_visible+n_hidden)), \
+size = (n_visible, n_hidden)), dtype = theano.config.floatX)
+initial_b       = numpy.zeros(n_hidden)
+# W' is initialized with `initial_W_prime` which is uniformely sampled
+# from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+# the output of uniform if converted using asarray to dtype
+# theano.config.floatX so that the code is runable on GPU
+initial_b_prime= numpy.zeros(n_visible)
+# theano shared variables for weights and biases
+self.W       = theano.shared(value = initial_W,       name = "W")
+self.b       = theano.shared(value = initial_b,       name = "b")
+# tied weights, therefore W_prime is W transpose
+self.W_prime = self.W.T
+self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
+# if no input is given, generate a variable representing the input
+if input == None :
+# we use a matrix because we expect a minibatch of several examples,
+# each example being a row
+x = T.dmatrix(name = 'input')
+else:
+x = input
+# Equation (1)
+# note : first argument of theano.rng.binomial is the shape(size) of
+#        random numbers that it should produce
+#        second argument is the number of trials
+#        third argument is the probability of success of any trial
+#
+#        this will produce an array of 0s and 1s where 1 has a
+#        probability of 0.9 and 0 of 0.1
+tilde_x  = theano_rng.binomial( x.shape,  1,  1-complexity) * x
+# Equation (2)
+# note  : y is stored as an attribute of the class so that it can be
+#         used later when stacking dAs.
+self.y   = T.nnet.sigmoid(T.dot(tilde_x, self.W      ) + self.b)
+# Equation (3)
+z        = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+# Equation (4)
+self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 )
+# note : L is now a vector, where each element is the cross-entropy cost
+#        of the reconstruction of the corresponding example of the
+#        minibatch. We need to compute the average of all these to get
+#        the cost of the minibatch
+self.cost = T.mean(self.L)
+# note : y is computed from the corrupted `tilde_x`. Later on,
+#        we will need the hidden layer obtained from the uncorrupted
+#        input when for example we will pass this as input to the layer
+#        above
+self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
+def sgd_optimization_nist( learning_rate=0.01,  \
+n_iter = 300, n_code_layer = 400, \
+complexity = 0.1):
+"""
+Demonstrate stochastic gradient descent optimization for a denoising autoencoder
+This is demonstrated on MNIST.
+:param learning_rate: learning rate used (factor for the stochastic
+gradient
+:param pretraining_epochs: number of epoch to do pretraining
+:param pretrain_lr: learning rate to be used during pre-training
+:param n_iter: maximal number of iterations ot run the optimizer
+"""
+#open file to save the validation and test curve
+filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
+'c_' + str(complexity) + '.txt'
+result_file = open(filename, 'w')
+data_path = '/data/lisa/data/nist/by_class/'
+f = open(data_path+'all/all_train_data.ft')
+g = open(data_path+'all/all_train_labels.ft')
+h = open(data_path+'all/all_test_data.ft')
+i = open(data_path+'all/all_test_labels.ft')
+train_set_x = ft.read(f)
+train_set_y = ft.read(g)
+test_set_x = ft.read(h)
+test_set_y = ft.read(i)
+f.close()
+g.close()
+i.close()
+h.close()
+# make minibatches of size 20
+batch_size = 20    # sized of the minibatch
+#create a validation set the same size as the test size
+#use the end of the training array for this purpose
+#discard the last remaining so we get a %batch_size number
+test_size=len(test_set_y)
+test_size = int(test_size/batch_size)
+test_size*=batch_size
+train_size = len(train_set_x)
+train_size = int(train_size/batch_size)
+train_size*=batch_size
+validation_size =test_size
+offset = train_size-test_size
+if True:
+print 'train size = %d' %train_size
+print 'test size = %d' %test_size
+print 'valid size = %d' %validation_size
+print 'offset = %d' %offset
+#train_set = (train_set_x,train_set_y)
+train_batches = []
+for i in xrange(0, train_size-test_size, batch_size):
+train_batches = train_batches + \
+[(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+test_batches = []
+for i in xrange(0, test_size, batch_size):
+test_batches = test_batches + \
+[(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+valid_batches = []
+for i in xrange(0, test_size, batch_size):
+valid_batches = valid_batches + \
+[(train_set_x[offset+i:offset+i+batch_size], \
+train_set_y[offset+i:offset+i+batch_size])]
+ishape     = (32,32) # this is the size of NIST images
+# allocate symbolic variables for the data
+x = T.fmatrix()  # the data is presented as rasterized images
+y = T.lvector()  # the labels are presented as 1D vector of
+# [long int] labels
+# construct the denoising autoencoder class
+n_ins = 32*32
+encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
+# Train autoencoder
+# compute gradients of the layer parameters
+gW       = T.grad(encoder.cost, encoder.W)
+gb       = T.grad(encoder.cost, encoder.b)
+gb_prime = T.grad(encoder.cost, encoder.b_prime)
+# compute the updated value of the parameters after one step
+updated_W       = encoder.W       - gW       * learning_rate
+updated_b       = encoder.b       - gb       * learning_rate
+updated_b_prime = encoder.b_prime - gb_prime * learning_rate
+# defining the function that evaluate the symbolic description of
+# one update step
+train_model = theano.function([x], encoder.cost, updates=\
+{ encoder.W       : updated_W, \
+encoder.b       : updated_b, \
+encoder.b_prime : updated_b_prime } )
+# compiling a theano function that computes the mistakes that are made
+# by the model on a minibatch
+test_model = theano.function([x], encoder.cost)
+normalize = numpy.asarray(255, dtype=theano.config.floatX)
+n_minibatches        = len(train_batches)
+# early-stopping parameters
+patience              = 10000000 / batch_size # look as this many examples regardless
+patience_increase     = 2     # wait this much longer when a new best is
+# found
+improvement_threshold = 0.995 # a relative improvement of this much is
+# considered significant
+validation_frequency  = n_minibatches  # go through this many
+# minibatche before checking the network
+# on the validation set; in this case we
+# check every epoch
+best_params          = None
+best_validation_loss = float('inf')
+best_iter            = 0
+test_score           = 0.
+start_time = time.clock()
+# have a maximum of `n_iter` iterations through the entire dataset
+for iter in xrange(n_iter* n_minibatches):
+# get epoch and minibatch index
+epoch           = iter / n_minibatches
+minibatch_index =  iter % n_minibatches
+# get the minibatches corresponding to `iter` modulo
+# `len(train_batches)`
+x,y = train_batches[ minibatch_index ]
+'''
+if iter == 0:
+b = numpy.asarray(255, dtype=theano.config.floatX)
+x = x / b
+print x
+print y
+print x.__class__
+print x.shape
+print x.dtype.name
+print y.dtype.name
+print x.min(), x.max()
+'''
+cost_ij = train_model(x/normalize)
+if (iter+1) % validation_frequency == 0:
+# compute zero-one loss on validation set
+this_validation_loss = 0.
+for x,y in valid_batches:
+# sum up the errors for each minibatch
+this_validation_loss += test_model(x/normalize)
+# get the average by dividing with the number of minibatches
+this_validation_loss /= len(valid_batches)
+print('epoch %i, minibatch %i/%i, validation error %f ' % \
+(epoch, minibatch_index+1, n_minibatches, \
+this_validation_loss))
+# save value in file
+result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
+# if we got the best validation score until now
+if this_validation_loss < best_validation_loss:
+#improve patience if loss improvement is good enough
+if this_validation_loss < best_validation_loss *  \
+improvement_threshold :
+patience = max(patience, iter * patience_increase)
+best_validation_loss = this_validation_loss
+best_iter = iter
+# test it on the test set
+test_score = 0.
+for x,y in test_batches:
+test_score += test_model(x/normalize)
+test_score /= len(test_batches)
+print(('     epoch %i, minibatch %i/%i, test error of best '
+'model %f ') %
+(epoch, minibatch_index+1, n_minibatches,
+test_score))
+if patience <= iter :
+print('iter (%i) is superior than patience(%i). break', iter, patience)
+break
+end_time = time.clock()
+print(('Optimization complete with best validation score of %f ,'
+'with test performance %f ') %
+(best_validation_loss, test_score))
+print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+result_file.close()
+return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
+def sgd_optimization_mnist( learning_rate=0.01,  \
+n_iter = 1, n_code_layer = 400, \
+complexity = 0.1):
+"""
+Demonstrate stochastic gradient descent optimization for a denoising autoencoder
+This is demonstrated on MNIST.
+:param learning_rate: learning rate used (factor for the stochastic
+gradient
+:param pretraining_epochs: number of epoch to do pretraining
+:param pretrain_lr: learning rate to be used during pre-training
+:param n_iter: maximal number of iterations ot run the optimizer
+"""
+#open file to save the validation and test curve
+filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
+'c_' + str(complexity) + '.txt'
+result_file = open(filename, 'w')
+# Load the dataset
+f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb')
+train_set, valid_set, test_set = cPickle.load(f)
+f.close()
+# make minibatches of size 20
+batch_size = 20    # sized of the minibatch
+# Dealing with the training set
+# get the list of training images (x) and their labels (y)
+(train_set_x, train_set_y) = train_set
+# initialize the list of training minibatches with empty list
+train_batches = []
+for i in xrange(0, len(train_set_x), batch_size):
+# add to the list of minibatches the minibatch starting at
+# position i, ending at position i+batch_size
+# a minibatch is a pair ; the first element of the pair is a list
+# of datapoints, the second element is the list of corresponding
+# labels
+train_batches = train_batches + \
+[(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+# Dealing with the validation set
+(valid_set_x, valid_set_y) = valid_set
+# initialize the list of validation minibatches
+valid_batches = []
+for i in xrange(0, len(valid_set_x), batch_size):
+valid_batches = valid_batches + \
+[(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
+# Dealing with the testing set
+(test_set_x, test_set_y) = test_set
+# initialize the list of testing minibatches
+test_batches = []
+for i in xrange(0, len(test_set_x), batch_size):
+test_batches = test_batches + \
+[(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+ishape     = (28,28) # this is the size of MNIST images
+# allocate symbolic variables for the data
+x = T.fmatrix()  # the data is presented as rasterized images
+y = T.lvector()  # the labels are presented as 1D vector of
+# [long int] labels
+# construct the denoising autoencoder class
+n_ins = 28*28
+encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
+# Train autoencoder
+# compute gradients of the layer parameters
+gW       = T.grad(encoder.cost, encoder.W)
+gb       = T.grad(encoder.cost, encoder.b)
+gb_prime = T.grad(encoder.cost, encoder.b_prime)
+# compute the updated value of the parameters after one step
+updated_W       = encoder.W       - gW       * learning_rate
+updated_b       = encoder.b       - gb       * learning_rate
+updated_b_prime = encoder.b_prime - gb_prime * learning_rate
+# defining the function that evaluate the symbolic description of
+# one update step
+train_model = theano.function([x], encoder.cost, updates=\
+{ encoder.W       : updated_W, \
+encoder.b       : updated_b, \
+encoder.b_prime : updated_b_prime } )
+# compiling a theano function that computes the mistakes that are made
+# by the model on a minibatch
+test_model = theano.function([x], encoder.cost)
+n_minibatches        = len(train_batches)
+# early-stopping parameters
+patience              = 10000# look as this many examples regardless
+patience_increase     = 2     # wait this much longer when a new best is
+# found
+improvement_threshold = 0.995 # a relative improvement of this much is
+# considered significant
+validation_frequency  = n_minibatches  # go through this many
+# minibatche before checking the network
+# on the validation set; in this case we
+# check every epoch
+best_params          = None
+best_validation_loss = float('inf')
+best_iter            = 0
+test_score           = 0.
+start_time = time.clock()
+# have a maximum of `n_iter` iterations through the entire dataset
+for iter in xrange(n_iter* n_minibatches):
+# get epoch and minibatch index
+epoch           = iter / n_minibatches
+minibatch_index =  iter % n_minibatches
+# get the minibatches corresponding to `iter` modulo
+# `len(train_batches)`
+x,y = train_batches[ minibatch_index ]
+cost_ij = train_model(x)
+if (iter+1) % validation_frequency == 0:
+# compute zero-one loss on validation set
+this_validation_loss = 0.
+for x,y in valid_batches:
+# sum up the errors for each minibatch
+this_validation_loss += test_model(x)
+# get the average by dividing with the number of minibatches
+this_validation_loss /= len(valid_batches)
+print('epoch %i, minibatch %i/%i, validation error %f ' % \
+(epoch, minibatch_index+1, n_minibatches, \
+this_validation_loss))
+# save value in file
+result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
+# if we got the best validation score until now
+if this_validation_loss < best_validation_loss:
+#improve patience if loss improvement is good enough
+if this_validation_loss < best_validation_loss *  \
+improvement_threshold :
+patience = max(patience, iter * patience_increase)
+best_validation_loss = this_validation_loss
+best_iter = iter
+# test it on the test set
+test_score = 0.
+for x,y in test_batches:
+test_score += test_model(x)
+test_score /= len(test_batches)
+print(('     epoch %i, minibatch %i/%i, test error of best '
+'model %f ') %
+(epoch, minibatch_index+1, n_minibatches,
+test_score))
+if patience <= iter :
+print('iter (%i) is superior than patience(%i). break', iter, patience)
+break
+end_time = time.clock()
+print(('Optimization complete with best validation score of %f ,'
+'with test performance %f ') %
+(best_validation_loss, test_score))
+print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+result_file.close()
+return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
+def experiment(state,channel):
+(best_validation_loss, test_score, minutes_trained, iter) = \
+sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer,
+state.complexity)
+state.best_validation_loss = best_validation_loss
+state.test_score = test_score
+state.minutes_trained = minutes_trained
+state.iter = iter
+return channel.COMPLETE
+def experiment_nist(state,channel):
+(best_validation_loss, test_score, minutes_trained, iter) = \
+sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer,
+state.complexity)
+state.best_validation_loss = best_validation_loss
+state.test_score = test_score
+state.minutes_trained = minutes_trained
+state.iter = iter
+return channel.COMPLETE
+if __name__ == '__main__':
+sgd_optimization_nist()

Mercurial > ift6266

comparison deep/autoencoder/DA_training.py @ 190:70a9df1cd20e