view deep/autoencoder/DA_training.py @ 239:42005ec87747

Merged (manually) Sylvain's changes to use Arnaud's dataset code, with the difference that I don't use the givens. I probably also have a different approach for limiting the dataset size in my debugging.
author fsavard
date Mon, 15 Mar 2010 18:30:21 -0400
parents e12702b88a2d

"""
 This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.

 Denoising autoencoders are the building blocks of SdAs.
 They are based on the auto-encoders used in Bengio et al., 2007.
 An autoencoder takes an input x and first maps it to a hidden representation
 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
 latent representation y is then mapped back to a "reconstructed" vector
 z \in [0,1]^d in input space: z = g_{\theta'}(y) = s(W'y + b'). The weight
 matrix W' can optionally be constrained such that W' = W^T, in which case
 the autoencoder is said to have tied weights. The network is trained to
 minimize the reconstruction error (the error between x and z).

 For the denoising autoencoder, during training, x is first corrupted into
 \tilde{x}, a partially destroyed version of x obtained by means of a
 stochastic mapping. Afterwards y is computed as before (using \tilde{x}),
 y = s(W\tilde{x} + b), and z as s(W'y + b'). The reconstruction error is
 now measured between z and the uncorrupted input x, and is computed as the
 cross-entropy:
      - \sum_{k=1}^d [ x_k \log z_k + (1-x_k) \log(1-z_k) ]

 For X iterations of the main program loop it takes *** minutes on an
 Intel Core i7 and *** minutes on a GPU (NVIDIA GTX 285).


 References :
   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
   2008
   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
   Training of Deep Networks, Advances in Neural Information Processing 
   Systems 19, 2007

"""

import numpy 
import theano
import time
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

import gzip
import cPickle

from pylearn.io import filetensor as ft
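
# The function below is a small, self-contained numerical sketch of the
# equations in the module docstring above (added purely as illustration;
# nothing else in this file uses it): corrupt x into \tilde{x} by zeroing a
# fraction `complexity` of its entries, encode it to y, decode to z with tied
# weights, and measure the cross-entropy L(x, z). The toy sizes and the
# parameter name `complexity` simply mirror the dA class defined below.
def _toy_dA_forward_pass(complexity=0.1, n_visible=8, n_hidden=4, seed=0):
    rng = numpy.random.RandomState(seed)
    sigmoid = lambda a: 1.0 / (1.0 + numpy.exp(-a))
    # toy parameters and a single input in [0,1]^d
    W = rng.uniform(-0.1, 0.1, size=(n_visible, n_hidden))
    b = numpy.zeros(n_hidden)
    b_prime = numpy.zeros(n_visible)
    x = rng.uniform(0.0, 1.0, size=n_visible)
    # Equation (1): keep each component with probability 1-complexity
    mask = rng.binomial(1, 1.0 - complexity, size=n_visible)
    tilde_x = mask * x
    # Equations (2) and (3): encode the corrupted input, decode with W' = W^T
    y = sigmoid(numpy.dot(tilde_x, W) + b)
    z = sigmoid(numpy.dot(y, W.T) + b_prime)
    # Equation (4): cross-entropy between the reconstruction z and the clean x
    return -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z))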

class dA():
  """Denoising Auto-Encoder class (dA) 

  A denoising autoencoder tries to reconstruct the input from a corrupted
  version of it by first projecting it into a latent space and then
  reprojecting it back into the input space. Please refer to Vincent et al.,
  2008 for more details. If x is the input then equation (1) computes a
  partially destroyed version of x by means of a stochastic mapping q_D.
  Equation (2) computes the projection of the input into the latent space.
  Equation (3) computes the reconstruction of the input, while equation (4)
  computes the reconstruction error.

  .. math::

    \tilde{x} \sim q_D(\tilde{x}|x)                                      (1)

    y = s(W \tilde{x} + b)                                               (2)

    z = s(W' y + b')                                                     (3)

    L(x,z) = -\sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]          (4)

  """

  def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input), the number of hidden units (the dimension
        d' of the latent or hidden space) and by giving a symbolic variable
        for the input. Such a symbolic variable is useful when the input is
        the result of some computation. For example, when dealing with SdAs,
        the dA on layer 2 gets as input the output of the dA on layer 1.
        This output can be written as a function of the input to the entire
        model, and as such can be computed by Theano whenever needed.

        :param n_visible: number of visible units

        :param n_hidden:  number of hidden units

        :param complexity: corruption level, i.e. the probability with which
        each input component is zeroed out during training

        :param input:     a symbolic description of the input or None

        """
        self.n_visible = n_visible
        self.n_hidden  = n_hidden
        
        # create a Theano random generator that gives symbolic random values
        theano_rng = RandomStreams()
        # create a numpy random generator
        numpy_rng = numpy.random.RandomState()
		
        # print the parameters of the dA
        if True :
            print 'input size = %d' %n_visible
            print 'hidden size = %d' %n_hidden
            print 'complexity = %2.2f' %complexity
         
        # initial values for weights and biases
        # note : W' was written as `W_prime` and b' as `b_prime`

        # W is initialized with `initial_W`, which is uniformly sampled
        # between -sqrt(6./(n_visible+n_hidden)) and sqrt(6./(n_visible+n_hidden));
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        initial_W = numpy.asarray( numpy_rng.uniform( \
                  low = -numpy.sqrt(6./(n_visible+n_hidden)), \
                  high = numpy.sqrt(6./(n_visible+n_hidden)), \
                  size = (n_visible, n_hidden)), dtype = theano.config.floatX)
        initial_b       = numpy.zeros(n_hidden, dtype = theano.config.floatX)

        # W' does not need its own initial value since the weights are tied
        # (W' = W^T); only the visible bias b' gets its own parameter,
        # initialized to zeros
        initial_b_prime = numpy.zeros(n_visible, dtype = theano.config.floatX)
         
        
        # theano shared variables for weights and biases
        self.W       = theano.shared(value = initial_W,       name = "W")
        self.b       = theano.shared(value = initial_b,       name = "b")
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T 
        self.b_prime = theano.shared(value = initial_b_prime, name = "b'")

        # if no input is given, generate a variable representing the input
        if input is None : 
            # we use a matrix because we expect a minibatch of several examples,
            # each example being a row
            x = T.dmatrix(name = 'input') 
        else:
            x = input
        # Equation (1)
        # note : the first argument of theano_rng.binomial is the shape (size)
        #        of the random numbers that it should produce,
        #        the second argument is the number of trials,
        #        the third argument is the probability of success of any trial
        #
        #        this will produce an array of 0s and 1s where 1 has a 
        #        probability of 1-complexity and 0 a probability of complexity

        tilde_x  = theano_rng.binomial( x.shape,  1,  1-complexity) * x
        # Equation (2)
        # note  : y is stored as an attribute of the class so that it can be 
        #         used later when stacking dAs. 
        self.y   = T.nnet.sigmoid(T.dot(tilde_x, self.W      ) + self.b)
        # Equation (3)
        z        = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
        # Equation (4)
        self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 ) 
        # note : L is now a vector, where each element is the cross-entropy cost 
        #        of the reconstruction of the corresponding example of the 
        #        minibatch. We need to compute the average of all these to get 
        #        the cost of the minibatch
        self.cost = T.mean(self.L)
        # note : y is computed from the corrupted `tilde_x`. Later on, 
        #        we will need the hidden layer obtained from the uncorrupted 
        #        input when for example we will pass this as input to the layer 
        #        above
        self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
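
# A minimal stacking sketch (assumed usage; nothing in this file calls it).
# As noted in the __init__ docstring, when building an SdA the second dA takes
# the first dA's clean hidden representation as its symbolic input, which is
# exactly what `hidden_values` is stored for. The layer sizes here are
# arbitrary illustrations, not settings used elsewhere in this file.
def _stacked_dA_sketch():
    x = T.dmatrix('x')
    layer1 = dA(n_visible=784, n_hidden=500, complexity=0.1, input=x)
    # the uncorrupted code of layer 1 feeds layer 2, so layer2.cost is still
    # a symbolic function of the original input x
    layer2 = dA(n_visible=500, n_hidden=250, complexity=0.1,
                input=layer1.hidden_values)
    return layer1, layer2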



def sgd_optimization_nist( learning_rate=0.01,  \
                            n_iter = 300, n_code_layer = 400, \
                            complexity = 0.1):
    """
    Demonstrate stochastic gradient descent optimization for a denoising
    autoencoder.

    This is demonstrated on NIST.

    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :param n_iter: maximal number of iterations to run the optimizer

    :param n_code_layer: number of hidden units of the dA

    :param complexity: corruption level used for the dA

    """
    #open file to save the validation and test curve
    filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
    'c_' + str(complexity) + '.txt'

    result_file = open(filename, 'w')



    data_path = '/data/lisa/data/nist/by_class/'
    f = open(data_path+'all/all_train_data.ft')
    g = open(data_path+'all/all_train_labels.ft')
    h = open(data_path+'all/all_test_data.ft')
    i = open(data_path+'all/all_test_labels.ft')
    
    train_set_x = ft.read(f)
    train_set_y = ft.read(g)
    test_set_x = ft.read(h)
    test_set_y = ft.read(i)
    
    f.close()
    g.close()
    i.close()
    h.close()

    # make minibatches of size 20 
    batch_size = 20    # size of the minibatch

    # create a validation set of the same size as the test set
    # use the end of the training array for this purpose
    # discard the last remaining examples so the sizes are multiples of batch_size
    test_size=len(test_set_y)
    test_size = int(test_size/batch_size)
    test_size*=batch_size
    train_size = len(train_set_x)
    train_size = int(train_size/batch_size)
    train_size*=batch_size
    validation_size =test_size 
    offset = train_size-test_size
    if True:
        print 'train size = %d' %train_size
        print 'test size = %d' %test_size
        print 'valid size = %d' %validation_size
        print 'offset = %d' %offset
    
    
    #train_set = (train_set_x,train_set_y)
    train_batches = []
    for i in xrange(0, train_size-test_size, batch_size):
        train_batches = train_batches + \
            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
            
    test_batches = []
    for i in xrange(0, test_size, batch_size):
        test_batches = test_batches + \
            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
    
    valid_batches = []
    for i in xrange(0, test_size, batch_size):
        valid_batches = valid_batches + \
            [(train_set_x[offset+i:offset+i+batch_size], \
            train_set_y[offset+i:offset+i+batch_size])]


    ishape     = (32,32) # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of 
                          # [long int] labels

    # construct the denoising autoencoder class
    n_ins = 32*32
    encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins)))

    # Train autoencoder
    
    # compute gradients of the layer parameters
    gW       = T.grad(encoder.cost, encoder.W)
    gb       = T.grad(encoder.cost, encoder.b)
    gb_prime = T.grad(encoder.cost, encoder.b_prime)
    # compute the updated value of the parameters after one step
    updated_W       = encoder.W       - gW       * learning_rate
    updated_b       = encoder.b       - gb       * learning_rate
    updated_b_prime = encoder.b_prime - gb_prime * learning_rate

    # defining the function that evaluate the symbolic description of 
    # one update step 
    train_model = theano.function([x], encoder.cost, updates=\
                    { encoder.W       : updated_W, \
                      encoder.b       : updated_b, \
                      encoder.b_prime : updated_b_prime } )


 

    # compiling a theano function that computes the reconstruction cost
    # of the model on a minibatch
    test_model = theano.function([x], encoder.cost)

    normalize = numpy.asarray(255, dtype=theano.config.floatX)

  
    n_minibatches        = len(train_batches) 
 
    # early-stopping parameters
    patience              = 10000000 / batch_size # look at this many examples regardless
    patience_increase     = 2     # wait this much longer when a new best is 
                                  # found
    improvement_threshold = 0.995 # a relative improvement of this much is 
                                  # considered significant
    validation_frequency  = n_minibatches  # go through this many 
                                  # minibatches before checking the network 
                                  # on the validation set; in this case we 
                                  # check every epoch 
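    # For example: with batch_size = 20 the initial patience above is
    # 10000000 / 20 = 500000 minibatch updates, the validation cost is
    # evaluated once per epoch (every n_minibatches updates), and each
    # significant improvement (new loss below 0.995 * previous best) raises
    # patience to at least 2 * the current iteration, so training only stops
    # once it has run for roughly twice as long as the last improvement
    # without finding a better model.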


    best_params          = None
    best_validation_loss = float('inf')
    best_iter            = 0
    test_score           = 0.
    start_time = time.clock()
    # have a maximum of `n_iter` iterations through the entire dataset
    for iter in xrange(n_iter* n_minibatches):

        # get epoch and minibatch index
        epoch           = iter / n_minibatches
        minibatch_index =  iter % n_minibatches

        # get the minibatches corresponding to `iter` modulo
        # `len(train_batches)`
        x,y = train_batches[ minibatch_index ]
        '''
        if iter == 0:
            b = numpy.asarray(255, dtype=theano.config.floatX)
            x = x / b
            print x
            print y
            print x.__class__
            print x.shape
            print x.dtype.name
            print y.dtype.name
            print x.min(), x.max()
        '''
        
        cost_ij = train_model(x/normalize)

        if (iter+1) % validation_frequency == 0: 
            # compute the reconstruction cost on the validation set 
            this_validation_loss = 0.
            for x,y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x/normalize)
            # get the average by dividing with the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f ' % \
                   (epoch, minibatch_index+1, n_minibatches, \
                    this_validation_loss))

            # save value in file
            result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                #improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss *  \
                       improvement_threshold :
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = this_validation_loss
                best_iter = iter
                # test it on the test set
            
                test_score = 0.
                for x,y in test_batches:
                    test_score += test_model(x/normalize)
                test_score /= len(test_batches)
                print(('     epoch %i, minibatch %i/%i, test error of best '
                      'model %f ') % 
                             (epoch, minibatch_index+1, n_minibatches,
                              test_score))

        if patience <= iter :
            print('iter (%i) has reached patience (%i); stopping early' % (iter, patience))
            break

        

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f, '
           'with test performance %f') %
                 (best_validation_loss, test_score))
    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

    
    result_file.close()

    return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)

def sgd_optimization_mnist( learning_rate=0.01,  \
                            n_iter = 1, n_code_layer = 400, \
                            complexity = 0.1):
    """
    Demonstrate stochastic gradient descent optimization for a denoising
    autoencoder.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :param n_iter: maximal number of iterations to run the optimizer

    :param n_code_layer: number of hidden units of the dA

    :param complexity: corruption level used for the dA

    """
    #open file to save the validation and test curve
    filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
    'c_' + str(complexity) + '.txt'

    result_file = open(filename, 'w')

    # Load the dataset 
    f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20 
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set
    # initialize the list of training minibatches with empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at 
        # position i, ending at position i+batch_size
        # a minibatch is a pair ; the first element of the pair is a list 
        # of datapoints, the second element is the list of corresponding 
        # labels
        train_batches = train_batches + \
               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches 
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches 
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]


    ishape     = (28,28) # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of 
                          # [long int] labels

    # construct the denoising autoencoder class
    n_ins = 28*28
    encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins)))

    # Train autoencoder
    
    # compute gradients of the layer parameters
    gW       = T.grad(encoder.cost, encoder.W)
    gb       = T.grad(encoder.cost, encoder.b)
    gb_prime = T.grad(encoder.cost, encoder.b_prime)
    # compute the updated value of the parameters after one step
    updated_W       = encoder.W       - gW       * learning_rate
    updated_b       = encoder.b       - gb       * learning_rate
    updated_b_prime = encoder.b_prime - gb_prime * learning_rate

    # defining the function that evaluate the symbolic description of 
    # one update step 
    train_model = theano.function([x], encoder.cost, updates=\
                    { encoder.W       : updated_W, \
                      encoder.b       : updated_b, \
                      encoder.b_prime : updated_b_prime } )


 

    # compiling a theano function that computes the reconstruction cost
    # of the model on a minibatch
    test_model = theano.function([x], encoder.cost)



  
    n_minibatches        = len(train_batches) 
 
    # early-stopping parameters
    patience              = 10000 # look at this many examples regardless
    patience_increase     = 2     # wait this much longer when a new best is 
                                  # found
    improvement_threshold = 0.995 # a relative improvement of this much is 
                                  # considered significant
    validation_frequency  = n_minibatches  # go through this many 
                                  # minibatches before checking the network 
                                  # on the validation set; in this case we 
                                  # check every epoch 


    best_params          = None
    best_validation_loss = float('inf')
    best_iter            = 0
    test_score           = 0.
    start_time = time.clock()
    # have a maximum of `n_iter` iterations through the entire dataset
    for iter in xrange(n_iter* n_minibatches):

        # get epoch and minibatch index
        epoch           = iter / n_minibatches
        minibatch_index =  iter % n_minibatches

        # get the minibatches corresponding to `iter` modulo
        # `len(train_batches)`
        x,y = train_batches[ minibatch_index ]
        cost_ij = train_model(x)

        if (iter+1) % validation_frequency == 0: 
            # compute the reconstruction cost on the validation set 
            this_validation_loss = 0.
            for x,y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x)
            # get the average by dividing with the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f ' % \
                   (epoch, minibatch_index+1, n_minibatches, \
                    this_validation_loss))

            # save value in file
            result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                #improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss *  \
                       improvement_threshold :
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = this_validation_loss
                best_iter = iter
                # test it on the test set
            
                test_score = 0.
                for x,y in test_batches:
                    test_score += test_model(x)
                test_score /= len(test_batches)
                print(('     epoch %i, minibatch %i/%i, test error of best '
                      'model %f ') % 
                             (epoch, minibatch_index+1, n_minibatches,
                              test_score))

        if patience <= iter :
            print('iter (%i) has reached patience (%i); stopping early' % (iter, patience))
            break


    end_time = time.clock()
    print(('Optimization complete with best validation score of %f, '
           'with test performance %f') %
                 (best_validation_loss, test_score))
    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

    
    result_file.close()

    return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)


def experiment(state,channel):

    (best_validation_loss, test_score, minutes_trained, iter) = \
        sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer,
            state.complexity)

    state.best_validation_loss = best_validation_loss
    state.test_score = test_score
    state.minutes_trained = minutes_trained
    state.iter = iter

    return channel.COMPLETE

def experiment_nist(state,channel):

    (best_validation_loss, test_score, minutes_trained, iter) = \
        sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer,
            state.complexity)

    state.best_validation_loss = best_validation_loss
    state.test_score = test_score
    state.minutes_trained = minutes_trained
    state.iter = iter

    return channel.COMPLETE


if __name__ == '__main__':

    sgd_optimization_nist()