ift6266: comparison deep/stacked_dae/v_sylvain/stacked_dae.py @ 353:bc4464c0894c
Added a feature to allow a decreasing learning rate during pretraining
author | SylvainPL <sylvain.pannetier.lebeuf@umontreal.ca>
date | Wed, 21 Apr 2010 14:51:14 -0400
parents | 799ad23a161f
children | 14b28e43ce4e
352:cfb79f9fd1a4 | 353:bc4464c0894c |
---|---|
151 self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x | 151 self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x |
152 # Equation (2) | 152 # Equation (2) |
153 # note : y is stored as an attribute of the class so that it can be | 153 # note : y is stored as an attribute of the class so that it can be |
154 # used later when stacking dAs. | 154 # used later when stacking dAs. |
155 | 155 |
156 self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) | 156 ## self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) |
157 | 157 ## |
158 # Equation (3) | 158 ## # Equation (3) |
159 #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) | 159 ## #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) |
160 # Equation (4) | 160 ## # Equation (4) |
161 # note : we sum over the size of a datapoint; if we are using minibatches, | 161 ## # note : we sum over the size of a datapoint; if we are using minibatches, |
162 # L will be a vector, with one entry per example in minibatch | 162 ## # L will be a vector, with one entry per example in minibatch |
163 #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) | 163 ## #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) |
164 #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) | 164 ## #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) |
165 | 165 ## |
166 # bypassing z to avoid running to log(0) | 166 ## # bypassing z to avoid running to log(0) |
167 z_a = T.dot(self.y, self.W_prime) + self.b_prime | 167 ## z_a = T.dot(self.y, self.W_prime) + self.b_prime |
168 log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) | 168 ## log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) |
169 # log(1-sigmoid(z_a)) | 169 ## # log(1-sigmoid(z_a)) |
170 log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) | 170 ## log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) |
171 self.L = -T.sum( self.x * (log_sigmoid) \ | 171 ## self.L = -T.sum( self.x * (log_sigmoid) \ |
172 + (1.0-self.x) * (log_1_sigmoid), axis=1 ) | 172 ## + (1.0-self.x) * (log_1_sigmoid), axis=1 ) |
173 | 173 |
174 # I added this epsilon to avoid getting log(0) and 1/0 in grad | 174 # I added this epsilon to avoid getting log(0) and 1/0 in grad |
175 # This means conceptually that there'd be no probability of 0, but that | 175 # This means conceptually that there'd be no probability of 0, but that |
176 # doesn't seem to me as important (maybe I'm wrong?). | 176 # doesn't seem to me as important (maybe I'm wrong?). |
177 #eps = 0.00000001 | 177 #eps = 0.00000001 |
181 # note : L is now a vector, where each element is the cross-entropy cost | 181 # note : L is now a vector, where each element is the cross-entropy cost |
182 # of the reconstruction of the corresponding example of the | 182 # of the reconstruction of the corresponding example of the |
183 # minibatch. We need to compute the average of all these to get | 183 # minibatch. We need to compute the average of all these to get |
184 # the cost of the minibatch | 184 # the cost of the minibatch |
185 | 185 |
186 ## #Or use a Tanh everything is always between 0 and 1, the range is | 186 #Or use a Tanh everything is always between 0 and 1, the range is |
187 ## #changed so it remain the same as when sigmoid is used | 187 #changed so it remain the same as when sigmoid is used |
188 ## self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0 | 188 self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0 |
189 ## | 189 |
190 ## z_a = T.dot(self.y, self.W_prime) + self.b_prime | 190 z_a = T.dot(self.y, self.W_prime) + self.b_prime |
191 ## self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0 | 191 self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0 |
192 ## #To ensure to do not have a log(0) operation | 192 #To ensure to do not have a log(0) operation |
193 ## if self.z <= 0: | 193 if self.z <= 0: |
194 ## self.z = 0.000001 | 194 self.z = 0.000001 |
195 ## if self.z >= 1: | 195 if self.z >= 1: |
196 ## self.z = 0.999999 | 196 self.z = 0.999999 |
197 ## | 197 |
198 ## self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) | 198 self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) |
199 | 199 |
200 self.cost = T.mean(self.L) | 200 self.cost = T.mean(self.L) |
201 | 201 |
202 self.params = [ self.W, self.b, self.b_prime ] | 202 self.params = [ self.W, self.b, self.b_prime ] |
203 | 203 |
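The first hunk above comments out the sigmoid encoder/decoder and its log-sigmoid cross-entropy, and rebuilds the reconstruction cost from a rescaled tanh, (tanh(a)+1)/2, so that both the hidden code and the reconstruction stay in (0, 1). The sketch below is a standalone, hedged reconstruction of that cost, not code from the repository: the sizes, the initialization, and the use of T.clip as the log(0) guard (in place of the Python if statements in the revision) are illustrative assumptions.

```python
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Illustrative sizes and names only; W, b, b_prime mirror the dA attributes.
n_visible, n_hidden = 784, 500
corruption_level = 0.1
rng = numpy.random.RandomState(1234)
theano_rng = RandomStreams(rng.randint(2 ** 30))

x = T.matrix('x')
W = theano.shared(numpy.asarray(
        rng.uniform(low=-0.1, high=0.1, size=(n_visible, n_hidden)),
        dtype=theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(n_hidden, dtype=theano.config.floatX), name='b')
b_prime = theano.shared(numpy.zeros(n_visible, dtype=theano.config.floatX),
                        name='b_prime')
W_prime = W.T  # tied weights, as in the stacked dA

# Corrupt the input, as the dA does before encoding.
tilde_x = theano_rng.binomial(x.shape, 1, 1 - corruption_level,
                              dtype=theano.config.floatX) * x

# Rescaled tanh keeps the code and the reconstruction in (0, 1).
y = (T.tanh(T.dot(tilde_x, W) + b) + 1.0) / 2.0
z = (T.tanh(T.dot(y, W_prime) + b_prime) + 1.0) / 2.0

# Clamp away from 0 and 1 so the cross-entropy never evaluates log(0).
# (T.clip is a symbolic stand-in for the Python if statements in the revision.)
z = T.clip(z, 0.000001, 0.999999)

# Per-example cross-entropy, then the minibatch mean, as in the changeset.
L = -T.sum(x * T.log(z) + (1.0 - x) * T.log(1.0 - z), axis=1)
cost = T.mean(L)
```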
239 #index = T.lscalar() # index to a [mini]batch | 239 #index = T.lscalar() # index to a [mini]batch |
240 self.x = T.matrix('x') # the data is presented as rasterized images | 240 self.x = T.matrix('x') # the data is presented as rasterized images |
241 self.y = T.ivector('y') # the labels are presented as 1D vector of | 241 self.y = T.ivector('y') # the labels are presented as 1D vector of |
242 # [int] labels | 242 # [int] labels |
243 self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate | 243 self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate |
244 self.pretrain_lr = T.fscalar('pretrain_lr') #To get a dynamic pretrain learning rate | |
244 | 245 |
245 for i in xrange( self.n_layers ): | 246 for i in xrange( self.n_layers ): |
246 # construct the sigmoidal layer | 247 # construct the sigmoidal layer |
247 | 248 |
248 # the size of the input is either the number of hidden units of | 249 # the size of the input is either the number of hidden units of |
259 layer_input = self.x | 260 layer_input = self.x |
260 else: | 261 else: |
261 layer_input = self.layers[-1].output | 262 layer_input = self.layers[-1].output |
262 #We have to choose between sigmoidal layer or tanh layer ! | 263 #We have to choose between sigmoidal layer or tanh layer ! |
263 | 264 |
264 layer = SigmoidalLayer(rng, layer_input, input_size, | 265 ## layer = SigmoidalLayer(rng, layer_input, input_size, |
266 ## hidden_layers_sizes[i] ) | |
267 | |
268 layer = TanhLayer(rng, layer_input, input_size, | |
265 hidden_layers_sizes[i] ) | 269 hidden_layers_sizes[i] ) |
266 | |
267 ## layer = TanhLayer(rng, layer_input, input_size, | |
268 ## hidden_layers_sizes[i] ) | |
269 # add the layer to the | 270 # add the layer to the |
270 self.layers += [layer] | 271 self.layers += [layer] |
271 self.params += layer.params | 272 self.params += layer.params |
272 | 273 |
273 # Construct a denoising autoencoder that shared weights with this | 274 # Construct a denoising autoencoder that shared weights with this |
283 # compute gradients of layer parameters | 284 # compute gradients of layer parameters |
284 gparams = T.grad(dA_layer.cost, dA_layer.params) | 285 gparams = T.grad(dA_layer.cost, dA_layer.params) |
285 # compute the list of updates | 286 # compute the list of updates |
286 updates = {} | 287 updates = {} |
287 for param, gparam in zip(dA_layer.params, gparams): | 288 for param, gparam in zip(dA_layer.params, gparams): |
288 updates[param] = param - gparam * pretrain_lr | 289 updates[param] = param - gparam * self.pretrain_lr |
289 | 290 |
290 # create a function that trains the dA | 291 # create a function that trains the dA |
291 update_fn = theano.function([self.x], dA_layer.cost, \ | 292 update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, \ |
292 updates = updates)#, | 293 updates = updates)#, |
293 # givens = { | 294 # givens = { |
294 # self.x : ensemble}) | 295 # self.x : ensemble}) |
295 # collect this function into a list | 296 # collect this function into a list |
296 #update_fn = theano.function([index], dA_layer.cost, \ | 297 #update_fn = theano.function([index], dA_layer.cost, \ |
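The second hunk turns the pretraining learning rate into a symbolic input, self.pretrain_lr, used in the parameter updates and passed as an argument of the compiled function, so its value can change from one call to the next. A possible driver loop is sketched below; pretrain_fns, train_batches, pretraining_epochs, and the inverse-time decay schedule are assumptions for illustration, not code from this changeset.

```python
# Hypothetical placeholders: in the real script these would come from the
# stacked dA object and the dataset iterator.
pretrain_fns = []          # e.g. one compiled update_fn per dA layer
train_batches = []         # e.g. list of (batch_size x n_visible) arrays
pretraining_epochs = 10    # assumed number of passes per layer

# Assumed decreasing schedule for the pretraining learning rate.
initial_lr = 0.01   # assumed starting rate
decay = 1e-3        # assumed decay constant
n_updates = 0

for layer_fn in pretrain_fns:
    for epoch in xrange(pretraining_epochs):
        for x_batch in train_batches:
            # update_fn now takes (minibatch, learning_rate) because
            # self.pretrain_lr is an input of the Theano function.
            lr = initial_lr / (1.0 + decay * n_updates)
            cost = layer_fn(x_batch, lr)
            n_updates += 1
```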