ift6266: diff deep/stacked_dae/v_sylvain/stacked_dae.py @ 353:bc4464c0894c
Add a feature to allow a decreasing learning rate during pretraining
author      SylvainPL <sylvain.pannetier.lebeuf@umontreal.ca>
date        Wed, 21 Apr 2010 14:51:14 -0400
parents     799ad23a161f
children    14b28e43ce4e
--- a/deep/stacked_dae/v_sylvain/stacked_dae.py	Wed Apr 21 14:50:59 2010 -0400
+++ b/deep/stacked_dae/v_sylvain/stacked_dae.py	Wed Apr 21 14:51:14 2010 -0400
@@ -153,23 +153,23 @@
         # note : y is stored as an attribute of the class so that it can be
         #        used later when stacking dAs.
-        self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
-
-        # Equation (3)
-        #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
-        # Equation (4)
-        # note : we sum over the size of a datapoint; if we are using minibatches,
-        #        L will be a vector, with one entry per example in minibatch
-        #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
-        #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
-
-        # bypassing z to avoid running to log(0)
-        z_a = T.dot(self.y, self.W_prime) + self.b_prime
-        log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
-        # log(1-sigmoid(z_a))
-        log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
-        self.L = -T.sum( self.x * (log_sigmoid) \
-                        + (1.0-self.x) * (log_1_sigmoid), axis=1 )
+##        self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
+##
+##        # Equation (3)
+##        #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+##        # Equation (4)
+##        # note : we sum over the size of a datapoint; if we are using minibatches,
+##        #        L will be a vector, with one entry per example in minibatch
+##        #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+##        #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
+##
+##        # bypassing z to avoid running to log(0)
+##        z_a = T.dot(self.y, self.W_prime) + self.b_prime
+##        log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
+##        # log(1-sigmoid(z_a))
+##        log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
+##        self.L = -T.sum( self.x * (log_sigmoid) \
+##                        + (1.0-self.x) * (log_1_sigmoid), axis=1 )
 
         # I added this epsilon to avoid getting log(0) and 1/0 in grad
         # This means conceptually that there'd be no probability of 0, but that
@@ -183,19 +183,19 @@
         #        minibatch. We need to compute the average of all these to get
         #        the cost of the minibatch
-##        #Or use a Tanh everything is always between 0 and 1, the range is
-##        #changed so it remain the same as when sigmoid is used
-##        self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0
-##
-##        z_a = T.dot(self.y, self.W_prime) + self.b_prime
-##        self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0
-##        #To ensure to do not have a log(0) operation
-##        if self.z <= 0:
-##            self.z = 0.000001
-##        if self.z >= 1:
-##            self.z = 0.999999
-##
-##        self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+        #Or use a Tanh everything is always between 0 and 1, the range is
+        #changed so it remain the same as when sigmoid is used
+        self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0
+
+        z_a = T.dot(self.y, self.W_prime) + self.b_prime
+        self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0
+        #To ensure to do not have a log(0) operation
+        if self.z <= 0:
+            self.z = 0.000001
+        if self.z >= 1:
+            self.z = 0.999999
+
+        self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
 
         self.cost = T.mean(self.L)
@@ -241,6 +241,7 @@
         self.y = T.ivector('y') # the labels are presented as 1D vector of
                                 # [int] labels
         self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate
+        self.pretrain_lr = T.fscalar('pretrain_lr') #To get a dynamic pretrain learning rate
 
         for i in xrange( self.n_layers ):
             # construct the sigmoidal layer
@@ -261,11 +262,11 @@
                 layer_input = self.layers[-1].output
 
             #We have to choose between sigmoidal layer or tanh layer !
-            layer = SigmoidalLayer(rng, layer_input, input_size,
-                                   hidden_layers_sizes[i] )
+##            layer = SigmoidalLayer(rng, layer_input, input_size,
+##                                   hidden_layers_sizes[i] )
 
-##            layer = TanhLayer(rng, layer_input, input_size,
-##                              hidden_layers_sizes[i] )
+            layer = TanhLayer(rng, layer_input, input_size,
+                              hidden_layers_sizes[i] )
             # add the layer to the
             self.layers += [layer]
             self.params += layer.params
@@ -285,10 +286,10 @@
             # compute the list of updates
             updates = {}
             for param, gparam in zip(dA_layer.params, gparams):
-                updates[param] = param - gparam * pretrain_lr
+                updates[param] = param - gparam * self.pretrain_lr
 
             # create a function that trains the dA
-            update_fn = theano.function([self.x], dA_layer.cost, \
+            update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, \
                 updates = updates)#,
             #     givens = {
             #         self.x : ensemble})
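
The effect of this changeset is that each layer's pretraining function now takes the learning rate as a runtime input instead of baking in a constant, so the caller can lower it from one epoch to the next. The sketch below is not the repository's code: it is a minimal, self-contained illustration under assumed names (n_in, n_hidden, initial_lr, decay, train_x, and a tied W.T decoder are all hypothetical). It shows the same pattern as the diff: a tanh code rescaled to (0, 1), a reconstruction clipped away from 0 and 1 before the log, and a theano.function compiled with the learning rate as an extra input.

import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(1234)
x = T.matrix('x')
pretrain_lr = T.fscalar('pretrain_lr')  # learning rate is an input, not a constant

# Hypothetical sizes and parameters for one denoising-autoencoder-like layer
# (input corruption is omitted to keep the sketch short).
n_in, n_hidden = 784, 500
W = theano.shared(numpy.asarray(rng.uniform(-0.1, 0.1, (n_in, n_hidden)),
                                dtype=theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(n_hidden, dtype=theano.config.floatX), name='b')
b_prime = theano.shared(numpy.zeros(n_in, dtype=theano.config.floatX), name='b_prime')

# tanh code rescaled to (0, 1), as in the layer above; decoder uses tied weights W.T
y = (T.tanh(T.dot(x, W) + b) + 1.0) / 2.0
z = (T.tanh(T.dot(y, W.T) + b_prime) + 1.0) / 2.0
z = T.clip(z, 0.000001, 0.999999)  # keep log() away from 0 and 1
cost = T.mean(-T.sum(x * T.log(z) + (1 - x) * T.log(1 - z), axis=1))

params = [W, b, b_prime]
gparams = T.grad(cost, params)
updates = dict((p, p - g * pretrain_lr) for p, g in zip(params, gparams))

# The learning rate is a second input, mirroring [self.x, self.pretrain_lr] in the diff.
update_fn = theano.function([x, pretrain_lr], cost, updates=updates)

# Drive pretraining with a rate that decays across epochs.
train_x = numpy.asarray(rng.uniform(0., 1., (1000, n_in)), dtype=theano.config.floatX)
initial_lr, decay = 0.1, 0.95
for epoch in range(10):
    lr = numpy.float32(initial_lr * decay ** epoch)
    for start in range(0, len(train_x), 20):
        c = update_fn(train_x[start:start + 20], lr)

Passing the rate as a function input keeps a single compiled function per layer; an alternative would be a shared variable updated between epochs, but the explicit input mirrors what this diff does with self.pretrain_lr.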