comparison deep/stacked_dae/v_sylvain/stacked_dae.py @ 353:bc4464c0894c

Added a feature to allow a decreasing learning rate during pretraining
author SylvainPL <sylvain.pannetier.lebeuf@umontreal.ca>
date Wed, 21 Apr 2010 14:51:14 -0400
parents 799ad23a161f
children 14b28e43ce4e
comparing 352:cfb79f9fd1a4 with 353:bc4464c0894c
@@ -151,27 +151,27 @@
 self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x
 # Equation (2)
 # note : y is stored as an attribute of the class so that it can be
 # used later when stacking dAs.
 
-self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
-
-# Equation (3)
-#self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
-# Equation (4)
-# note : we sum over the size of a datapoint; if we are using minibatches,
-#        L will be a vector, with one entry per example in minibatch
-#self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
-#self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
-
-# bypassing z to avoid running to log(0)
-z_a = T.dot(self.y, self.W_prime) + self.b_prime
-log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
-# log(1-sigmoid(z_a))
-log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
-self.L = -T.sum( self.x * (log_sigmoid) \
-              + (1.0-self.x) * (log_1_sigmoid), axis=1 )
+## self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
+##
+## # Equation (3)
+## #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+## # Equation (4)
+## # note : we sum over the size of a datapoint; if we are using minibatches,
+## #        L will be a vector, with one entry per example in minibatch
+## #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+## #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
+##
+## # bypassing z to avoid running to log(0)
+## z_a = T.dot(self.y, self.W_prime) + self.b_prime
+## log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
+## # log(1-sigmoid(z_a))
+## log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
+## self.L = -T.sum( self.x * (log_sigmoid) \
+##               + (1.0-self.x) * (log_1_sigmoid), axis=1 )
 
 # I added this epsilon to avoid getting log(0) and 1/0 in grad
 # This means conceptually that there'd be no probability of 0, but that
 # doesn't seem to me as important (maybe I'm wrong?).
 #eps = 0.00000001
@@ -181,23 +181,23 @@
 # note : L is now a vector, where each element is the cross-entropy cost
 #        of the reconstruction of the corresponding example of the
 #        minibatch. We need to compute the average of all these to get
 #        the cost of the minibatch
 
-## #Or use a Tanh everything is always between 0 and 1, the range is
-## #changed so it remain the same as when sigmoid is used
-## self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0
-##
-## z_a = T.dot(self.y, self.W_prime) + self.b_prime
-## self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0
-## #To ensure to do not have a log(0) operation
-## if self.z <= 0:
-##     self.z = 0.000001
-## if self.z >= 1:
-##     self.z = 0.999999
-##
-## self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+#Or use a Tanh everything is always between 0 and 1, the range is
+#changed so it remain the same as when sigmoid is used
+self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0
+
+z_a = T.dot(self.y, self.W_prime) + self.b_prime
+self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0
+#To ensure to do not have a log(0) operation
+if self.z <= 0:
+    self.z = 0.000001
+if self.z >= 1:
+    self.z = 0.999999
+
+self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
 
 self.cost = T.mean(self.L)
 
 self.params = [ self.W, self.b, self.b_prime ]
 
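
The two reconstruction costs swapped in this hunk are close mathematical relatives. The branch that gets commented out works directly on the pre-activation z_a, using log(sigmoid(a)) = -log(1 + exp(-a)) and log(1 - sigmoid(a)) = -a - log(1 + exp(-a)), so the cross-entropy never has to evaluate log(0); the branch that becomes active rescales tanh into the unit interval, and (tanh(a) + 1) / 2 is the same function as sigmoid(2a). Below is a small NumPy sketch checking both identities; the variable names and test values are illustrative only, not taken from the repository.

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

a = numpy.array([-8.0, -2.0, 0.0, 2.0, 8.0])      # arbitrary pre-activations z_a

# Stable log-probabilities, as in the branch commented out above.
log_sigmoid   = -numpy.log1p(numpy.exp(-a))       # log(sigmoid(a))
log_1_sigmoid = -a - numpy.log1p(numpy.exp(-a))   # log(1 - sigmoid(a))

print(numpy.allclose(log_sigmoid,   numpy.log(sigmoid(a))))        # True
print(numpy.allclose(log_1_sigmoid, numpy.log(1.0 - sigmoid(a))))  # True

# Rescaled tanh, as in the branch this changeset activates:
# (tanh(a) + 1) / 2 coincides with sigmoid(2a).
print(numpy.allclose((numpy.tanh(a) + 1.0) / 2.0, sigmoid(2.0 * a)))  # True
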
@@ -239,10 +239,11 @@
 #index = T.lscalar() # index to a [mini]batch
 self.x = T.matrix('x') # the data is presented as rasterized images
 self.y = T.ivector('y') # the labels are presented as 1D vector of
                         # [int] labels
 self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate
+self.pretrain_lr = T.fscalar('pretrain_lr') #To get a dynamic pretrain learning rate
 
 for i in xrange( self.n_layers ):
     # construct the sigmoidal layer
 
     # the size of the input is either the number of hidden units of
@@ -259,15 +260,15 @@
 layer_input = self.x
 else:
 layer_input = self.layers[-1].output
 #We have to choose between sigmoidal layer or tanh layer !
 
-layer = SigmoidalLayer(rng, layer_input, input_size,
-                       hidden_layers_sizes[i] )
-
-## layer = TanhLayer(rng, layer_input, input_size,
-##                   hidden_layers_sizes[i] )
+## layer = SigmoidalLayer(rng, layer_input, input_size,
+##                        hidden_layers_sizes[i] )
+
+layer = TanhLayer(rng, layer_input, input_size,
+                  hidden_layers_sizes[i] )
 # add the layer to the
 self.layers += [layer]
 self.params += layer.params
 
 # Construct a denoising autoencoder that shared weights with this
@@ -283,14 +284,14 @@
 # compute gradients of layer parameters
 gparams = T.grad(dA_layer.cost, dA_layer.params)
 # compute the list of updates
 updates = {}
 for param, gparam in zip(dA_layer.params, gparams):
-    updates[param] = param - gparam * pretrain_lr
+    updates[param] = param - gparam * self.pretrain_lr
 
 # create a function that trains the dA
-update_fn = theano.function([self.x], dA_layer.cost, \
+update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, \
                             updates = updates)#,
 # givens = {
 #   self.x : ensemble})
 # collect this function into a list
 #update_fn = theano.function([index], dA_layer.cost, \
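
The effect of this hunk is that the pretraining learning rate is no longer frozen into the compiled graph: self.pretrain_lr is a symbolic T.fscalar, the SGD update multiplies the gradient by it, and the compiled update_fn now takes it as a second input, so the caller can pass a different (for instance, decayed) value on every minibatch. A minimal sketch of a driver loop in that spirit follows; the decay rule, the constants, and the stand-in update_fn are assumptions for illustration, not code from this changeset.

import numpy

initial_lr = 0.01      # assumed initial pretraining learning rate
decay      = 1e-3      # assumed decay constant

def current_lr(step):
    """Inverse-time decay: the rate shrinks as more minibatches are seen."""
    return numpy.float32(initial_lr / (1.0 + decay * step))

# Stand-in for the Theano function compiled above as
#   update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, updates=updates)
# Here it simply echoes the learning rate it was called with.
def update_fn(batch_x, pretrain_lr):
    return pretrain_lr

rng  = numpy.random.RandomState(1234)
step = 0
for epoch in range(2):
    for minibatch in range(5):
        batch_x = rng.rand(20, 32).astype('float32')    # fake minibatch
        cost = update_fn(batch_x, current_lr(step))     # lr chosen per call
        step += 1

Passing the rate as an input rather than recompiling means the graph is built once while the schedule is free to change from call to call.
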