comparison deep/stacked_dae/stacked_dae.py @ 275:7b4507295eba

merge
author Xavier Glorot <glorotxa@iro.umontreal.ca>
date Mon, 22 Mar 2010 10:20:10 -0400
parents acb942530923 c8fe09a65039
children
comparison: 274:44409b6652aa -> 275:7b4507295eba

diff -r 44409b6652aa -r 7b4507295eba deep/stacked_dae/stacked_dae.py
@@ -125,39 +125,42 @@
         # third argument is the probability of success of any trial
         #
         # this will produce an array of 0s and 1s where 1 has a
         # probability of 1 - ``corruption_level`` and 0 with
         # ``corruption_level``
-        self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x
+        self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x
         # Equation (2)
         # note : y is stored as an attribute of the class so that it can be
         #        used later when stacking dAs.
         self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
         # Equation (3)
-        self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+        #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
         # Equation (4)
         # note : we sum over the size of a datapoint; if we are using minibatches,
         #        L will be a vector, with one entry per example in minibatch
         #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
         #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
 
         # bypassing z to avoid running to log(0)
-        #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime)
-        #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \
-        #        + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 )
+        z_a = T.dot(self.y, self.W_prime) + self.b_prime
+        log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
+        # log(1-sigmoid(z_a))
+        log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
+        self.L = -T.sum( self.x * (log_sigmoid) \
+                        + (1.0-self.x) * (log_1_sigmoid), axis=1 )
 
         # I added this epsilon to avoid getting log(0) and 1/0 in grad
         # This means conceptually that there'd be no probability of 0, but that
         # doesn't seem to me as important (maybe I'm wrong?).
-        eps = 0.00000001
-        eps_1 = 1-eps
-        self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
-                        + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
+        #eps = 0.00000001
+        #eps_1 = 1-eps
+        #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
+        #        + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
         # note : L is now a vector, where each element is the cross-entropy cost
         #        of the reconstruction of the corresponding example of the
         #        minibatch. We need to compute the average of all these to get
         #        the cost of the minibatch
         self.cost = T.mean(self.L)
 
         self.params = [ self.W, self.b, self.b_prime ]
 
 
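The hunk above replaces the epsilon-guarded cross-entropy with one written directly from the pre-sigmoid activation z_a, relying on the identities log(sigmoid(a)) = -log(1 + exp(-a)) and log(1 - sigmoid(a)) = -a - log(1 + exp(-a)). A minimal numpy sketch, not part of the repository, showing that the identities hold and why forming z first eventually hits log(0):

import numpy

a = numpy.array([-5.0, 0.0, 5.0, 40.0])   # pre-sigmoid activations; 40. saturates float64
z = 1.0 / (1.0 + numpy.exp(-a))           # sigmoid(a); the last entry rounds to exactly 1.0

# terms computed from z, as the old epsilon-guarded code effectively did
with numpy.errstate(divide='ignore'):
    naive_log_z = numpy.log(z)
    naive_log_1_z = numpy.log(1.0 - z)    # -inf at a = 40

# terms computed from the activation itself, as the new code does
log_sigmoid = -numpy.log1p(numpy.exp(-a))        # log(sigmoid(a))
log_1_sigmoid = -a - numpy.log1p(numpy.exp(-a))  # log(1 - sigmoid(a))

print(numpy.allclose(naive_log_z[:3], log_sigmoid[:3]))      # True: identical away from saturation
print(numpy.allclose(naive_log_1_z[:3], log_1_sigmoid[:3]))  # True
print(naive_log_1_z[3], log_1_sigmoid[3])                    # -inf vs. -40.0: no epsilon needed
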
@@ -164,9 +167,9 @@
 class SdA(object):
-    def __init__(self, train_set_x, train_set_y, batch_size, n_ins, 
+    def __init__(self, batch_size, n_ins, 
                  hidden_layers_sizes, n_outs, 
-                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+                 corruption_levels, rng, pretrain_lr, finetune_lr):
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
 
         update_locals(self, locals())
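With the new signature the training data, labels and input_divider are no longer bound at construction time; the SdA only receives architecture and learning-rate hyperparameters. A hypothetical instantiation under the new interface (all argument values below are illustrative, not taken from this revision):

import numpy

rng = numpy.random.RandomState(1234)

# illustrative hyperparameters only; the class itself imposes no particular sizes
sda = SdA(batch_size=20, n_ins=32 * 32,
          hidden_layers_sizes=[1000, 1000, 1000], n_outs=62,
          corruption_levels=[0.2, 0.2, 0.2], rng=rng,
          pretrain_lr=0.01, finetune_lr=0.01)
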
@@ -185,21 +188,18 @@
         print "corruption_levels", corruption_levels
         print "n_ins", n_ins
         print "n_outs", n_outs
         print "pretrain_lr", pretrain_lr
         print "finetune_lr", finetune_lr
-        print "input_divider", input_divider
         print "----"
-
-        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
 
         if len(hidden_layers_sizes) < 1 :
             raiseException (' You must have at least one hidden layer ')
 
 
         # allocate symbolic variables for the data
-        index = T.lscalar()    # index to a [mini]batch 
+        #index = T.lscalar()    # index to a [mini]batch 
         self.x = T.matrix('x')  # the data is presented as rasterized images
         self.y = T.ivector('y') # the labels are presented as 1D vector of 
                                 # [int] labels
 
         for i in xrange( self.n_layers ):
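Commenting out index reflects the switch from the givens-based minibatch pattern to feeding minibatches straight into the compiled functions. A toy sketch of the two styles, independent of this class and assuming Theano is installed:

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
out = T.sum(x ** 2)

data = numpy.random.rand(100, 5).astype(theano.config.floatX)
shared_data = theano.shared(data)
batch_size = 10

# old style: the function takes a minibatch index and slices a shared dataset via givens
index = T.lscalar('index')
f_givens = theano.function([index], out,
        givens={x: shared_data[index * batch_size:(index + 1) * batch_size]})

# new style: the function takes the minibatch itself as its input
f_direct = theano.function([x], out)

print(f_givens(0))
print(f_direct(data[0:batch_size]))   # same value as above
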
@@ -242,14 +242,19 @@
             updates = {}
             for param, gparam in zip(dA_layer.params, gparams):
                 updates[param] = param - gparam * pretrain_lr
 
             # create a function that trains the dA
-            update_fn = theano.function([index], dA_layer.cost, \
-                updates = updates,
-                givens = {
-                    self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            update_fn = theano.function([self.x], dA_layer.cost, \
+                updates = updates)#,
+            #    givens = {
+            #        self.x : ensemble})
+            # collect this function into a list
+            #update_fn = theano.function([index], dA_layer.cost, \
+            #    updates = updates,
+            #    givens = {
+            #        self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
 
         # We now need to add a logistic layer on top of the MLP
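Each compiled pretraining function therefore now takes the minibatch itself instead of a minibatch index, and any scaling that shared_divider used to apply must be done by the caller. A hypothetical driver loop under those assumptions (sda, train_set_x, batch_size, pretraining_epochs and input_divider are the caller's objects, not part of this file):

import numpy

n_train_batches = train_set_x.shape[0] // batch_size

for layer_idx, pretrain_fn in enumerate(sda.pretrain_functions):
    for epoch in range(pretraining_epochs):
        costs = []
        for b in range(n_train_batches):
            # batch is assumed to already have dtype theano.config.floatX
            batch = train_set_x[b * batch_size:(b + 1) * batch_size]
            # the scaling previously done through shared_divider now happens here
            costs.append(pretrain_fn(batch / input_divider))
        print('layer %i, epoch %i, mean cost %f' % (layer_idx, epoch, numpy.mean(costs)))
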
@@ -268,15 +273,15 @@
         # compute list of updates
         updates = {}
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
 
-        self.finetune = theano.function([index], cost, 
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+        self.finetune = theano.function([self.x,self.y], cost, 
+                updates = updates)#,
+        #        givens = {
+        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
         # minibatch given by self.x and self.y
 
         self.errors = self.logLayer.errors(self.y)
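Similarly, self.finetune now expects the minibatch data and labels directly, while self.errors remains a symbolic expression. A hedged usage sketch; sda, x_batch, y_batch and the validation arrays are placeholders, and compiling self.errors into a callable is this sketch's assumption rather than something this revision does:

import theano

# x_batch has dtype theano.config.floatX, y_batch has dtype int32
finetune_cost = sda.finetune(x_batch, y_batch)

# wrap the symbolic error count/rate into a function for held-out evaluation
test_model = theano.function([sda.x, sda.y], sda.errors)
print(test_model(valid_x_batch, valid_y_batch))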