comparison baseline/log_reg/log_reg.py @ 198:5d88ed99c0af

Modify the log_reg.py tutorial code to use the datasets module.
author Arnaud Bergeron <abergeron@gmail.com>
date Tue, 02 Mar 2010 18:16:49 -0500
parents d37c944133c3
children 777f48ba30df
--- baseline/log_reg/log_reg.py	197:9116cfe8e4ab
+++ baseline/log_reg/log_reg.py	198:5d88ed99c0af
@@ -33,15 +33,15 @@
     Christopher M. Bishop, section 4.3.2
 
 """
 __docformat__ = 'restructedtext en'
 
-import numpy, time, cPickle, gzip
+import numpy, time
 
 import theano
 import theano.tensor as T
-
+from ift6266 import datasets
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
 
     The logistic regression is fully described by a weight matrix :math:`W`
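
Note: the only thing the rest of this diff assumes about the new import is that a dataset object from ift6266.datasets exposes train/valid/test methods which take a batch size and yield (x, y) minibatches as arrays (see the training loop further down). The datasets module itself is not part of this changeset; a minimal stand-in with that interface, hypothetical and for illustration only, could look like:

# Hypothetical stand-in for an ift6266.datasets object; it only illustrates the
# minibatch-iterator interface the modified log_reg() relies on.
class ArrayDataset(object):
    def __init__(self, train_xy, valid_xy, test_xy):
        # each *_xy is a pair (examples, integer labels), e.g. numpy arrays
        self._splits = {'train': train_xy, 'valid': valid_xy, 'test': test_xy}

    def _iter(self, split, batch_size):
        x, y = self._splits[split]
        n = len(y) - len(y) % batch_size   # drop the last incomplete minibatch
        for start in range(0, n, batch_size):
            yield x[start:start + batch_size], y[start:start + batch_size]

    def train(self, batch_size):
        return self._iter('train', batch_size)

    def valid(self, batch_size):
        return self._iter('valid', batch_size)

    def test(self, batch_size):
        return self._iter('test', batch_size)
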
@@ -133,111 +133,16 @@
             # represents a mistake in prediction
             return T.mean( T.neq( self.y_pred, y ) )
         else:
             raise NotImplementedError()
 
-def shared_dataset( data_xy ):
-    """ Function that loads the dataset into shared variables
-
-    The reason we store our dataset in shared variables is to allow
-    Theano to copy it into the GPU memory (when code is run on GPU).
-    Since copying data into the GPU is slow, copying a minibatch every time
-    one is needed (the default behaviour if the data is not in a shared
-    variable) would lead to a large decrease in performance.
-    """
-    data_x, data_y = data_xy
-    shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
-    shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
-    # When storing data on the GPU it has to be stored as floats
-    # therefore we will store the labels as ``floatX`` as well
-    # (``shared_y`` does exactly that). But during our computations
-    # we need them as ints (we use labels as indices, and if they are
-    # floats it doesn't make sense) therefore instead of returning
-    # ``shared_y`` we will have to cast it to int. This little hack
-    # lets us get around this issue
-    return shared_x, T.cast( shared_y, 'int32' )
-
-def load_data_pkl_gz( dataset ):
-    ''' Loads the dataset
-
-    :type dataset: string
-    :param dataset: the path to the dataset (here MNIST)
-    '''
-
-    #--------------------------------------------------------------------------------------------------------------------
-    # Load Data
-    #--------------------------------------------------------------------------------------------------------------------
-
-
-    print '... loading data'
-
-    # Load the dataset
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    test_set_x, test_set_y = shared_dataset( test_set )
-    valid_set_x, valid_set_y = shared_dataset( valid_set )
-    train_set_x, train_set_y = shared_dataset( train_set )
-
-    rval = [ ( train_set_x, train_set_y ), ( valid_set_x, valid_set_y ), ( test_set_x, test_set_y ) ]
-    return rval
-
-##def load_data_ft( verbose = False,\
-##                  data_path = '/data/lisa/data/nist/by_class/'\
-##                  train_data = 'all/all_train_data.ft',\
-##                  train_labels = 'all/all_train_labels.ft',\
-##                  test_data = 'all/all_test_data.ft',\
-##                  test_labels = 'all/all_test_labels.ft'):
-##
-##    train_data_file = open(data_path + train_data)
-##    train_labels_file = open(data_path + train_labels)
-##    test_labels_file = open(data_path + test_data)
-##    test_data_file = open(data_path + test_labels)
-##
-##    raw_train_data = ft.read( train_data_file)
-##    raw_train_labels = ft.read(train_labels_file)
-##    raw_test_data = ft.read( test_labels_file)
-##    raw_test_labels = ft.read( test_data_file)
-##
-##    f.close()
-##    g.close()
-##    i.close()
-##    h.close()
-##
-##
-##    test_set_x, test_set_y = shared_dataset(test_set)
-##    valid_set_x, valid_set_y = shared_dataset(valid_set)
-##    train_set_x, train_set_y = shared_dataset(train_set)
-##
-##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
-##    return rval
-##    #create a validation set the same size as the test size
-##    #use the end of the training array for this purpose
-##    #discard the last remaining so we get a %batch_size number
-##    test_size=len(raw_test_labels)
-##    test_size = int(test_size/batch_size)
-##    test_size*=batch_size
-##    train_size = len(raw_train_data)
-##    train_size = int(train_size/batch_size)
-##    train_size*=batch_size
-##    validation_size =test_size
-##    offset = train_size-test_size
-##    if verbose == True:
-##        print 'train size = %d' %train_size
-##        print 'test size = %d' %test_size
-##        print 'valid size = %d' %validation_size
-##        print 'offset = %d' %offset
-##
-##
-
 #--------------------------------------------------------------------------------------------------------------------
 # MAIN
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-             dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \
+             dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10, \
              patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
 
     """
     Demonstrate stochastic gradient descent optimization of a log-linear
     model
@@ -252,13 +157,12 @@
     :param nb_max_examples: maximal number of training examples to process
 
     :type batch_size: int
     :param batch_size: size of the minibatch
 
-    :type dataset_name: string
-    :param dataset_name: the path of the MNIST dataset file from
-                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+    :type dataset: dataset
+    :param dataset: a dataset instance from ift6266.datasets
 
     :type image_size: int
     :param image_size: size of the input image in pixels (width * height)
 
     :type nb_class: int
@@ -273,21 +177,10 @@
     :type improvement_threshold: float
     :param improvement_threshold: a relative improvement of this much is considered significant
 
 
     """
-    datasets = load_data_pkl_gz( dataset_name )
-
-    train_set_x, train_set_y = datasets[0]
-    valid_set_x, valid_set_y = datasets[1]
-    test_set_x , test_set_y = datasets[2]
-
-    # compute number of minibatches for training, validation and testing
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches = test_set_x.value.shape[0] / batch_size
-
     #--------------------------------------------------------------------------------------------------------------------
     # Build actual model
     #--------------------------------------------------------------------------------------------------------------------
 
     print '... building the model'
@@ -306,21 +199,15 @@
     # the model in symbolic format
     cost = classifier.negative_log_likelihood( y )
 
     # compiling a Theano function that computes the mistakes that are made by
     # the model on a minibatch
-    test_model = theano.function( inputs = [ index ],
-            outputs = classifier.errors( y ),
-            givens = {
-                x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
-
-    validate_model = theano.function( inputs = [ index ],
-            outputs = classifier.errors( y ),
-            givens = {
-                x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    test_model = theano.function( inputs = [ x, y ],
+            outputs = classifier.errors( y ))
+
+    validate_model = theano.function( inputs = [ x, y ],
+            outputs = classifier.errors( y ))
 
     # compute the gradient of cost with respect to theta = ( W, b )
     g_W = T.grad( cost = cost, wrt = classifier.W )
     g_b = T.grad( cost = cost, wrt = classifier.b )
 
@@ -329,16 +216,13 @@
                classifier.b: classifier.b - learning_rate * g_b}
 
     # compiling a Theano function `train_model` that returns the cost, but at
     # the same time updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function( inputs = [ index ],
+    train_model = theano.function( inputs = [ x, y ],
            outputs = cost,
-            updates = updates,
-            givens = {
-                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+            updates = updates)
 
     #--------------------------------------------------------------------------------------------------------------------
     # Train model
     #--------------------------------------------------------------------------------------------------------------------
 
@@ -347,42 +231,42 @@
     patience = 5000               # look at this many examples regardless
     patience_increase = 2         # wait this much longer when a new best is
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency = min( n_train_batches, patience * 0.5 )
+    validation_frequency = patience * 0.5
                                   # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
 
     best_params = None
     best_validation_loss = float('inf')
     test_score = 0.
     start_time = time.clock()
 
     done_looping = False
-    n_epochs = nb_max_examples / train_set_x.value.shape[0]
+    n_iters = nb_max_examples / batch_size
     epoch = 0
-
-    while ( epoch < n_epochs ) and ( not done_looping ):
+    iter = 0
+
+    while ( iter < n_iters ) and ( not done_looping ):
 
         epoch = epoch + 1
-        for minibatch_index in xrange( n_train_batches ):
+        for x, y in dataset.train(batch_size):
 
-            minibatch_avg_cost = train_model( minibatch_index )
+            minibatch_avg_cost = train_model( x, y )
             # iteration number
-            iter = epoch * n_train_batches + minibatch_index
+            iter += 1
 
-            if ( iter + 1 ) % validation_frequency == 0:
+            if iter % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
+                validation_losses = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ]
                 this_validation_loss = numpy.mean( validation_losses )
 
-                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                    ( epoch, minibatch_index + 1, n_train_batches, \
-                      this_validation_loss*100. ) )
+                print('epoch %i, iter %i, validation error %f %%' % \
+                    ( epoch, iter, this_validation_loss*100. ) )
 
 
                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
                     #improve patience if loss improvement is good enough
@@ -391,16 +275,16 @@
                     patience = max( patience, iter * patience_increase )
 
                     best_validation_loss = this_validation_loss
                     # test it on the test set
 
-                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                     test_score = numpy.mean(test_losses)
 
-                    print((' epoch %i, minibatch %i/%i, test error of best '
+                    print((' epoch %i, iter %i, test error of best '
                            'model %f %%') % \
-                        (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+                        (epoch, iter, test_score*100.))
 
             if patience <= iter :
                 done_looping = True
                 break
 
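
For reference, a short usage sketch of the updated entry point. The import paths below are assumptions inferred from the file location (baseline/log_reg/log_reg.py) and from the new default argument datasets.nist_digits; the changeset itself only touches log_reg.py.

# Usage sketch, not part of the changeset; module paths are assumed and may
# differ in the actual repository layout.
from ift6266 import datasets
from baseline.log_reg.log_reg import log_reg

# Train on NIST digits with the new defaults: 32x32 inputs, 10 classes,
# minibatches of 50, and early stopping controlled by `patience`.
log_reg(learning_rate=0.13,
        nb_max_examples=1000000,
        batch_size=50,
        dataset=datasets.nist_digits,
        image_size=32 * 32,
        nb_class=10)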