comparison baseline/log_reg/log_reg.py @ 169:d37c944133c3

directory name change
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 14:24:11 -0500
parents baseline_algorithms/log_reg/log_reg.py@d1bb6e06497a
children 5d88ed99c0af
1 """
2 This tutorial introduces logistic regression using Theano and stochastic
3 gradient descent.
4
5 Logistic regression is a probabilistic, linear classifier. It is parametrized
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 done by projecting data points onto a set of hyperplanes, the distance to
8 which is used to determine a class membership probability.
9
10 Mathematically, this can be written as:
11
12 .. math::
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15
16
17 The model's prediction is then obtained by taking the argmax of the vector
18 whose i'th element is P(Y=i|x) (a small numeric sketch follows the imports below).
19
20 .. math::
21
22 y_{pred} = argmax_i P(Y=i|x,W,b)
23
24
25 This tutorial presents a stochastic gradient descent optimization method
26 suitable for large datasets, and a conjugate gradient optimization method
27 that is suitable for smaller datasets.
28
29
30 References:
31
32 - textbooks: "Pattern Recognition and Machine Learning" -
33 Christopher M. Bishop, section 4.3.2
34
35 """
36 __docformat__ = 'restructuredtext en'
37
38 import numpy, time, cPickle, gzip
39
40 import theano
41 import theano.tensor as T
42
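# A minimal numeric sketch of the prediction rule described in the docstring
# above: P(Y=i|x, W, b) is the i-th softmax output of W x + b and the
# predicted class is its argmax. Illustration only, with made-up toy values;
# it is not used by the rest of the file.
def _softmax_prediction_sketch():
    W = numpy.array([[ 0.5, -0.3], [ 0.1,  0.8]])  # weights, shape (n_in, n_out) = (2, 2)
    b = numpy.array([ 0.0,  0.1])                  # biases, shape (n_out,)
    x = numpy.array([ 1.0,  2.0])                  # a single datapoint with n_in = 2
    scores = numpy.dot(x, W) + b                   # W x + b, one score per class
    p_y_given_x = numpy.exp(scores) / numpy.sum(numpy.exp(scores))  # softmax
    y_pred = numpy.argmax(p_y_given_x)             # most probable class
    return p_y_given_x, y_pred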
43
44 class LogisticRegression(object):
45 """Multi-class Logistic Regression Class
46
47 The logistic regression is fully described by a weight matrix :math:`W`
48 and bias vector :math:`b`. Classification is done by projecting data
49 points onto a set of hyperplanes, the distance to which is used to
50 determine a class membership probability.
51 """
52
53
54 def __init__( self, input, n_in, n_out ):
55 """ Initialize the parameters of the logistic regression
56
57 :type input: theano.tensor.TensorType
58 :param input: symbolic variable that describes the input of the
59 architecture (one minibatch)
60
61 :type n_in: int
62 :param n_in: number of input units, the dimension of the space in
63 which the datapoints lie
64
65 :type n_out: int
66 :param n_out: number of output units, the dimension of the space in
67 which the labels lie
68
69 """
70
71 # initialize the weights W as a matrix of zeros with shape (n_in, n_out)
72 self.W = theano.shared( value = numpy.zeros(( n_in, n_out ), dtype = theano.config.floatX ),
73 name = 'W' )
74 # initialize the biases b as a vector of n_out zeros
75 self.b = theano.shared( value = numpy.zeros(( n_out, ), dtype = theano.config.floatX ),
76 name = 'b')
77
78
79 # compute vector of class-membership probabilities in symbolic form
80 self.p_y_given_x = T.nnet.softmax( T.dot( input, self.W ) + self.b )
81
82 # compute prediction as class whose probability is maximal in
83 # symbolic form
84 self.y_pred=T.argmax( self.p_y_given_x, axis =1 )
85
86 # parameters of the model
87 self.params = [ self.W, self.b ]
88
89
90 def negative_log_likelihood( self, y ):
91 """Return the mean of the negative log-likelihood of the prediction
92 of this model under a given target distribution.
93
94 .. math::
95
96 \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
97 \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|-1} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
98 \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
99
100 :type y: theano.tensor.TensorType
101 :param y: corresponds to a vector that gives for each example the
102 correct label
103
104 Note: we use the mean instead of the sum so that
105 the learning rate is less dependent on the batch size
106 """
107 # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
108 # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
109 # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
110 # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
111 # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
112 # i.e., the mean log-likelihood across the minibatch (a small numeric sketch of this indexing trick follows the class definition below).
113 return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )
114
115
116 def errors( self, y ):
117 """Return a float representing the number of errors in the minibatch
118 over the total number of examples of the minibatch ; zero one
119 loss over the size of the minibatch
120
121 :type y: theano.tensor.TensorType
122 :param y: corresponds to a vector that gives for each example the
123 correct label
124 """
125
126 # check if y has same dimension of y_pred
127 if y.ndim != self.y_pred.ndim:
128 raise TypeError( 'y should have the same shape as self.y_pred',
129 ( 'y', y.type, 'y_pred', self.y_pred.type ) )
130 # check if y is of the correct datatype
131 if y.dtype.startswith('int'):
132 # the T.neq operator returns a vector of 0s and 1s, where 1
133 # represents a mistake in prediction
134 return T.mean( T.neq( self.y_pred, y ) )
135 else:
136 raise NotImplementedError()
137
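# A small numeric sketch of the indexing trick used in negative_log_likelihood
# above: for a matrix LP of log-probabilities and a label vector y,
# LP[arange(n), y] picks, for each row i, the log-probability assigned to that
# example's correct class y[i]. Toy values only, not part of the model code.
def _nll_indexing_sketch():
    LP = numpy.log(numpy.array([[0.7, 0.2, 0.1],    # log-probabilities of 2 examples
                                [0.1, 0.3, 0.6]]))  # over 3 classes
    y = numpy.array([0, 2])                          # correct class of each example
    picked = LP[numpy.arange(y.shape[0]), y]         # [LP[0, y[0]], LP[1, y[1]]]
    return -numpy.mean(picked)                       # mean negative log-likelihood

# A minimal usage sketch of the class above (assuming a working Theano
# install): build it on a symbolic matrix and compile a function returning the
# predicted labels for a numpy array. The sizes below are arbitrary.
def _logistic_regression_usage_sketch():
    x = T.matrix('x')
    classifier = LogisticRegression( input = x, n_in = 4, n_out = 3 )
    predict = theano.function( inputs = [ x ], outputs = classifier.y_pred )
    toy_data = numpy.zeros( ( 2, 4 ), dtype = theano.config.floatX )  # two made-up datapoints
    return predict( toy_data )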
138 def shared_dataset( data_xy ):
139 """ Function that loads the dataset into shared variables
140
141 The reason we store our dataset in shared variables is to allow
142 Theano to copy it into the GPU memory (when code is run on GPU).
143 Since copying data into the GPU is slow, copying a minibatch every time
144 it is needed (the default behaviour if the data is not in a shared
145 variable) would lead to a large decrease in performance.
146 """
147 data_x, data_y = data_xy
148 shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
149 shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
150 # When storing data on the GPU it has to be stored as floats
151 # therefore we will store the labels as ``floatX`` as well
152 # (``shared_y`` does exactly that). But during our computations
153 # we need them as ints (we use labels as index, and if they are
154 # floats it doesn't make sense) therefore instead of returning
155 # ``shared_y`` we will have to cast it to int. This little hack
156 # lets us get around this issue.
157 return shared_x, T.cast( shared_y, 'int32' )
158
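# A small usage sketch of shared_dataset above (assuming a working Theano
# install): wrap a toy (data, labels) pair in shared variables. Note that the
# returned labels are a symbolic int32 cast of a float shared variable, which
# is why they can be sliced in the `givens` dictionaries further below.
def _shared_dataset_sketch():
    toy_x = numpy.random.rand( 6, 4 )            # 6 made-up examples with 4 features
    toy_y = numpy.array( [ 0, 1, 2, 0, 1, 2 ] )  # made-up integer labels
    shared_x, shared_y = shared_dataset( ( toy_x, toy_y ) )
    return shared_x, shared_y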
159 def load_data_pkl_gz( dataset ):
160 ''' Loads the dataset
161
162 :type dataset: string
163 :param dataset: the path to the dataset (here MNIST)
164 '''
165
166 #--------------------------------------------------------------------------------------------------------------------
167 # Load Data
168 #--------------------------------------------------------------------------------------------------------------------
169
170
171 print '... loading data'
172
173 # Load the dataset
174 f = gzip.open(dataset,'rb')
175 train_set, valid_set, test_set = cPickle.load(f)
176 f.close()
177
178 test_set_x, test_set_y = shared_dataset( test_set )
179 valid_set_x, valid_set_y = shared_dataset( valid_set )
180 train_set_x, train_set_y = shared_dataset( train_set )
181
182 rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ]
183 return rval
184
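# A minimal usage sketch of load_data_pkl_gz above, assuming mnist.pkl.gz has
# already been downloaded to the working directory; it returns three (x, y)
# pairs of shared variables, unpacked the same way as in log_reg() further
# below.
def _load_data_sketch():
    datasets = load_data_pkl_gz( 'mnist.pkl.gz' )
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    return train_set_x.value.shape, valid_set_x.value.shape, test_set_x.value.shape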
185 ##def load_data_ft( verbose = False,\
186 ## data_path = '/data/lisa/data/nist/by_class/',\
187 ## train_data = 'all/all_train_data.ft',\
188 ## train_labels = 'all/all_train_labels.ft',\
189 ## test_data = 'all/all_test_data.ft',\
190 ## test_labels = 'all/all_test_labels.ft'):
191 ##
192 ## train_data_file = open(data_path + train_data)
193 ## train_labels_file = open(data_path + train_labels)
194 ## test_data_file = open(data_path + test_data)
195 ## test_labels_file = open(data_path + test_labels)
196 ##
197 ## raw_train_data = ft.read( train_data_file )
198 ## raw_train_labels = ft.read( train_labels_file )
199 ## raw_test_data = ft.read( test_data_file )
200 ## raw_test_labels = ft.read( test_labels_file )
201 ##
202 ## train_data_file.close()
203 ## train_labels_file.close()
204 ## test_data_file.close()
205 ## test_labels_file.close()
206 ##
207 ##
208 ## test_set_x, test_set_y = shared_dataset(test_set)
209 ## valid_set_x, valid_set_y = shared_dataset(valid_set)
210 ## train_set_x, train_set_y = shared_dataset(train_set)
211 ##
212 ## rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
213 ## return rval
214 ## #create a validation set the same size as the test size
215 ## #use the end of the training array for this purpose
216 ## #discard the last remaining so we get a %batch_size number
217 ## test_size=len(raw_test_labels)
218 ## test_size = int(test_size/batch_size)
219 ## test_size*=batch_size
220 ## train_size = len(raw_train_data)
221 ## train_size = int(train_size/batch_size)
222 ## train_size*=batch_size
223 ## validation_size =test_size
224 ## offset = train_size-test_size
225 ## if verbose == True:
226 ## print 'train size = %d' %train_size
227 ## print 'test size = %d' %test_size
228 ## print 'valid size = %d' %validation_size
229 ## print 'offset = %d' %offset
230 ##
231 ##
232
233 #--------------------------------------------------------------------------------------------------------------------
234 # MAIN
235 #--------------------------------------------------------------------------------------------------------------------
236
237 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
238 dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \
239 patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
240
241 """
242 Demonstrate stochastic gradient descent optimization of a log-linear
243 model
244
245 This is demonstrated on MNIST.
246
247 :type learning_rate: float
248 :param learning_rate: learning rate used (factor for the stochastic
249 gradient)
250
251 :type nb_max_examples: int
252 :param nb_max_examples: maximal number of training examples to process (this determines the number of epochs)
253
254 :type batch_size: int
255 :param batch_size: size of the minibatch
256
257 :type dataset_name: string
258 :param dataset_name: the path of the MNIST dataset file from
259 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
260
261 :type image_size: int
262 :param image_size: size of the input image in pixels (width * height)
263
264 :type nb_class: int
265 :param nb_class: number of classes
266
267 :type patience: int
268 :param patience: look at this many examples regardless
269
270 :type patience_increase: int
271 :param patience_increase: wait this much longer when a new best is found
272
273 :type improvement_threshold: float
274 :param improvement_threshold: a relative improvement of this much is considered significant
275
276
277 """
278 datasets = load_data_pkl_gz( dataset_name )
279
280 train_set_x, train_set_y = datasets[0]
281 valid_set_x, valid_set_y = datasets[1]
282 test_set_x , test_set_y = datasets[2]
283
284 # compute number of minibatches for training, validation and testing
285 n_train_batches = train_set_x.value.shape[0] / batch_size
286 n_valid_batches = valid_set_x.value.shape[0] / batch_size
287 n_test_batches = test_set_x.value.shape[0] / batch_size
288
289 #--------------------------------------------------------------------------------------------------------------------
290 # Build actual model
291 #--------------------------------------------------------------------------------------------------------------------
292
293 print '... building the model'
294
295 # allocate symbolic variables for the data
296 index = T.lscalar( ) # index to a [mini]batch
297 x = T.matrix('x') # the data is presented as rasterized images
298 y = T.ivector('y') # the labels are presented as 1D vector of
299 # [int] labels
300
301 # construct the logistic regression class
302
303 classifier = LogisticRegression( input = x, n_in = image_size, n_out = nb_class )
304
305 # the cost we minimize during training is the negative log likelihood of
306 # the model in symbolic format
307 cost = classifier.negative_log_likelihood( y )
308
309 # compiling a Theano function that computes the mistakes that are made by
310 # the model on a minibatch
311 test_model = theano.function( inputs = [ index ],
312 outputs = classifier.errors( y ),
313 givens = {
314 x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
315 y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
316
317 validate_model = theano.function( inputs = [ index ],
318 outputs = classifier.errors( y ),
319 givens = {
320 x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
321 y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
322
323 # compute the gradient of cost with respect to theta = ( W, b )
324 g_W = T.grad( cost = cost, wrt = classifier.W )
325 g_b = T.grad( cost = cost, wrt = classifier.b )
326
327 # specify the parameter updates as a dictionary mapping each parameter to its value after one SGD step (a plain numpy sketch of such a step follows this function)
328 updates = { classifier.W: classifier.W - learning_rate * g_W,\
329 classifier.b: classifier.b - learning_rate * g_b}
330
331 # compiling a Theano function `train_model` that returns the cost and, at
332 # the same time, updates the parameters of the model based on the rules
333 # defined in `updates`
334 train_model = theano.function( inputs = [ index ],
335 outputs = cost,
336 updates = updates,
337 givens = {
338 x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
339 y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
340
341 #--------------------------------------------------------------------------------------------------------------------
342 # Train model
343 #--------------------------------------------------------------------------------------------------------------------
344
345 print '... training the model'
346 # early-stopping parameters: `patience`, `patience_increase` and
347 # `improvement_threshold` come from the function arguments above and are
348 # not re-assigned here, otherwise the values passed in would be ignored.
349 # patience: look at this many examples regardless;
350 # patience_increase: wait this much longer when a new best is found;
351 # improvement_threshold: a relative improvement of this much counts as significant
352 validation_frequency = min( n_train_batches, patience / 2 )
353 # go through this many
354 # minibatches before checking the network
355 # on the validation set; in this case we
356 # check every epoch
357
358 best_params = None
359 best_validation_loss = float('inf')
360 test_score = 0.
361 start_time = time.clock()
362
363 done_looping = False
364 n_epochs = nb_max_examples / train_set_x.value.shape[0]
365 epoch = 0
366
367 while ( epoch < n_epochs ) and ( not done_looping ):
368
369 epoch = epoch + 1
370 for minibatch_index in xrange( n_train_batches ):
371
372 minibatch_avg_cost = train_model( minibatch_index )
373 # iteration number (number of minibatches processed before this one)
374 iter = ( epoch - 1 ) * n_train_batches + minibatch_index
375
376 if ( iter + 1 ) % validation_frequency == 0:
377 # compute zero-one loss on validation set
378 validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
379 this_validation_loss = numpy.mean( validation_losses )
380
381 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
382 ( epoch, minibatch_index + 1,n_train_batches, \
383 this_validation_loss*100. ) )
384
385
386 # if we obtained the best validation score so far
387 if this_validation_loss < best_validation_loss:
388 #improve patience if loss improvement is good enough
389 if this_validation_loss < best_validation_loss * \
390 improvement_threshold :
391 patience = max( patience, iter * patience_increase )
392
393 best_validation_loss = this_validation_loss
394 # test it on the test set
395
396 test_losses = [test_model(i) for i in xrange(n_test_batches)]
397 test_score = numpy.mean(test_losses)
398
399 print((' epoch %i, minibatch %i/%i, test error of best '
400 'model %f %%') % \
401 (epoch, minibatch_index+1, n_train_batches,test_score*100.))
402
403 if patience <= iter :
404 done_looping = True
405 break
406
407 end_time = time.clock()
408 print(('Optimization complete with best validation score of %f %%, '
409 'with test performance %f %%') %
410 ( best_validation_loss * 100., test_score * 100.))
411 print ('The code ran for %f minutes' % ((end_time-start_time) / 60.))
412
413 return best_validation_loss, test_score, ( iter + 1 ) * batch_size, end_time - start_time # (validation_error, test_error, nb_exemples, time in seconds) as expected by jobman_log_reg below
414
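# An illustrative plain-numpy sketch of what one call to `train_model` above
# amounts to: a single stochastic gradient descent step
#     W <- W - learning_rate * dL/dW,   b <- b - learning_rate * dL/db,
# where L is the mean negative log-likelihood on one minibatch. The gradients
# are written out analytically here instead of being derived symbolically by
# Theano; all arguments are made-up toy arrays.
def _sgd_step_sketch( W, b, x_batch, y_batch, learning_rate = 0.13 ):
    scores = numpy.dot( x_batch, W ) + b                             # shape (batch, n_out)
    scores = scores - scores.max( axis = 1 )[ :, numpy.newaxis ]     # for numerical stability
    exp_scores = numpy.exp( scores )
    p = exp_scores / exp_scores.sum( axis = 1 )[ :, numpy.newaxis ]  # row-wise softmax
    one_hot = numpy.zeros_like( p )
    one_hot[ numpy.arange( y_batch.shape[0] ), y_batch ] = 1.0
    grad_scores = ( p - one_hot ) / x_batch.shape[0]                 # d(mean NLL)/d(scores)
    g_W = numpy.dot( x_batch.T, grad_scores )                        # d(mean NLL)/dW
    g_b = grad_scores.sum( axis = 0 )                                # d(mean NLL)/db
    return W - learning_rate * g_W, b - learning_rate * g_b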
415 if __name__ == '__main__':
416 log_reg()
417
418
419 def jobman_log_reg(state, channel):
420 (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate,\
421 nb_max_examples = state.nb_max_examples,\
422 batch_size = state.batch_size,\
423 dataset_name = state.dataset_name, \
424 image_size = state.image_size, \
425 nb_class = state.nb_class )
426
427 state.validation_error = validation_error
428 state.test_error = test_error
429 state.nb_exemples = nb_exemples
430 state.time = time
431 return channel.COMPLETE
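# A hedged sketch of the objects jobman_log_reg above expects; the real
# `state` and `channel` are supplied by the jobman scheduler, so the classes
# below are purely hypothetical stand-ins showing which attributes are read
# (the hyper-parameter values mirror the defaults of log_reg).
class _FakeState( object ):
    learning_rate = 0.13
    nb_max_examples = 1000000
    batch_size = 50
    dataset_name = 'mnist.pkl.gz'
    image_size = 28 * 28
    nb_class = 10

class _FakeChannel( object ):
    COMPLETE = 'COMPLETE'  # hypothetical value; jobman defines its own constant

# jobman_log_reg( _FakeState(), _FakeChannel() )  # would run a full training job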
432
433
434
435
436
437