ift6266 Mercurial repository
comparison: baseline/log_reg/log_reg.py @ 169:d37c944133c3
commit message: directory name change
author:   Dumitru Erhan <dumitru.erhan@gmail.com>
date:     Fri, 26 Feb 2010 14:24:11 -0500
parents:  baseline_algorithms/log_reg/log_reg.py@d1bb6e06497a
children: 5d88ed99c0af
comparing revisions 168:5e0e5f1860ec and 169:d37c944133c3

"""
This tutorial introduces logistic regression using Theano and stochastic
gradient descent.

Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.

Mathematically, this can be written as:

.. math::
  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The output of the model or prediction is then done by taking the argmax of
the vector whose i'th element is P(Y=i|x).

.. math::

  y_{pred} = argmax_i P(Y=i|x,W,b)
24 | |
25 This tutorial presents a stochastic gradient descent optimization method | |
26 suitable for large datasets, and a conjugate gradient optimization method | |
27 that is suitable for smaller datasets. | |
28 | |
29 | |
30 References: | |
31 | |
32 - textbooks: "Pattern Recognition and Machine Learning" - | |
33 Christopher M. Bishop, section 4.3.2 | |
34 | |
35 """ | |
__docformat__ = 'restructuredtext en'

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """


    def __init__( self, input, n_in, n_out ):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared( value = numpy.zeros(( n_in, n_out ), dtype = theano.config.floatX ),
                                name = 'W' )
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared( value = numpy.zeros(( n_out, ), dtype = theano.config.floatX ),
                                name = 'b' )


        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax( T.dot( input, self.W ) + self.b )

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax( self.p_y_given_x, axis = 1 )

        # parameters of the model
        self.params = [ self.W, self.b ]


    def negative_log_likelihood( self, y ):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}
            \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b))

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
        return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )


    def errors( self, y ):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch ; zero one
        loss over the size of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """

        # check if y has same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError( 'y should have the same shape as self.y_pred',
                             ( 'y', y.type, 'y_pred', self.y_pred.type ) )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean( T.neq( self.y_pred, y ) )
        else:
            raise NotImplementedError()

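# Illustrative usage sketch (not part of the original file): it shows how the
# LogisticRegression class above can be wired into a compiled Theano
# prediction function. The input size, class count and dummy batch of zeros
# are arbitrary assumptions made purely for the example.
def example_logistic_regression_usage():
    x = T.matrix('x')
    classifier = LogisticRegression( input = x, n_in = 28 * 28, n_out = 10 )

    # compile a function mapping a minibatch of images to predicted labels
    predict = theano.function( inputs = [ x ], outputs = classifier.y_pred )

    # with W and b initialized to zeros every class is equally likely, so the
    # argmax returns class 0 for this dummy batch of five blank images
    dummy_batch = numpy.zeros( ( 5, 28 * 28 ), dtype = theano.config.floatX )
    return predict( dummy_batch )
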
def shared_dataset( data_xy ):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch every time
    it is needed (the default behaviour if the data is not in a shared
    variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
    shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
    # When storing data on the GPU it has to be stored as floats
    # therefore we will store the labels as ``floatX`` as well
    # (``shared_y`` does exactly that). But during our computations
    # we need them as ints (we use labels as indices, and if they are
    # floats it doesn't make sense) therefore instead of returning
    # ``shared_y`` we will have to cast it to int. This little hack
    # lets us get around this issue.
    return shared_x, T.cast( shared_y, 'int32' )

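# Illustrative sketch (not part of the original file): calling shared_dataset
# on a tiny made-up dataset. The shapes and label values are arbitrary
# assumptions chosen only to show the (data, labels) tuple format and the
# int32 cast applied to the returned labels.
def example_shared_dataset_usage():
    toy_x = numpy.random.rand( 4, 784 )       # four fake 28*28 images
    toy_y = numpy.array( [ 0, 3, 7, 9 ] )     # four fake labels
    shared_x, casted_y = shared_dataset( ( toy_x, toy_y ) )
    # shared_x is a float shared variable; casted_y is a symbolic int32 view
    # of the float shared variable that actually stores the labels
    return shared_x, casted_y
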
def load_data_pkl_gz( dataset ):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    '''

    #--------------------------------------------------------------------------------------------------------------------
    # Load Data
    #--------------------------------------------------------------------------------------------------------------------


    print '... loading data'

    # Load the dataset
    f = gzip.open(dataset,'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    test_set_x, test_set_y = shared_dataset( test_set )
    valid_set_x, valid_set_y = shared_dataset( valid_set )
    train_set_x, train_set_y = shared_dataset( train_set )

    rval = [ ( train_set_x, train_set_y ), ( valid_set_x, valid_set_y ), ( test_set_x, test_set_y ) ]
    return rval

##def load_data_ft( verbose = False,\
##                  data_path = '/data/lisa/data/nist/by_class/'\
##                  train_data = 'all/all_train_data.ft',\
##                  train_labels = 'all/all_train_labels.ft',\
##                  test_data = 'all/all_test_data.ft',\
##                  test_labels = 'all/all_test_labels.ft'):
##
##    train_data_file = open(data_path + train_data)
##    train_labels_file = open(data_path + train_labels)
##    test_labels_file = open(data_path + test_data)
##    test_data_file = open(data_path + test_labels)
##
##    raw_train_data = ft.read( train_data_file)
##    raw_train_labels = ft.read(train_labels_file)
##    raw_test_data = ft.read( test_labels_file)
##    raw_test_labels = ft.read( test_data_file)
##
##    f.close()
##    g.close()
##    i.close()
##    h.close()
##
##
##    test_set_x, test_set_y = shared_dataset(test_set)
##    valid_set_x, valid_set_y = shared_dataset(valid_set)
##    train_set_x, train_set_y = shared_dataset(train_set)
##
##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
##    return rval
##    #create a validation set the same size as the test size
##    #use the end of the training array for this purpose
##    #discard the last remaining so we get a %batch_size number
##    test_size = len(raw_test_labels)
##    test_size = int(test_size/batch_size)
##    test_size *= batch_size
##    train_size = len(raw_train_data)
##    train_size = int(train_size/batch_size)
##    train_size *= batch_size
##    validation_size = test_size
##    offset = train_size - test_size
##    if verbose == True:
##        print 'train size = %d' %train_size
##        print 'test size = %d' %test_size
##        print 'valid size = %d' %validation_size
##        print 'offset = %d' %offset
##
##

#--------------------------------------------------------------------------------------------------------------------
# MAIN
#--------------------------------------------------------------------------------------------------------------------

def log_reg( learning_rate = 0.13, nb_max_examples = 1000000, batch_size = 50, \
             dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \
             patience = 5000, patience_increase = 2, improvement_threshold = 0.995 ):

    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type nb_max_examples: int
    :param nb_max_examples: maximum number of training examples to process
                            (the number of epochs is derived from it)

    :type batch_size: int
    :param batch_size: size of the minibatch

    :type dataset_name: string
    :param dataset_name: the path of the MNIST dataset file from
                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type image_size: int
    :param image_size: size of the input image in pixels (width * height)

    :type nb_class: int
    :param nb_class: number of classes

    :type patience: int
    :param patience: look at this many examples regardless

    :type patience_increase: int
    :param patience_increase: wait this much longer when a new best is found

    :type improvement_threshold: float
    :param improvement_threshold: a relative improvement of this much is
                                  considered significant


    """
    datasets = load_data_pkl_gz( dataset_name )

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x,  test_set_y  = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size
    n_valid_batches = valid_set_x.value.shape[0] / batch_size
    n_test_batches  = test_set_x.value.shape[0]  / batch_size

    #--------------------------------------------------------------------------------------------------------------------
    # Build actual model
    #--------------------------------------------------------------------------------------------------------------------

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar( )    # index to a [mini]batch
    x = T.matrix('x')       # the data is presented as rasterized images
    y = T.ivector('y')      # the labels are presented as 1D vector of
                            # [int] labels

    # construct the logistic regression class

    classifier = LogisticRegression( input = x, n_in = image_size, n_out = nb_class )

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood( y )

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function( inputs = [ index ],
            outputs = classifier.errors( y ),
            givens = {
                x: test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    validate_model = theano.function( inputs = [ index ],
            outputs = classifier.errors( y ),
            givens = {
                x: valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    # compute the gradient of cost with respect to theta = ( W, b )
    g_W = T.grad( cost = cost, wrt = classifier.W )
    g_b = T.grad( cost = cost, wrt = classifier.b )

    # specify how to update the parameters of the model as a dictionary
    # (plain SGD step: theta <- theta - learning_rate * gradient)
    updates = { classifier.W: classifier.W - learning_rate * g_W,
                classifier.b: classifier.b - learning_rate * g_b }

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function( inputs = [ index ],
            outputs = cost,
            updates = updates,
            givens = {
                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    #--------------------------------------------------------------------------------------------------------------------
    # Train model
    #--------------------------------------------------------------------------------------------------------------------

    print '... training the model'

    # early-stopping parameters: `patience`, `patience_increase` and
    # `improvement_threshold` are taken from the function arguments above
    validation_frequency = min( n_train_batches, patience * 0.5 )
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    n_epochs = nb_max_examples / train_set_x.value.shape[0]
    epoch = 0

    while ( epoch < n_epochs ) and ( not done_looping ):

        epoch = epoch + 1
        for minibatch_index in xrange( n_train_batches ):

            minibatch_avg_cost = train_model( minibatch_index )
            # iteration number
            iter = epoch * n_train_batches + minibatch_index

            if ( iter + 1 ) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
                this_validation_loss = numpy.mean( validation_losses )

                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      ( epoch, minibatch_index + 1, n_train_batches, \
                        this_validation_loss * 100. ) )


                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                           improvement_threshold:
                        patience = max( patience, iter * patience_increase )

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [ test_model(i) for i in xrange( n_test_batches ) ]
                    test_score = numpy.mean( test_losses )

                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') % \
                          ( epoch, minibatch_index + 1, n_train_batches, test_score * 100. ))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          ( best_validation_loss * 100., test_score * 100. ))
    print ('The code ran for %f minutes' % ( ( end_time - start_time ) / 60. ))

    # return the quantities that jobman_log_reg below expects; the number of
    # examples seen is approximated from the number of epochs run, and the
    # time is reported in minutes, matching the line printed above
    return ( best_validation_loss * 100., test_score * 100.,
             epoch * n_train_batches * batch_size,
             ( end_time - start_time ) / 60. )

if __name__ == '__main__':
    log_reg()


def jobman_log_reg(state, channel):
    (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate, \
                                                                 nb_max_examples = state.nb_max_examples, \
                                                                 batch_size = state.batch_size, \
                                                                 dataset_name = state.dataset_name, \
                                                                 image_size = state.image_size, \
                                                                 nb_class = state.nb_class )

    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    return channel.COMPLETE

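
#--------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): driving jobman_log_reg
# without Jobman itself, using stand-in objects for `state` and `channel`.
# The attribute values below simply mirror the defaults of log_reg and are
# assumptions made for the example.
#--------------------------------------------------------------------------------------------------------------------

class ExampleChannel(object):
    COMPLETE = 'COMPLETE'

class ExampleState(object):
    learning_rate = 0.13
    nb_max_examples = 1000000
    batch_size = 50
    dataset_name = 'mnist.pkl.gz'
    image_size = 28 * 28
    nb_class = 10

def example_jobman_log_reg_usage():
    state = ExampleState()
    channel = ExampleChannel()
    # runs a full training job and stores the results on the state object
    return jobman_log_reg( state, channel )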