ift6266: comparison of code_tutoriel/logistic_sgd.py @ 165:4bc5eeec6394
Updating the tutorial code to the latest revisions.
| author   | Dumitru Erhan <dumitru.erhan@gmail.com> |
|----------|-----------------------------------------|
| date     | Fri, 26 Feb 2010 13:55:27 -0500 |
| parents  | bcc87d3e33a3 |
| children | |
| 164:e3de934a98b6 (old) | 165:4bc5eeec6394 (new) |
|---|---|
30 References: | 30 References: |
31 | 31 |
32 - textbooks: "Pattern Recognition and Machine Learning" - | 32 - textbooks: "Pattern Recognition and Machine Learning" - |
33 Christopher M. Bishop, section 4.3.2 | 33 Christopher M. Bishop, section 4.3.2 |
34 | 34 |
35 | |
36 """ | 35 """ |
37 __docformat__ = 'restructedtext en' | 36 __docformat__ = 'restructedtext en' |
38 | 37 |
39 | 38 import numpy, time, cPickle, gzip |
40 import numpy, cPickle, gzip | |
41 | |
42 import time | |
43 | 39 |
44 import theano | 40 import theano |
45 import theano.tensor as T | 41 import theano.tensor as T |
46 | |
47 import theano.tensor.nnet | |
48 | 42 |
49 | 43 |
50 class LogisticRegression(object): | 44 class LogisticRegression(object): |
51 """Multi-class Logistic Regression Class | 45 """Multi-class Logistic Regression Class |
52 | 46 |
60 | 54 |
61 | 55 |
62 def __init__(self, input, n_in, n_out): | 56 def __init__(self, input, n_in, n_out): |
63 """ Initialize the parameters of the logistic regression | 57 """ Initialize the parameters of the logistic regression |
64 | 58 |
59 :type input: theano.tensor.TensorType | |
65 :param input: symbolic variable that describes the input of the | 60 :param input: symbolic variable that describes the input of the |
66 architecture (one minibatch) | 61 architecture (one minibatch) |
67 | 62 |
63 :type n_in: int | |
68 :param n_in: number of input units, the dimension of the space in | 64 :param n_in: number of input units, the dimension of the space in |
69 which the datapoints lie | 65 which the datapoints lie |
70 | 66 |
67 :type n_out: int | |
71 :param n_out: number of output units, the dimension of the space in | 68 :param n_out: number of output units, the dimension of the space in |
72 which the labels lie | 69 which the labels lie |
73 | 70 |
74 """ | 71 """ |
75 | 72 |
76 # initialize with 0 the weights W as a matrix of shape (n_in, n_out) | 73 # initialize with 0 the weights W as a matrix of shape (n_in, n_out) |
77 self.W = theano.shared( value=numpy.zeros((n_in,n_out), | 74 self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX), |
78 dtype = theano.config.floatX) ) | 75 name='W') |
79 # initialize the baises b as a vector of n_out 0s | 76 # initialize the baises b as a vector of n_out 0s |
80 self.b = theano.shared( value=numpy.zeros((n_out,), | 77 self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX), |
81 dtype = theano.config.floatX) ) | 78 name='b') |
82 | 79 |
83 | 80 |
84 # compute vector of class-membership probabilities in symbolic form | 81 # compute vector of class-membership probabilities in symbolic form |
85 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) | 82 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) |
86 | 83 |
87 # compute prediction as class whose probability is maximal in | 84 # compute prediction as class whose probability is maximal in |
88 # symbolic form | 85 # symbolic form |
89 self.y_pred=T.argmax(self.p_y_given_x, axis=1) | 86 self.y_pred=T.argmax(self.p_y_given_x, axis=1) |
90 | 87 |
88 # parameters of the model | |
89 self.params = [self.W, self.b] | |
90 | |
91 | 91 |
92 | 92 |
93 | 93 |
94 | 94 |
95 def negative_log_likelihood(self, y): | 95 def negative_log_likelihood(self, y): |
100 | 100 |
101 \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = | 101 \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = |
102 \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ | 102 \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ |
103 \ell (\theta=\{W,b\}, \mathcal{D}) | 103 \ell (\theta=\{W,b\}, \mathcal{D}) |
104 | 104 |
105 | 105 :type y: theano.tensor.TensorType |
106 :param y: corresponds to a vector that gives for each example the | 106 :param y: corresponds to a vector that gives for each example the |
107 :correct label | 107 correct label |
108 | 108 |
109 Note: we use the mean instead of the sum so that | 109 Note: we use the mean instead of the sum so that |
110 the learning rate is less dependent on the batch size | 110 the learning rate is less dependent on the batch size |
111 """ | 111 """ |
112 # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch | |
113 # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] | |
114 # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class | |
115 # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]] | |
116 # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v, | |
117 # i.e., the mean log-likelihood across the minibatch. | |
112 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) | 118 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) |
113 | |
114 | |
115 | |
116 | 119 |
117 | 120 |
118 def errors(self, y): | 121 def errors(self, y): |
119 """Return a float representing the number of errors in the minibatch | 122 """Return a float representing the number of errors in the minibatch |
120 over the total number of examples of the minibatch ; zero one | 123 over the total number of examples of the minibatch ; zero one |
121 loss over the size of the minibatch | 124 loss over the size of the minibatch |
125 | |
126 :type y: theano.tensor.TensorType | |
127 :param y: corresponds to a vector that gives for each example the | |
128 correct label | |
122 """ | 129 """ |
123 | 130 |
124 # check if y has same dimension of y_pred | 131 # check if y has same dimension of y_pred |
125 if y.ndim != self.y_pred.ndim: | 132 if y.ndim != self.y_pred.ndim: |
126 raise TypeError('y should have the same shape as self.y_pred', | 133 raise TypeError('y should have the same shape as self.y_pred', |
132 return T.mean(T.neq(self.y_pred, y)) | 139 return T.mean(T.neq(self.y_pred, y)) |
133 else: | 140 else: |
134 raise NotImplementedError() | 141 raise NotImplementedError() |
135 | 142 |
136 | 143 |
137 | 144 def load_data(dataset): |
138 | 145 ''' Loads the dataset |
139 | 146 |
140 def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): | 147 :type dataset: string |
148 :param dataset: the path to the dataset (here MNIST) | |
149 ''' | |
150 | |
151 ############# | |
152 # LOAD DATA # | |
153 ############# | |
154 print '... loading data' | |
155 | |
156 # Load the dataset | |
157 f = gzip.open(dataset,'rb') | |
158 train_set, valid_set, test_set = cPickle.load(f) | |
159 f.close() | |
160 | |
161 | |
162 def shared_dataset(data_xy): | |
163 """ Function that loads the dataset into shared variables | |
164 | |
165 The reason we store our dataset in shared variables is to allow | |
166 Theano to copy it into the GPU memory (when code is run on GPU). | |
167 Since copying data into the GPU is slow, copying a minibatch everytime | |
168 is needed (the default behaviour if the data is not in a shared | |
169 variable) would lead to a large decrease in performance. | |
170 """ | |
171 data_x, data_y = data_xy | |
172 shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) | |
173 shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) | |
174 # When storing data on the GPU it has to be stored as floats | |
175 # therefore we will store the labels as ``floatX`` as well | |
176 # (``shared_y`` does exactly that). But during our computations | |
177 # we need them as ints (we use labels as index, and if they are | |
178 # floats it doesn't make sense) therefore instead of returning | |
179 # ``shared_y`` we will have to cast it to int. This little hack | |
180 # lets ous get around this issue | |
181 return shared_x, T.cast(shared_y, 'int32') | |
182 | |
183 test_set_x, test_set_y = shared_dataset(test_set) | |
184 valid_set_x, valid_set_y = shared_dataset(valid_set) | |
185 train_set_x, train_set_y = shared_dataset(train_set) | |
186 | |
187 rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] | |
188 return rval | |
189 | |
190 | |
191 | |
192 | |
193 def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'): | |
141 """ | 194 """ |
142 Demonstrate stochastic gradient descent optimization of a log-linear | 195 Demonstrate stochastic gradient descent optimization of a log-linear |
143 model | 196 model |
144 | 197 |
145 This is demonstrated on MNIST. | 198 This is demonstrated on MNIST. |
146 | 199 |
200 :type learning_rate: float | |
147 :param learning_rate: learning rate used (factor for the stochastic | 201 :param learning_rate: learning rate used (factor for the stochastic |
148 gradient | 202 gradient) |
149 | 203 |
150 :param n_iter: maximal number of iterations ot run the optimizer | 204 :type n_epochs: int |
205 :param n_epochs: maximal number of epochs to run the optimizer | |
206 | |
207 :type dataset: string | |
208 :param dataset: the path of the MNIST dataset file from | |
209 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz | |
151 | 210 |
152 """ | 211 """ |
153 | 212 datasets = load_data(dataset) |
154 # Load the dataset | 213 |
155 f = gzip.open('mnist.pkl.gz','rb') | 214 train_set_x, train_set_y = datasets[0] |
156 train_set, valid_set, test_set = cPickle.load(f) | 215 valid_set_x, valid_set_y = datasets[1] |
157 f.close() | 216 test_set_x , test_set_y = datasets[2] |
158 | 217 |
159 # make minibatches of size 20 | 218 batch_size = 600 # size of the minibatch |
160 batch_size = 20 # sized of the minibatch | 219 |
161 | 220 # compute number of minibatches for training, validation and testing |
162 # Dealing with the training set | 221 n_train_batches = train_set_x.value.shape[0] / batch_size |
163 # get the list of training images (x) and their labels (y) | 222 n_valid_batches = valid_set_x.value.shape[0] / batch_size |
164 (train_set_x, train_set_y) = train_set | 223 n_test_batches = test_set_x.value.shape[0] / batch_size |
165 # initialize the list of training minibatches with empty list | 224 |
166 train_batches = [] | 225 |
167 for i in xrange(0, len(train_set_x), batch_size): | 226 ###################### |
168 # add to the list of minibatches the minibatch starting at | 227 # BUILD ACTUAL MODEL # |
169 # position i, ending at position i+batch_size | 228 ###################### |
170 # a minibatch is a pair ; the first element of the pair is a list | 229 print '... building the model' |
171 # of datapoints, the second element is the list of corresponding | 230 |
172 # labels | |
173 train_batches = train_batches + \ | |
174 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] | |
175 | |
176 # Dealing with the validation set | |
177 (valid_set_x, valid_set_y) = valid_set | |
178 # initialize the list of validation minibatches | |
179 valid_batches = [] | |
180 for i in xrange(0, len(valid_set_x), batch_size): | |
181 valid_batches = valid_batches + \ | |
182 [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] | |
183 | |
184 # Dealing with the testing set | |
185 (test_set_x, test_set_y) = test_set | |
186 # initialize the list of testing minibatches | |
187 test_batches = [] | |
188 for i in xrange(0, len(test_set_x), batch_size): | |
189 test_batches = test_batches + \ | |
190 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] | |
191 | |
192 | |
193 ishape = (28,28) # this is the size of MNIST images | |
194 | 231 |
195 # allocate symbolic variables for the data | 232 # allocate symbolic variables for the data |
196 x = T.fmatrix() # the data is presented as rasterized images | 233 index = T.lscalar() # index to a [mini]batch |
197 y = T.lvector() # the labels are presented as 1D vector of | 234 x = T.matrix('x') # the data is presented as rasterized images |
198 # [long int] labels | 235 y = T.ivector('y') # the labels are presented as 1D vector of |
236 # [int] labels | |
199 | 237 |
200 # construct the logistic regression class | 238 # construct the logistic regression class |
201 classifier = LogisticRegression( \ | 239 # Each MNIST image has size 28*28 |
202 input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) | 240 classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) |
203 | 241 |
204 # the cost we minimize during training is the negative log likelihood of | 242 # the cost we minimize during training is the negative log likelihood of |
205 # the model in symbolic format | 243 # the model in symbolic format |
206 cost = classifier.negative_log_likelihood(y) | 244 cost = classifier.negative_log_likelihood(y) |
207 | 245 |
208 # compiling a Theano function that computes the mistakes that are made by | 246 # compiling a Theano function that computes the mistakes that are made by |
209 # the model on a minibatch | 247 # the model on a minibatch |
210 test_model = theano.function([x,y], classifier.errors(y)) | 248 test_model = theano.function(inputs = [index], |
249 outputs = classifier.errors(y), | |
250 givens={ | |
251 x:test_set_x[index*batch_size:(index+1)*batch_size], | |
252 y:test_set_y[index*batch_size:(index+1)*batch_size]}) | |
253 | |
254 validate_model = theano.function( inputs = [index], | |
255 outputs = classifier.errors(y), | |
256 givens={ | |
257 x:valid_set_x[index*batch_size:(index+1)*batch_size], | |
258 y:valid_set_y[index*batch_size:(index+1)*batch_size]}) | |
211 | 259 |
212 # compute the gradient of cost with respect to theta = (W,b) | 260 # compute the gradient of cost with respect to theta = (W,b) |
213 g_W = T.grad(cost, classifier.W) | 261 g_W = T.grad(cost = cost, wrt = classifier.W) |
214 g_b = T.grad(cost, classifier.b) | 262 g_b = T.grad(cost = cost, wrt = classifier.b) |
215 | 263 |
216 # specify how to update the parameters of the model as a dictionary | 264 # specify how to update the parameters of the model as a dictionary |
217 updates ={classifier.W: classifier.W - learning_rate*g_W,\ | 265 updates ={classifier.W: classifier.W - learning_rate*g_W,\ |
218 classifier.b: classifier.b - learning_rate*g_b} | 266 classifier.b: classifier.b - learning_rate*g_b} |
219 | 267 |
220 # compiling a Theano function `train_model` that returns the cost, but in | 268 # compiling a Theano function `train_model` that returns the cost, but in |
221 # the same time updates the parameter of the model based on the rules | 269 # the same time updates the parameter of the model based on the rules |
222 # defined in `updates` | 270 # defined in `updates` |
223 train_model = theano.function([x, y], cost, updates = updates ) | 271 train_model = theano.function(inputs = [index], |
224 | 272 outputs = cost, |
225 n_minibatches = len(train_batches) # number of minibatchers | 273 updates = updates, |
226 | 274 givens={ |
275 x:train_set_x[index*batch_size:(index+1)*batch_size], | |
276 y:train_set_y[index*batch_size:(index+1)*batch_size]}) | |
277 | |
278 ############### | |
279 # TRAIN MODEL # | |
280 ############### | |
281 print '... training the model' | |
227 # early-stopping parameters | 282 # early-stopping parameters |
228 patience = 5000 # look as this many examples regardless | 283 patience = 5000 # look as this many examples regardless |
229 patience_increase = 2 # wait this much longer when a new best is | 284 patience_increase = 2 # wait this much longer when a new best is |
230 # found | 285 # found |
231 improvement_threshold = 0.995 # a relative improvement of this much is | 286 improvement_threshold = 0.995 # a relative improvement of this much is |
232 # considered significant | 287 # considered significant |
233 validation_frequency = n_minibatches # go through this many | 288 validation_frequency = min(n_train_batches, patience/2) |
289 # go through this many | |
234 # minibatche before checking the network | 290 # minibatche before checking the network |
235 # on the validation set; in this case we | 291 # on the validation set; in this case we |
236 # check every epoch | 292 # check every epoch |
237 | 293 |
238 best_params = None | 294 best_params = None |
239 best_validation_loss = float('inf') | 295 best_validation_loss = float('inf') |
240 test_score = 0. | 296 test_score = 0. |
241 start_time = time.clock() | 297 start_time = time.clock() |
242 # have a maximum of `n_iter` iterations through the entire dataset | 298 |
243 for iter in xrange(n_iter* n_minibatches): | 299 done_looping = False |
244 | 300 epoch = 0 |
245 # get epoch and minibatch index | 301 while (epoch < n_epochs) and (not done_looping): |
246 epoch = iter / n_minibatches | 302 epoch = epoch + 1 |
247 minibatch_index = iter % n_minibatches | 303 for minibatch_index in xrange(n_train_batches): |
248 | 304 |
249 # get the minibatches corresponding to `iter` modulo | 305 minibatch_avg_cost = train_model(minibatch_index) |
250 # `len(train_batches)` | 306 # iteration number |
251 x,y = train_batches[ minibatch_index ] | 307 iter = epoch * n_train_batches + minibatch_index |
252 cost_ij = train_model(x,y) | |
253 | 308 |
254 if (iter+1) % validation_frequency == 0: | 309 if (iter+1) % validation_frequency == 0: |
255 # compute zero-one loss on validation set | 310 # compute zero-one loss on validation set |
256 this_validation_loss = 0. | 311 validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] |
257 for x,y in valid_batches: | 312 this_validation_loss = numpy.mean(validation_losses) |
258 # sum up the errors for each minibatch | |
259 this_validation_loss += test_model(x,y) | |
260 # get the average by dividing with the number of minibatches | |
261 this_validation_loss /= len(valid_batches) | |
262 | 313 |
263 print('epoch %i, minibatch %i/%i, validation error %f %%' % \ | 314 print('epoch %i, minibatch %i/%i, validation error %f %%' % \ |
264 (epoch, minibatch_index+1,n_minibatches, \ | 315 (epoch, minibatch_index+1,n_train_batches, \ |
265 this_validation_loss*100.)) | 316 this_validation_loss*100.)) |
266 | 317 |
267 | 318 |
268 # if we got the best validation score until now | 319 # if we got the best validation score until now |
269 if this_validation_loss < best_validation_loss: | 320 if this_validation_loss < best_validation_loss: |
273 patience = max(patience, iter * patience_increase) | 324 patience = max(patience, iter * patience_increase) |
274 | 325 |
275 best_validation_loss = this_validation_loss | 326 best_validation_loss = this_validation_loss |
276 # test it on the test set | 327 # test it on the test set |
277 | 328 |
278 test_score = 0. | 329 test_losses = [test_model(i) for i in xrange(n_test_batches)] |
279 for x,y in test_batches: | 330 test_score = numpy.mean(test_losses) |
280 test_score += test_model(x,y) | 331 |
281 test_score /= len(test_batches) | |
282 print((' epoch %i, minibatch %i/%i, test error of best ' | 332 print((' epoch %i, minibatch %i/%i, test error of best ' |
283 'model %f %%') % \ | 333 'model %f %%') % \ |
284 (epoch, minibatch_index+1, n_minibatches,test_score*100.)) | 334 (epoch, minibatch_index+1, n_train_batches,test_score*100.)) |
285 | 335 |
286 if patience <= iter : | 336 if patience <= iter : |
337 done_looping = True | |
287 break | 338 break |
288 | 339 |
289 end_time = time.clock() | 340 end_time = time.clock() |
290 print(('Optimization complete with best validation score of %f %%,' | 341 print(('Optimization complete with best validation score of %f %%,' |
291 'with test performance %f %%') % | 342 'with test performance %f %%') % |
292 (best_validation_loss * 100., test_score*100.)) | 343 (best_validation_loss * 100., test_score*100.)) |
293 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) | 344 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) |
294 | 345 |
295 | |
296 | |
297 | |
298 | |
299 | |
300 | |
301 if __name__ == '__main__': | 346 if __name__ == '__main__': |
302 sgd_optimization_mnist() | 347 sgd_optimization_mnist() |
303 | 348 |
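One addition in this revision is the comment block above the `return` in `negative_log_likelihood` (new lines 112-117), which explains the advanced-indexing expression `T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]`. Below is a minimal NumPy sketch of the same indexing trick; the array `LP`, the labels `y`, and all numbers are made up purely for illustration and are not part of the tutorial.

```python
import numpy

# LP plays the role of T.log(self.p_y_given_x): one row per example,
# one column per class (3 examples, 4 classes, made-up probabilities).
LP = numpy.log(numpy.array([[0.7, 0.1, 0.1, 0.1],
                            [0.2, 0.5, 0.2, 0.1],
                            [0.1, 0.1, 0.1, 0.7]]))
y = numpy.array([0, 1, 3])   # correct label of each example

# Same selection as LP[T.arange(y.shape[0]), y] in the symbolic graph:
# picks LP[0, y[0]], LP[1, y[1]], LP[2, y[2]].
picked = LP[numpy.arange(y.shape[0]), y]

# Mean rather than sum, as the docstring notes, so the cost scale does
# not depend on the minibatch size.
nll = -picked.mean()
print(nll)
```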
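The main structural change is that the dataset now lives in Theano shared variables (see the new `load_data` and its inner `shared_dataset`), and the compiled `train_model`, `validate_model`, and `test_model` functions take only a minibatch index, with `givens` substituting the corresponding slice of the shared data. Here is a minimal sketch of that pattern under the same Theano-era API used in the file; the toy arrays and the names `data_x`, `data_y`, and `stats_fn` are illustrative and not part of the tutorial.

```python
import numpy
import theano
import theano.tensor as T

batch_size = 2

# A toy dataset held in shared variables, as shared_dataset does for MNIST.
data_x = theano.shared(numpy.arange(12, dtype=theano.config.floatX).reshape(6, 2))
data_y = theano.shared(numpy.asarray([0, 1, 0, 1, 0, 1], dtype='int32'))

index = T.lscalar()   # minibatch index, the only explicit input
x = T.matrix('x')
y = T.ivector('y')

# `givens` replaces x and y by the slice selected by `index`, so the whole
# dataset never has to be copied into the function call by call (the point
# of the GPU remark in shared_dataset's comments).
stats_fn = theano.function(inputs=[index],
                           outputs=[x.mean(), y.sum()],
                           givens={x: data_x[index * batch_size:(index + 1) * batch_size],
                                   y: data_y[index * batch_size:(index + 1) * batch_size]})

print(stats_fn(0))   # statistics of the first minibatch
print(stats_fn(2))   # statistics of the third minibatch
```

Only the integer index crosses into the compiled function on each call; the slices are taken inside the graph, directly out of the shared variables.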