comparison code_tutoriel/logistic_sgd.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents bcc87d3e33a3
children
comparing 164:e3de934a98b6 to 165:4bc5eeec6394
@@ -30 +30 @@
 References:

     - textbooks: "Pattern Recognition and Machine Learning" -
                  Christopher M. Bishop, section 4.3.2

-
 """
 __docformat__ = 'restructedtext en'

-
-import numpy, cPickle, gzip
-
-import time
+import numpy, time, cPickle, gzip

 import theano
 import theano.tensor as T
-
-import theano.tensor.nnet


 class LogisticRegression(object):
     """Multi-class Logistic Regression Class

@@ -60 +54 @@


     def __init__(self, input, n_in, n_out):
         """ Initialize the parameters of the logistic regression

+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
                       architecture (one minibatch)

+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in
                      which the datapoints lie

+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in
                       which the labels lie

         """

         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                dtype = theano.config.floatX) )
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX),
+                               name='W')
         # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared( value=numpy.zeros((n_out,),
-                                dtype = theano.config.floatX) )
+        self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX),
+                               name='b')


         # compute vector of class-membership probabilities in symbolic form
         self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)

         # compute prediction as class whose probability is maximal in
         # symbolic form
         self.y_pred=T.argmax(self.p_y_given_x, axis=1)

+        # parameters of the model
+        self.params = [self.W, self.b]
+




     def negative_log_likelihood(self, y):
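As an aside to the comparison: the lines above that define `p_y_given_x` and `y_pred` are the whole model. A minimal plain-numpy sketch of the same computation (illustrative only, not part of the changeset; shapes chosen to match the MNIST setup used later):

    import numpy

    def softmax_rows(a):
        # subtract the row-wise max for numerical stability, then normalise
        e = numpy.exp(a - a.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    rng = numpy.random.RandomState(0)
    x = rng.rand(5, 28 * 28)         # a toy minibatch of 5 rasterized "images"
    W = numpy.zeros((28 * 28, 10))   # same zero initialization as self.W
    b = numpy.zeros(10)              # same zero initialization as self.b

    p_y_given_x = softmax_rows(numpy.dot(x, W) + b)  # (5, 10) class probabilities
    y_pred = p_y_given_x.argmax(axis=1)              # most probable class per row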
@@ -100 +100 @@

             \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
             \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                 \ell (\theta=\{W,b\}, \mathcal{D})

-
+        :type y: theano.tensor.TensorType
         :param y: corresponds to a vector that gives for each example the
-        :correct label
+                  correct label

         Note: we use the mean instead of the sum so that
               the learning rate is less dependent on the batch size
         """
+        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
+        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
+        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
+        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
+        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
+        # i.e., the mean log-likelihood across the minibatch.
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-
-


     def errors(self, y):
         """Return a float representing the number of errors in the minibatch
         over the total number of examples of the minibatch ; zero one
         loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
         """

         # check if y has the same dimension as y_pred
         if y.ndim != self.y_pred.ndim:
             raise TypeError('y should have the same shape as self.y_pred',
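The comments added above `return -T.mean(...)` describe an advanced-indexing trick; a small numpy illustration of the same selection, with made-up numbers (not taken from the tutorial), may help:

    import numpy

    # toy log-probability matrix LP for n = 3 examples and 4 classes
    LP = numpy.log(numpy.array([[0.1, 0.2, 0.3, 0.4],
                                [0.7, 0.1, 0.1, 0.1],
                                [0.25, 0.25, 0.25, 0.25]]))
    y = numpy.array([3, 0, 1])        # correct label of each example

    # LP[arange(n), y] picks LP[0, y[0]], LP[1, y[1]], LP[2, y[2]]
    picked = LP[numpy.arange(y.shape[0]), y]
    nll = -picked.mean()              # mean rather than sum, so the learning
                                      # rate is less tied to the batch size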
@@ -132 +139 @@
             return T.mean(T.neq(self.y_pred, y))
         else:
             raise NotImplementedError()


-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+    print '... loading data'
+
+    # Load the dataset
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+
+    def shared_dataset(data_xy):
+        """ Function that loads the dataset into shared variables
+
+        The reason we store our dataset in shared variables is to allow
+        Theano to copy it into the GPU memory (when code is run on GPU).
+        Since copying data into the GPU is slow, copying a minibatch every time
+        it is needed (the default behaviour if the data is not in a shared
+        variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+        # When storing data on the GPU it has to be stored as floats
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as indices, and if they are
+        # floats it doesn't make sense) therefore instead of returning
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around this issue
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x,  test_set_y  = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
+    return rval
+
+
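One note on the new `shared_dataset` helper: its comment explains why labels are stored as `floatX` and then cast back to `int32`. A tiny numpy aside (illustrative, not from the file) on why the cast matters once the labels are used as indices, as in `negative_log_likelihood`:

    import numpy

    data_y = numpy.asarray([5, 0, 4], dtype='float64')  # labels kept as floats,
                                                        # like floatX storage would keep them
    LP = numpy.zeros((3, 10))                           # toy log-probability matrix

    # LP[numpy.arange(3), data_y] would fail: float arrays cannot be used as indices,
    # which is why shared_dataset returns T.cast(shared_y, 'int32')
    labels = data_y.astype('int32')
    picked = LP[numpy.arange(3), labels]                # works once cast to int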
+def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization of a log-linear
     model

     This is demonstrated on MNIST.

+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic
-    gradient
+                          gradient)

-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

     """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]

-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-    ishape = (28,28) # this is the size of MNIST images
+    batch_size = 600    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'

     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of
+                           # [int] labels

     # construct the logistic regression class
-    classifier = LogisticRegression( \
-                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+    # Each MNIST image has size 28*28
+    classifier = LogisticRegression( input=x, n_in=28*28, n_out=10)

     # the cost we minimize during training is the negative log likelihood of
     # the model in symbolic format
     cost = classifier.negative_log_likelihood(y)

     # compiling a Theano function that computes the mistakes that are made by
     # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function( inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})

     # compute the gradient of cost with respect to theta = (W,b)
-    g_W = T.grad(cost, classifier.W)
-    g_b = T.grad(cost, classifier.b)
+    g_W = T.grad(cost = cost, wrt = classifier.W)
+    g_b = T.grad(cost = cost, wrt = classifier.b)

     # specify how to update the parameters of the model as a dictionary
     updates ={classifier.W: classifier.W - learning_rate*g_W,\
               classifier.b: classifier.b - learning_rate*g_b}

     # compiling a Theano function `train_model` that returns the cost, but at
     # the same time updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-
-    n_minibatches = len(train_batches) # number of minibatchers
-
+    train_model = theano.function(inputs = [index],
+            outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})

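The `updates` dictionary compiled into `train_model` is plain gradient descent: each call moves the parameters one step against the gradient, while `givens` substitutes the minibatch selected by `index`. Roughly, in numpy terms (an illustrative sketch with stand-in gradients, not code from the file):

    import numpy

    learning_rate = 0.13
    W = numpy.zeros((28 * 28, 10))
    b = numpy.zeros(10)
    g_W = 0.01 * numpy.ones_like(W)   # stand-ins for the gradients Theano computes
    g_b = 0.01 * numpy.ones_like(b)

    # same rule as  updates = {W: W - learning_rate*g_W, b: b - learning_rate*g_b}
    W = W - learning_rate * g_W
    b = b - learning_rate * g_b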
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training the model'
     # early-stopping parameters
     patience              = 5000  # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency  = n_minibatches  # go through this many
+    validation_frequency  = min(n_train_batches, patience/2)
+                                  # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch

     best_params          = None
     best_validation_loss = float('inf')
     test_score           = 0.
     start_time = time.clock()

-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
-
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+    done_looping = False
+    epoch = 0
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index

             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)

                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                     (epoch, minibatch_index+1,n_minibatches, \
+                     (epoch, minibatch_index+1,n_train_batches, \
                      this_validation_loss*100.))


                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
@@ -273 +324 @@
                         patience = max(patience, iter * patience_increase)

                     best_validation_loss = this_validation_loss
                     # test it on the test set

-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
                     print((' epoch %i, minibatch %i/%i, test error of best '
                            'model %f %%') % \
-                        (epoch, minibatch_index+1, n_minibatches,test_score*100.))
+                        (epoch, minibatch_index+1, n_train_batches,test_score*100.))

             if patience <= iter :
+                done_looping = True
                 break

     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'
            'with test performance %f %%') %
           (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

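The early-stopping variables (`patience`, `patience_increase`, `improvement_threshold`, `validation_frequency`) drive the loop above. A compact standalone sketch of the same idea, with made-up validation losses (not results from the tutorial):

    patience = 5000                   # run at least this many minibatch updates
    patience_increase = 2
    improvement_threshold = 0.995
    best = float('inf')

    fake_losses = {1000: 0.12, 2000: 0.10, 3000: 0.099, 4000: 0.0989}  # made up
    for it in sorted(fake_losses):
        loss = fake_losses[it]
        if loss < best * improvement_threshold:               # significant improvement:
            patience = max(patience, it * patience_increase)  # keep training longer
        best = min(best, loss)
        if patience <= it:                                    # patience exhausted
            break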
 if __name__ == '__main__':
     sgd_optimization_mnist()

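Finally, a usage note. With `mnist.pkl.gz` in the working directory, the updated script can be run as-is through the `__main__` guard above, or the driver can be imported and called with other hyperparameters (the values below are illustrative, not part of the changeset):

    # python logistic_sgd.py          # runs with the defaults set in this revision

    from logistic_sgd import sgd_optimization_mnist

    sgd_optimization_mnist(learning_rate=0.1,
                           n_epochs=50,
                           dataset='mnist.pkl.gz')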