Mercurial > ift6266
comparison code_tutoriel/logistic_sgd.py @ 0:fda5f787baa6
commit initial
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Thu, 21 Jan 2010 11:26:43 -0500 |
parents | |
children | bcc87d3e33a3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fda5f787baa6 |
---|---|
1 """ | |
2 This tutorial introduces logistic regression using Theano and stochastic | |
3 gradient descent. | |
4 | |
5 Logistic regression is a probabilistic, linear classifier. It is parametrized | |
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is | |
7 done by projecting data points onto a set of hyperplanes, the distance to | |
8 which is used to determine a class membership probability. | |
9 | |
10 Mathematically, this can be written as: | |
11 | |
12 .. math:: | |
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\ | |
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} | |
15 | |
16 | |
17 The output of the model or prediction is then done by taking the argmax of | |
18 the vector whose i'th element is P(Y=i|x). | |
19 | |
20 .. math:: | |
21 | |
22 y_{pred} = argmax_i P(Y=i|x,W,b) | |
23 | |
24 | |
25 This tutorial presents a stochastic gradient descent optimization method | |
26 suitable for large datasets, and a conjugate gradient optimization method | |
27 that is suitable for smaller datasets. | |
28 | |
29 | |
30 References: | |
31 | |
32 - textbooks: "Pattern Recognition and Machine Learning" - | |
33 Christopher M. Bishop, section 4.3.2 | |
34 | |
35 | |
36 """ | |
37 __docformat__ = 'restructedtext en' | |
38 | |
39 | |
40 import numpy, cPickle, gzip | |
41 | |
42 import time | |
43 | |
44 import theano | |
45 import theano.tensor as T | |
46 | |
47 import theano.tensor.nnet | |
48 | |
49 | |
50 class LogisticRegression(object): | |
51 """Multi-class Logistic Regression Class | |
52 | |
53 The logistic regression is fully described by a weight matrix :math:`W` | |
54 and bias vector :math:`b`. Classification is done by projecting data | |
55 points onto a set of hyperplanes, the distance to which is used to | |
56 determine a class membership probability. | |
57 """ | |
58 | |
59 | |
60 | |
61 | |
62 def __init__(self, input, n_in, n_out): | |
63 """ Initialize the parameters of the logistic regression | |
64 | |
65 :param input: symbolic variable that describes the input of the | |
66 architecture (one minibatch) | |
67 | |
68 :param n_in: number of input units, the dimension of the space in | |
69 which the datapoints lie | |
70 | |
71 :param n_out: number of output units, the dimension of the space in | |
72 which the labels lie | |
73 | |
74 """ | |
75 | |
76 # initialize with 0 the weights W as a matrix of shape (n_in, n_out) | |
77 self.W = theano.shared( value=numpy.zeros((n_in,n_out), | |
78 dtype = theano.config.floatX) ) | |
79 # initialize the baises b as a vector of n_out 0s | |
80 self.b = theano.shared( value=numpy.zeros((n_out,), | |
81 dtype = theano.config.floatX) ) | |
82 | |
83 | |
84 # compute vector of class-membership probabilities in symbolic form | |
85 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) | |
86 | |
87 # compute prediction as class whose probability is maximal in | |
88 # symbolic form | |
89 self.y_pred=T.argmax(self.p_y_given_x, axis=1) | |
90 | |
91 | |
92 | |
93 | |
94 | |
95 def negative_log_likelihood(self, y): | |
96 """Return the mean of the negative log-likelihood of the prediction | |
97 of this model under a given target distribution. | |
98 | |
99 .. math:: | |
100 | |
101 \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = | |
102 \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ | |
103 \ell (\theta=\{W,b\}, \mathcal{D}) | |
104 | |
105 | |
106 :param y: corresponds to a vector that gives for each example the | |
107 :correct label | |
108 | |
109 Note: we use the mean instead of the sum so that | |
110 the learning rate is less dependent on the batch size | |
111 """ | |
112 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) | |
113 | |
114 | |
115 | |
116 | |
117 | |
118 def errors(self, y): | |
119 """Return a float representing the number of errors in the minibatch | |
120 over the total number of examples of the minibatch ; zero one | |
121 loss over the size of the minibatch | |
122 """ | |
123 | |
124 # check if y has same dimension of y_pred | |
125 if y.ndim != self.y_pred.ndim: | |
126 raise TypeError('y should have the same shape as self.y_pred', | |
127 ('y', target.type, 'y_pred', self.y_pred.type)) | |
128 # check if y is of the correct datatype | |
129 if y.dtype.startswith('int'): | |
130 # the T.neq operator returns a vector of 0s and 1s, where 1 | |
131 # represents a mistake in prediction | |
132 return T.mean(T.neq(self.y_pred, y)) | |
133 else: | |
134 raise NotImplementedError() | |
135 | |
136 | |
137 | |
138 | |
139 | |
140 def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): | |
141 """ | |
142 Demonstrate stochastic gradient descent optimization of a log-linear | |
143 model | |
144 | |
145 This is demonstrated on MNIST. | |
146 | |
147 :param learning_rate: learning rate used (factor for the stochastic | |
148 gradient | |
149 | |
150 :param n_iter: number of iterations ot run the optimizer | |
151 | |
152 """ | |
153 | |
154 # Load the dataset | |
155 f = gzip.open('mnist.pkl.gz','rb') | |
156 train_set, valid_set, test_set = cPickle.load(f) | |
157 f.close() | |
158 | |
159 # make minibatches of size 20 | |
160 batch_size = 20 # sized of the minibatch | |
161 | |
162 # Dealing with the training set | |
163 # get the list of training images (x) and their labels (y) | |
164 (train_set_x, train_set_y) = train_set | |
165 # initialize the list of training minibatches with empty list | |
166 train_batches = [] | |
167 for i in xrange(0, len(train_set_x), batch_size): | |
168 # add to the list of minibatches the minibatch starting at | |
169 # position i, ending at position i+batch_size | |
170 # a minibatch is a pair ; the first element of the pair is a list | |
171 # of datapoints, the second element is the list of corresponding | |
172 # labels | |
173 train_batches = train_batches + \ | |
174 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] | |
175 | |
176 # Dealing with the validation set | |
177 (valid_set_x, valid_set_y) = valid_set | |
178 # initialize the list of validation minibatches | |
179 valid_batches = [] | |
180 for i in xrange(0, len(valid_set_x), batch_size): | |
181 valid_batches = valid_batches + \ | |
182 [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] | |
183 | |
184 # Dealing with the testing set | |
185 (test_set_x, test_set_y) = test_set | |
186 # initialize the list of testing minibatches | |
187 test_batches = [] | |
188 for i in xrange(0, len(test_set_x), batch_size): | |
189 test_batches = test_batches + \ | |
190 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] | |
191 | |
192 | |
193 ishape = (28,28) # this is the size of MNIST images | |
194 | |
195 # allocate symbolic variables for the data | |
196 x = T.fmatrix() # the data is presented as rasterized images | |
197 y = T.lvector() # the labels are presented as 1D vector of | |
198 # [long int] labels | |
199 | |
200 # construct the logistic regression class | |
201 classifier = LogisticRegression( \ | |
202 input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) | |
203 | |
204 # the cost we minimize during training is the negative log likelihood of | |
205 # the model in symbolic format | |
206 cost = classifier.negative_log_likelihood(y) | |
207 | |
208 # compiling a Theano function that computes the mistakes that are made by | |
209 # the model on a minibatch | |
210 test_model = theano.function([x,y], classifier.errors(y)) | |
211 | |
212 # compute the gradient of cost with respect to theta = (W,b) | |
213 g_W = T.grad(cost, classifier.W) | |
214 g_b = T.grad(cost, classifier.b) | |
215 | |
216 # specify how to update the parameters of the model as a dictionary | |
217 updates ={classifier.W: classifier.W - learning_rate*g_W,\ | |
218 classifier.b: classifier.b - learning_rate*g_b} | |
219 | |
220 # compiling a Theano function `train_model` that returns the cost, but in | |
221 # the same time updates the parameter of the model based on the rules | |
222 # defined in `updates` | |
223 train_model = theano.function([x, y], cost, updates = updates ) | |
224 | |
225 n_minibatches = len(train_batches) # number of minibatchers | |
226 | |
227 # early-stopping parameters | |
228 patience = 5000 # look as this many examples regardless | |
229 patience_increase = 2 # wait this much longer when a new best is | |
230 # found | |
231 improvement_threshold = 0.995 # a relative improvement of this much is | |
232 # considered significant | |
233 validation_frequency = n_minibatches # go through this many | |
234 # minibatche before checking the network | |
235 # on the validation set; in this case we | |
236 # check every epoch | |
237 | |
238 best_params = None | |
239 best_validation_loss = float('inf') | |
240 test_score = 0. | |
241 start_time = time.clock() | |
242 # have a maximum of `n_iter` iterations through the entire dataset | |
243 for iter in xrange(n_iter* n_minibatches): | |
244 | |
245 # get epoch and minibatch index | |
246 epoch = iter / n_minibatches | |
247 minibatch_index = iter % n_minibatches | |
248 | |
249 # get the minibatches corresponding to `iter` modulo | |
250 # `len(train_batches)` | |
251 x,y = train_batches[ minibatch_index ] | |
252 cost_ij = train_model(x,y) | |
253 | |
254 if (iter+1) % validation_frequency == 0: | |
255 # compute zero-one loss on validation set | |
256 this_validation_loss = 0. | |
257 for x,y in valid_batches: | |
258 # sum up the errors for each minibatch | |
259 this_validation_loss += test_model(x,y) | |
260 # get the average by dividing with the number of minibatches | |
261 this_validation_loss /= len(valid_batches) | |
262 | |
263 print('epoch %i, minibatch %i/%i, validation error %f %%' % \ | |
264 (epoch, minibatch_index+1,n_minibatches, \ | |
265 this_validation_loss*100.)) | |
266 | |
267 | |
268 # if we got the best validation score until now | |
269 if this_validation_loss < best_validation_loss: | |
270 #improve patience if loss improvement is good enough | |
271 if this_validation_loss < best_validation_loss * \ | |
272 improvement_threshold : | |
273 patience = max(patience, iter * patience_increase) | |
274 | |
275 best_validation_loss = this_validation_loss | |
276 # test it on the test set | |
277 | |
278 test_score = 0. | |
279 for x,y in test_batches: | |
280 test_score += test_model(x,y) | |
281 test_score /= len(test_batches) | |
282 print((' epoch %i, minibatch %i/%i, test error of best ' | |
283 'model %f %%') % \ | |
284 (epoch, minibatch_index+1, n_minibatches,test_score*100.)) | |
285 | |
286 if patience <= iter : | |
287 break | |
288 | |
289 end_time = time.clock() | |
290 print(('Optimization complete with best validation score of %f %%,' | |
291 'with test performance %f %%') % | |
292 (best_validation_loss * 100., test_score*100.)) | |
293 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) | |
294 | |
295 | |
296 | |
297 | |
298 | |
299 | |
300 | |
301 if __name__ == '__main__': | |
302 sgd_optimization_mnist() | |
303 |