code_tutoriel/logistic_cg.py @ 165:4bc5eeec6394
Updating the tutorial code to the latest revisions.

author  Dumitru Erhan <dumitru.erhan@gmail.com>
date    Fri, 26 Feb 2010 13:55:27 -0500
1 """ | |
2 This tutorial introduces logistic regression using Theano and conjugate | |
3 gradient descent. | |
4 | |
5 Logistic regression is a probabilistic, linear classifier. It is parametrized | |
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is | |
7 done by projecting data points onto a set of hyperplanes, the distance to | |
8 which is used to determine a class membership probability. | |
9 | |
10 Mathematically, this can be written as: | |
11 | |
12 .. math:: | |
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\ | |
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} | |
15 | |
16 | |
17 The output of the model or prediction is then done by taking the argmax of | |
18 the vector whose i'th element is P(Y=i|x). | |
19 | |
20 .. math:: | |
21 | |
22 y_{pred} = argmax_i P(Y=i|x,W,b) | |
23 | |
24 | |
25 This tutorial presents a stochastic gradient descent optimization method | |
26 suitable for large datasets, and a conjugate gradient optimization method | |
27 that is suitable for smaller datasets. | |
28 | |
29 | |
30 References: | |
31 | |
32 - textbooks: "Pattern Recognition and Machine Learning" - | |
33 Christopher M. Bishop, section 4.3.2 | |
34 | |
35 | |
36 """ | |
37 __docformat__ = 'restructedtext en' | |
38 | |
39 | |
40 import numpy, time, cPickle, gzip | |
41 | |
42 import theano | |
43 import theano.tensor as T | |
44 | |


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the targets lie

        """

        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(value=numpy.zeros(n_in*n_out + n_out,
                                                     dtype=theano.config.floatX))
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in*n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in*n_out:n_in*n_out + n_out]
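        # W and b are thus symbolic views (a reshape and a slice) of the
        # single flat shared vector theta; keeping all parameters in one
        # 1-D vector is what lets scipy.optimize.fmin_cg, which works on a
        # flat numpy array, optimize W and b jointly (see
        # cg_optimization_mnist below).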

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as the class whose probability is maximal, in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        r"""Return the mean negative log-likelihood of the predictions of
        this model under a given target distribution.

        .. math::

            \mathrm{NLL}(\theta=\{W,b\}, \mathcal{D}) =
                -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|-1}
                \log P(Y=y^{(i)} \mid x^{(i)}, W, b)

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the fraction of misclassified examples
        in the minibatch, i.e. the number of errors over the total number of
        examples in the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example
                  the correct label
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :type n_epochs: int
    :param n_epochs: number of epochs to run the optimizer

    :type mnist_pkl_gz: string
    :param mnist_pkl_gz: the path of the mnist training file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    #############
    # LOAD DATA #
    #############
    print '... loading data'

    # Load the dataset
    f = gzip.open(mnist_pkl_gz, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into GPU memory (when the code is run on a GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time one is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
        # When storing data on the GPU it has to be stored as floats,
        # therefore we store the labels as ``floatX`` as well (``shared_y``
        # does exactly that).  But during our computations we need them as
        # ints (we use the labels as indices, and indexing with floats does
        # not make sense), so instead of returning ``shared_y`` directly we
        # cast it to int.  This little hack lets us get around the issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    batch_size = 600    # size of the minibatch

    n_train_batches = train_set_x.value.shape[0] / batch_size
    n_valid_batches = valid_set_x.value.shape[0] / batch_size
    n_test_batches = test_set_x.value.shape[0] / batch_size

    ishape = (28, 28)  # this is the size of MNIST images
    n_in = 28*28       # number of input units
    n_out = 10         # number of output units

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    minibatch_offset = T.lscalar()  # offset to the start of a [mini]batch
    x = T.matrix()   # the data is presented as rasterized images
    y = T.ivector()  # the labels are presented as a 1D vector of [int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y).mean()

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: test_set_x[minibatch_offset:minibatch_offset+batch_size],
                y: test_set_y[minibatch_offset:minibatch_offset+batch_size]})

    validate_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: valid_set_x[minibatch_offset:minibatch_offset+batch_size],
                y: valid_set_y[minibatch_offset:minibatch_offset+batch_size]})

    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function([minibatch_offset], cost,
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset+batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset+batch_size]})

    # compile a theano function that returns the gradient of the minibatch
    # cost with respect to theta
    batch_grad = theano.function([minibatch_offset], T.grad(cost, classifier.theta),
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset+batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset+batch_size]})
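
    # Because W and b are built as views of the single shared vector theta,
    # T.grad(cost, classifier.theta) yields one flat gradient vector of
    # length n_in*n_out + n_out, which is exactly the layout that fmin_cg's
    # fprime argument expects below.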

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        train_losses = [batch_cost(i*batch_size) for i in xrange(n_train_batches)]
        return numpy.mean(train_losses)

    # creates a function that computes the average gradient of the cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = batch_grad(0)
        for i in xrange(1, n_train_batches):
            grad += batch_grad(i*batch_size)
        return grad/n_train_batches
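
    # Optional sanity check (sketch, not part of the original tutorial code):
    # since train_fn and train_fn_grad take and return plain numpy arrays,
    # scipy.optimize.check_grad can compare the analytic gradient against a
    # finite-difference estimate.  It is left commented out because the check
    # perturbs every one of the n_in*n_out + n_out parameters and is
    # therefore very slow on the full training set:
    #
    #   import scipy.optimize
    #   theta0 = numpy.zeros((n_in+1)*n_out, dtype=theano.config.floatX)
    #   err = scipy.optimize.check_grad(train_fn, train_fn_grad, theta0)
    #   print 'finite-difference gradient error:', err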

    # best validation score and the corresponding test score seen so far
    validation_scores = [float('inf'), 0]

    # callback invoked by fmin_cg after each iteration: it measures the
    # validation error for the current value of theta
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)]
        this_validation_loss = numpy.mean(validation_losses)
        print('validation error %f %%' % (this_validation_loss*100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_losses = [test_model(i*batch_size) for i in xrange(n_test_batches)]
            validation_scores[1] = numpy.mean(test_losses)

    ###############
    # TRAIN MODEL #
    ###############

    # using the scipy conjugate gradient optimizer
    import scipy.optimize
    print ("Optimizing using scipy.optimize.fmin_cg...")
    start_time = time.clock()
    best_w_b = scipy.optimize.fmin_cg(
            f=train_fn,
            x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
            fprime=train_fn_grad,
            callback=callback,
            disp=0,
            maxiter=n_epochs)
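    # fmin_cg returns the flat parameter vector that minimizes the training
    # cost; if the trained model is needed afterwards, it can be loaded back
    # into the classifier, e.g. (sketch): classifier.theta.value = best_w_b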
    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
          (validation_scores[0]*100., validation_scores[1]*100.))

    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))


if __name__ == '__main__':
    cg_optimization_mnist()