comparison code_tutoriel/logistic_cg.py @ 0:fda5f787baa6 (repository: ift6266)

commit initial

author: Dumitru Erhan <dumitru.erhan@gmail.com>
date:   Thu, 21 Jan 2010 11:26:43 -0500
1 """ | |
2 This tutorial introduces logistic regression using Theano and conjugate | |
3 gradient descent. | |
4 | |
5 Logistic regression is a probabilistic, linear classifier. It is parametrized | |
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is | |
7 done by projecting data points onto a set of hyperplanes, the distance to | |
8 which is used to determine a class membership probability. | |
9 | |
10 Mathematically, this can be written as: | |
11 | |
12 .. math:: | |
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\ | |
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} | |
15 | |
16 | |
17 The output of the model or prediction is then done by taking the argmax of | |
18 the vector whose i'th element is P(Y=i|x). | |
19 | |
20 .. math:: | |
21 | |
22 y_{pred} = argmax_i P(Y=i|x,W,b) | |
23 | |
24 | |
25 This tutorial presents a stochastic gradient descent optimization method | |
26 suitable for large datasets, and a conjugate gradient optimization method | |
27 that is suitable for smaller datasets. | |
28 | |
29 | |
30 References: | |
31 | |
32 - textbooks: "Pattern Recognition and Machine Learning" - | |
33 Christopher M. Bishop, section 4.3.2 | |
34 | |
35 | |
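As a concrete illustration of the prediction rule above, here is a minimal
NumPy sketch (added for exposition; it is not part of the original tutorial,
and the toy values of ``W``, ``b`` and ``x`` are arbitrary)::

    import numpy

    rng = numpy.random.RandomState(0)
    W = rng.randn(784, 10)   # one column of weights per class
    b = rng.randn(10)        # one bias per class
    x = rng.randn(784)       # a single rasterized 28*28 image

    a = numpy.dot(x, W) + b                          # scores W_i x + b_i
    p_y_given_x = numpy.exp(a) / numpy.exp(a).sum()  # softmax probabilities
    y_pred = p_y_given_x.argmax()                    # argmax_i P(Y=i|x,W,b)
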
36 """ | |
__docformat__ = 'restructuredtext en'


import numpy, cPickle, gzip

import time

import theano
import theano.tensor as T
import theano.tensor.nnet


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :param n_out: number of output units, the dimension of the space in
                      which the targets lie

        """

        # initialize theta = (W, b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out))
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]

        # compute the vector of class-membership probabilities in symbolic
        # form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        r"""Return the mean negative log-likelihood of the prediction of this
        model under a given target distribution:

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) =
                - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                  \log(P(Y=y^{(i)}|x^{(i)}, W, b))

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def cg_optimization_mnist(n_iter=50):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :param n_iter: number of iterations to run the optimizer

    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set
    # initialize the list of training minibatches with an empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i, ending at position i+batch_size;
        # a minibatch is a pair: the first element of the pair is a list
        # of datapoints, the second element is the list of corresponding
        # labels
        train_batches = train_batches + \
            [(train_set_x[i:i + batch_size], train_set_y[i:i + batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
            [(valid_set_x[i:i + batch_size], valid_set_y[i:i + batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
            [(test_set_x[i:i + batch_size], test_set_y[i:i + batch_size])]
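    # (added for exposition) each of the three *_batches lists now holds
    # (data, labels) pairs covering batch_size consecutive examples, e.g.
    # train_batches[0] == (train_set_x[0:20], train_set_y[0:20])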

    ishape = (28, 28)  # this is the size of MNIST images
    n_in = 28 * 28     # number of input units
    n_out = 10         # number of output units

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(
        input=x.reshape((batch_size, 28 * 28)), n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model, in symbolic format (negative_log_likelihood already returns
    # the mean over the minibatch, so no further averaging is needed)
    cost = classifier.negative_log_likelihood(y)

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))
    # compile a theano function that returns the gradient of the minibatch
    # cost with respect to theta
    batch_grad = theano.function([x, y], T.grad(cost, classifier.theta))
    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function([x, y], cost)

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        cost = 0.
        for x, y in train_batches:
            cost += batch_cost(x, y)
        return cost / len(train_batches)

    # creates a function that computes the average gradient of the cost
    # with respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = numpy.zeros(n_in * n_out + n_out)
        for x, y in train_batches:
            grad += batch_grad(x, y)
        return grad / len(train_batches)
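    # (added for exposition) because every minibatch here has the same size,
    # the average of the per-minibatch mean costs/gradients computed above
    # equals the mean cost/gradient over the whole training set; train_fn and
    # train_fn_grad are thus exactly the f and fprime callables that
    # scipy.optimize.fmin_cg expects: plain functions of a flat numpy
    # parameter vector returning a float and a numpy array, respectively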

    validation_scores = [float('inf'), 0]

    # creates the validation function
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        this_validation_loss = 0.
        for x, y in valid_batches:
            this_validation_loss += test_model(x, y)

        this_validation_loss /= len(valid_batches)

        print('validation error %f %%' % (this_validation_loss * 100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_score = 0.
            for x, y in test_batches:
                test_score += test_model(x, y)
            validation_scores[1] = test_score / len(test_batches)

    # train using the scipy conjugate gradient optimizer
    import scipy.optimize
    print("Optimizing using scipy.optimize.fmin_cg...")
    start_time = time.clock()
    best_w_b = scipy.optimize.fmin_cg(
        f=train_fn,
        x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
        fprime=train_fn_grad,
        callback=callback,
        disp=0,
        maxiter=n_iter)
    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
          (validation_scores[0] * 100., validation_scores[1] * 100.))

    print('The code ran for %f minutes' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    cg_optimization_mnist()