2 """
|
|
3 This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a
|
|
4 convolutional neural network, good for classifying images. This tutorial shows how to build the
|
|
5 architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST
|
|
6 results.
|
|
7
|
|
8 The best results are obtained after X iterations of the main program loop, which takes ***
|
|
9 minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an
|
|
10 NVIDIA GTX 285 graphics processor).
|
|
11
|
|
12 This implementation simplifies the model in the following ways:
|
|
13
|
|
14 - LeNetConvPool doesn't implement location-specific gain and bias parameters
|
|
15
|
|
16 - LeNetConvPool doesn't implement pooling by average, it implements pooling by max.
|
|
17
|
|
18 - Digit classification is implemented with a logistic regression rather than an RBF network
|
|
19
|
|
20 - LeNet5 was not fully-connected convolutions at second layer
|
|
21
|
|
22 References:
|
|
23
|
|
24 - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document
|
|
25 Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
|
|
26 http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
|
|
27
|
|
28
|
|
29 """
|
|
import numpy

import theano
from theano.compile.sandbox import shared, pfunc
from theano import tensor
from theano.tensor import nnet
# NOTE: the import paths of ConvOp and DownsampleFactorMax vary between Theano
# snapshots; the paths used here are an assumption.
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.signal.downsample import DownsampleFactorMax
import theano.sandbox.softsign

import pylearn.datasets.MNIST

try:
    # this tells theano to use the GPU if possible
    from theano.sandbox.cuda import use
    use()
except Exception, e:
    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))

class LeNetConvPool(object):
    """A convolution / max-pooling layer.

    The layer convolves its input images with a bank of learned filters (a 'valid'
    convolution), max-pools the result over non-overlapping `poolsize` windows, adds a
    per-filter bias and applies a tanh squashing function:

        output = tanh(maxpool(conv(input, w)) + b)

    Two shared variables are created: the 4D filter tensor `w` and the 1D bias vector
    `b`; the symbolic result is exposed as `output`.
    """

    #TODO: implement biases & scales properly. There are supposed to be more parameters:
    #    - one bias & scale per filter
    #    - one bias & scale per downsample feature location (a 2d bias)
    #    - more?

    def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters,
                 filter_shape=(5, 5), poolsize=(2, 2)):
        """
        Allocate a LeNetConvPool layer with shared variable internal parameters.

        :param rng: a random number generator used to initialize weights

        :param input: symbolic images. Shape: (n_examples, n_imgs, img_shape[0], img_shape[1])

        :param n_examples: input's shape[0] at runtime

        :param n_imgs: input's shape[1] at runtime

        :param img_shape: input's shape[2:4] at runtime

        :param n_filters: the number of filters to apply to the image

        :param filter_shape: the size of the filters to apply
        :type filter_shape: pair (rows, cols)

        :param poolsize: the downsampling (pooling) factor
        :type poolsize: pair (rows, cols)
        """

        #TODO: make a simpler convolution constructor!!
        #    - make dx and dy optional
        #    - why do we have to pass shapes? (Can we make them optional at least?)
        conv_op = ConvOp((n_imgs,) + img_shape, filter_shape, n_filters, n_examples,
                dx=1, dy=1, output_mode='valid')

        # - why is poolsize an op parameter here?
        # - can we just have a maxpool function that creates this Op internally?
        ds_op = DownsampleFactorMax(poolsize, ignore_border=True)

        # the filter tensor that we will apply is a 4D tensor
        w_shp = (n_filters, n_imgs) + filter_shape

        # the bias we add is a 1D tensor (one bias per filter)
        b_shp = (n_filters,)

        self.w = shared(
                numpy.asarray(
                    rng.uniform(
                        low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
                        high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
                        size=w_shp),
                    dtype=input.dtype))
        # a degenerate uniform draw over [0, 0]: the biases start at zero
        self.b = shared(
                numpy.asarray(
                    rng.uniform(low=-.0, high=0., size=b_shp),
                    dtype=input.dtype))

        self.input = input
        conv_out = conv_op(input, self.w)
        self.output = tensor.tanh(ds_op(conv_out) + self.b.dimshuffle('x', 0, 'x', 'x'))
        self.params = [self.w, self.b]

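
# Illustration only: a small helper (not used elsewhere in this file) that spells out
# the shape arithmetic LeNetConvPool performs.  A 'valid' convolution shrinks each
# image dimension by (filter_size - 1), and max-pooling with ignore_border=True then
# floor-divides it by the pool size.
def _conv_pool_output_shape(img_shape, filter_shape=(5, 5), poolsize=(2, 2)):
    """Return the (rows, cols) shape of one LeNetConvPool output feature map."""
    return tuple((i - f + 1) // p
            for i, f, p in zip(img_shape, filter_shape, poolsize))

# For example, _conv_pool_output_shape((28, 28)) == (12, 12), which is the img_shape
# given to the second LeNetConvPool layer in evaluate_lenet5 below, and
# _conv_pool_output_shape((12, 12)) == (4, 4), so the flattened input of the
# fully-connected layer has 16 * 4 * 4 = 256 units.
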
class SigmoidalLayer(object):
    def __init__(self, rng, input, n_in, n_out):
        """
        A fully-connected layer with a tanh squashing function:
        output = tanh(dot(input, w) + b)

        :param rng: a random number generator used to initialize weights
        :param input: a symbolic tensor of shape (n_examples, n_in)
        :param n_in: the number of input units
        :param n_out: the number of output units
        """
        self.input = input
        # weights are drawn uniformly from +/- 2/sqrt(n_in), a common heuristic for
        # keeping tanh units away from saturation at the start of training
        self.w = shared(
                numpy.asarray(
                    rng.uniform(low=-2 / numpy.sqrt(n_in), high=2 / numpy.sqrt(n_in),
                        size=(n_in, n_out)), dtype=input.dtype))
        self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype))
        self.output = tensor.tanh(tensor.dot(input, self.w) + self.b)
        self.params = [self.w, self.b]

class LogisticRegression(object):
    """Multi-class logistic regression: a softmax over an affine transform of the input.

    Creates shared variables for the weight matrix `w` and bias vector `b`, and exposes
    the class probabilities as `output` and the most probable class as `argmax`.
    """

    def __init__(self, input, n_in, n_out):
        self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype))
        self.b = shared(numpy.zeros((n_out,), dtype=input.dtype))
        # L1 and squared-L2 norms of the weights, available as optional regularizers
        self.l1 = abs(self.w).sum()
        self.l2_sqr = (self.w ** 2).sum()
        self.output = nnet.softmax(theano.dot(input, self.w) + self.b)
        self.argmax = theano.tensor.argmax(self.output, axis=1)
        self.params = [self.w, self.b]

    def nll(self, target):
        """Return the negative log-likelihood of this model's predictions under a given
        target distribution, one value per example.

        `target` may be a matrix of per-class probabilities, or a vector of integer
        class labels, in which case it is interpreted as a 1-hot target distribution.
        """
        return nnet.categorical_crossentropy(self.output, target)

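    # For integer targets t, nnet.categorical_crossentropy(output, t)[i] is
    # -log(output[i, t[i]]), i.e. the negative log-probability the model assigns to the
    # correct class of example i.
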
    def errors(self, target):
        """Return a vector of 0s and 1s, with a 1 for every example that was
        mis-classified.
        """
        if target.ndim != self.argmax.ndim:
            raise TypeError('target should have the same number of dimensions as self.argmax',
                    ('target', target.type, 'argmax', self.argmax.type))
        if target.dtype.startswith('int'):
            return theano.tensor.neq(self.argmax, target)
        else:
            raise NotImplementedError()

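# Note: LogisticRegression.errors() returns one 0/1 entry per example, so averaging
# test_model's output over a set of minibatches (as evaluate_lenet5 does below) gives
# the misclassification rate on that set.
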
def evaluate_lenet5(batch_size=30, n_iter=1000):
    rng = numpy.random.RandomState(23455)

    mnist = pylearn.datasets.MNIST.train_valid_test()

    ishape = (28, 28)  # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = tensor.fmatrix()  # the data is presented as rasterized images
    y = tensor.lvector()  # the labels are presented as a 1D vector of [long int] labels

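    # Shapes as the data flows through the model (derived from the layer parameters
    # chosen below; the leading dimension is always batch_size):
    #   input images:   (batch_size,  1, 28, 28)
    #   layer0 output:  (batch_size,  6, 12, 12)   # 5x5 'valid' conv, then 2x2 max-pool
    #   layer1 output:  (batch_size, 16,  4,  4)
    #   layer2 input:   (batch_size, 256)          # 16 * 4 * 4, flattened
    #   layer2 output:  (batch_size, 128)
    #   layer3 output:  (batch_size, 10)           # one probability per digit class
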
    # construct the first convolutional pooling layer
    layer0 = LeNetConvPool(rng, input=x.reshape((batch_size, 1, 28, 28)),
            n_examples=batch_size,
            n_imgs=1, img_shape=ishape,
            n_filters=6, filter_shape=(5, 5),
            poolsize=(2, 2))

    # construct the second convolutional pooling layer; layer0 outputs 6 feature maps
    # of 12x12 (28 -> 24 after the 5x5 'valid' convolution, then 12 after 2x2 pooling)
    layer1 = LeNetConvPool(rng, input=layer0.output, n_examples=batch_size,
            n_imgs=6, img_shape=(12, 12),
            n_filters=16, filter_shape=(5, 5),
            poolsize=(2, 2))

    # construct a fully-connected sigmoidal layer; layer1 outputs 16 feature maps of
    # 4x4, so the flattened input has 16 * 4 * 4 = 256 units
    layer2 = SigmoidalLayer(rng, input=layer1.output.flatten(2), n_in=16 * 4 * 4, n_out=128)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=128, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.nll(y).mean()

    # create a function to compute the mistakes that are made by the model
    test_model = pfunc([x, y], layer3.errors(y))

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    learning_rate = numpy.asarray(0.01, dtype='float32')

    # train_model is a function that updates the model parameters by SGD
    train_model = pfunc([x, y], cost,
            updates=[(p, p - learning_rate * gp)
                for p, gp in zip(params, tensor.grad(cost, params))])

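    # Each call to train_model therefore performs one step of stochastic gradient
    # descent on the minibatch it receives: p <- p - learning_rate * d(cost)/d(p)
    # for every shared variable p in params, and returns the minibatch cost.
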
    # Open question: would using a generic minimizer be simpler than the direct SGD
    # loop below?

    best_valid_score = float('inf')
    for i in xrange(n_iter):
        for j in xrange(len(mnist.train.x) / batch_size):
            cost_ij = train_model(
                    mnist.train.x[j * batch_size:(j + 1) * batch_size],
                    mnist.train.y[j * batch_size:(j + 1) * batch_size])
            #if 0 == j % 100:
            #    print('epoch %i:%i, training cost %f' % (i, j * batch_size, cost_ij))
        valid_score = numpy.mean([test_model(
                mnist.valid.x[j * batch_size:(j + 1) * batch_size],
                mnist.valid.y[j * batch_size:(j + 1) * batch_size])
                for j in xrange(len(mnist.valid.x) / batch_size)])
        print('epoch %i, validation error %f' % (i, valid_score))
        if valid_score < best_valid_score:
            best_valid_score = valid_score
            test_score = numpy.mean([test_model(
                    mnist.test.x[j * batch_size:(j + 1) * batch_size],
                    mnist.test.y[j * batch_size:(j + 1) * batch_size])
                    for j in xrange(len(mnist.test.x) / batch_size)])
            print('epoch %i, test error of best model %f' % (i, test_score))

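# For a quick smoke test one could call evaluate_lenet5 with a small number of
# iterations, e.g. evaluate_lenet5(batch_size=30, n_iter=2); these values are only an
# illustration, not the settings used for the reported results.
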
if __name__ == '__main__':
    evaluate_lenet5()