# HG changeset patch
# User Dumitru Erhan
# Date 1264390469 18000
# Node ID bcc87d3e33a342a42a0469e6ad95002663b135f7
# Parent  0fda55a7de99c9b08ce0f36acd56b9f00f2d857c
adding latest tutorial code

diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/logistic_cg.py
--- a/code_tutoriel/logistic_cg.py	Sun Jan 24 22:33:33 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
-"""
-This tutorial introduces logistic regression using Theano and conjugate
-gradient descent.
-
-Logistic regression is a probabilistic, linear classifier. It is parametrized
-by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
-done by projecting data points onto a set of hyperplanes, the distance to
-which is used to determine a class membership probability.
-
-Mathematically, this can be written as:
-
-.. math::
-    P(Y=i|x, W,b) &= softmax_i(W x + b) \\
-                  &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
-
-
-The model's prediction is then the class with maximal probability, i.e. the
-argmax of the vector whose i-th element is P(Y=i|x).
-
-.. math::
-
-    y_{pred} = argmax_i P(Y=i|x,W,b)
-
-
-This tutorial presents a stochastic gradient descent optimization method
-suitable for large datasets, and a conjugate gradient optimization method
-that is suitable for smaller datasets.
-
-
-References:
-
-    - textbooks: "Pattern Recognition and Machine Learning",
-      Christopher M. Bishop, section 4.3.2
-
-"""
-__docformat__ = 'restructedtext en'
-
-
-import numpy, cPickle, gzip
-
-import time
-
-import theano
-import theano.tensor as T
-import theano.tensor.nnet
-
-
-class LogisticRegression(object):
-    """Multi-class Logistic Regression Class
-
-    The logistic regression is fully described by a weight matrix :math:`W`
-    and bias vector :math:`b`. Classification is done by projecting data
-    points onto a set of hyperplanes, the distance to which is used to
-    determine a class membership probability.
-    """
-
-    def __init__(self, input, n_in, n_out):
-        """ Initialize the parameters of the logistic regression
-
-        :param input: symbolic variable that describes the input of the
-                      architecture (one minibatch)
-
-        :param n_in: number of input units, the dimension of the space in
-                     which the datapoints lie
-
-        :param n_out: number of output units, the dimension of the space in
-                      which the targets lie
-
-        """
-
-        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
-        # while b is a vector of n_out elements, making theta a vector of
-        # n_in*n_out + n_out elements
-        self.theta = theano.shared(value=numpy.zeros(n_in*n_out + n_out))
-        # W is represented by the first n_in*n_out elements of theta
-        self.W = self.theta[0:n_in*n_out].reshape((n_in, n_out))
-        # b is the rest (last n_out elements)
-        self.b = self.theta[n_in*n_out:n_in*n_out + n_out]
-
-        # compute the vector of class-membership probabilities in symbolic form
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
-
-        # compute the prediction as the class whose probability is maximal,
-        # in symbolic form
-        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
-
-    def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
-
-        .. math::
-
-            \ell (\theta=\{W,b\}, \mathcal{D}) =
-                - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
-                  \log P(Y=y^{(i)}|x^{(i)}, W, b)
-
-        :param y: corresponds to a vector that gives, for each example,
-                  the correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch
-        """
-
-        # check if y has the same dimension as y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                            ('y', y.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-def cg_optimization_mnist(n_iter=50):
-    """Demonstrate conjugate gradient optimization of a log-linear model
-
-    This is demonstrated on MNIST.
-
-    :param n_iter: number of iterations to run the optimizer
-
-    """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz', 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20
-    batch_size = 20    # size of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with an empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i and ending at position i+batch_size; a minibatch is
-        # a pair whose first element is a list of datapoints and whose
-        # second element is the list of corresponding labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-    ishape = (28, 28)    # this is the size of MNIST images
-    n_in = 28*28         # number of input units
-    n_out = 10           # number of output units
-
-    # allocate symbolic variables for the data
-    x = T.fmatrix()    # the data is presented as rasterized images
-    y = T.lvector()    # the labels are presented as a 1D vector of
-                       # [long int] labels
-
-    # construct the logistic regression class
-    classifier = LogisticRegression( \
-        input=x.reshape((batch_size, 28*28)), n_in=28*28, n_out=10)
-
-    # the cost we minimize during training is the negative log likelihood of
-    # the model, in symbolic form
-    cost = classifier.negative_log_likelihood(y).mean()
-
-    # compile a theano function that computes the mistakes that are made by
-    # the model on a minibatch
-    test_model = theano.function([x, y], classifier.errors(y))
-    # compile a theano function that returns the gradient of the minibatch
-    # cost with respect to theta
-    batch_grad = theano.function([x, y], T.grad(cost, classifier.theta))
-    # compile a theano function that returns the cost of a minibatch
-    batch_cost = theano.function([x, y], cost)
-
-    # creates a function that computes the average cost on the training set
-    def train_fn(theta_value):
-        classifier.theta.value = theta_value
-        cost = 0.
-        for x, y in train_batches:
-            cost += batch_cost(x, y)
-        return cost / len(train_batches)
-
-    # creates a function that computes the average gradient of the cost
-    # with respect to theta
-    def train_fn_grad(theta_value):
-        classifier.theta.value = theta_value
-        grad = numpy.zeros(n_in * n_out + n_out)
-        for x, y in train_batches:
-            grad += batch_grad(x, y)
-        return grad / len(train_batches)
-
-    validation_scores = [float('inf'), 0]
-
-    # creates the validation function
-    def callback(theta_value):
-        classifier.theta.value = theta_value
-        # compute the validation loss
-        this_validation_loss = 0.
-        for x, y in valid_batches:
-            this_validation_loss += test_model(x, y)
-
-        this_validation_loss /= len(valid_batches)
-
-        print('validation error %f %%' % (this_validation_loss*100.,))
-
-        # check if it is better than the best validation score so far
-        if this_validation_loss < validation_scores[0]:
-            # if so, replace the old one and compute the score on the
-            # testing dataset
-            validation_scores[0] = this_validation_loss
-            test_score = 0.
-            for x, y in test_batches:
-                test_score += test_model(x, y)
-            validation_scores[1] = test_score / len(test_batches)
-
-    # using the scipy conjugate gradient optimizer
-    import scipy.optimize
-    print("Optimizing using scipy.optimize.fmin_cg...")
-    start_time = time.clock()
-    best_w_b = scipy.optimize.fmin_cg(
-        f=train_fn,
-        x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
-        fprime=train_fn_grad,
-        callback=callback,
-        disp=0,
-        maxiter=n_iter)
-    end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%, with '
-           'test performance %f %%') %
-          (validation_scores[0]*100., validation_scores[1]*100.))
-
-    print('The code ran for %f minutes' % ((end_time-start_time)/60.))
-
-
-if __name__ == '__main__':
-    cg_optimization_mnist()
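The hunk above removes the conjugate-gradient tutorial in this revision. For readers following the patch, the sketch below shows the same training pattern in plain numpy/scipy, without Theano: all parameters live in one flat theta vector, and scipy.optimize.fmin_cg is given a cost function and a gradient function over that vector. This is an illustration only, not code from the repository; the synthetic dataset and helper names are assumptions.

# Minimal sketch (not from the changeset): softmax regression trained with
# scipy.optimize.fmin_cg, mirroring the flattened-theta layout used above.
# The synthetic data and all names below are illustrative only.
import numpy
import scipy.optimize

rng = numpy.random.RandomState(0)
n_in, n_out, n_examples = 20, 3, 300
X = rng.randn(n_examples, n_in)
y = rng.randint(0, n_out, size=n_examples)

def unpack(theta):
    # first n_in*n_out entries are W, the remaining n_out entries are b
    W = theta[:n_in * n_out].reshape((n_in, n_out))
    b = theta[n_in * n_out:]
    return W, b

def neg_log_likelihood(theta):
    W, b = unpack(theta)
    scores = X.dot(W) + b
    scores -= scores.max(axis=1, keepdims=True)   # for numerical stability
    log_p = scores - numpy.log(numpy.exp(scores).sum(axis=1, keepdims=True))
    return -log_p[numpy.arange(n_examples), y].mean()

def gradient(theta):
    W, b = unpack(theta)
    scores = X.dot(W) + b
    scores -= scores.max(axis=1, keepdims=True)
    p = numpy.exp(scores)
    p /= p.sum(axis=1, keepdims=True)
    p[numpy.arange(n_examples), y] -= 1.0          # d(cost)/d(scores)
    p /= n_examples
    # gradient w.r.t. W (flattened), then gradient w.r.t. b
    return numpy.concatenate([X.T.dot(p).ravel(), p.sum(axis=0)])

theta0 = numpy.zeros(n_in * n_out + n_out)
best_theta = scipy.optimize.fmin_cg(f=neg_log_likelihood, x0=theta0,
                                    fprime=gradient, maxiter=50, disp=0)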
diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/logistic_sgd.py
--- a/code_tutoriel/logistic_sgd.py	Sun Jan 24 22:33:33 2010 -0500
+++ b/code_tutoriel/logistic_sgd.py	Sun Jan 24 22:34:29 2010 -0500
@@ -147,7 +147,7 @@
     :param learning_rate: learning rate used (factor for the stochastic
     gradient)
 
-    :param n_iter: number of iterations to run the optimizer
+    :param n_iter: maximal number of iterations to run the optimizer
 
     """
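The logistic_sgd.py hunk only rewords the n_iter docstring, but the wording matters: with early stopping, n_iter is an upper bound, and training normally halts once the validation error stops improving. The self-contained sketch below illustrates the patience-based early-stopping loop that the tutorials use, with n_iter acting only as a maximum. It is not code from the repository; the "validation loss" is simulated with a toy curve so the snippet runs on its own.

# Minimal, self-contained sketch (not from the changeset) of patience-based
# early stopping, where n_iter is only a maximal number of passes.
import numpy

n_iter = 50                       # maximal number of passes over the data
n_train_batches = 100
patience = 500                    # examine at least this many minibatches
patience_increase = 2             # wait this much longer after an improvement
improvement_threshold = 0.995     # relative improvement considered significant
validation_frequency = n_train_batches

rng = numpy.random.RandomState(0)
best_validation_loss = float('inf')
best_iter = 0

for iter in range(n_iter * n_train_batches):
    # a real loop would train on minibatch `iter % n_train_batches` here
    if (iter + 1) % validation_frequency == 0:
        # toy validation loss: decays, then flattens out with noise
        this_validation_loss = 0.5 * numpy.exp(-iter / 2000.) + 0.05 * rng.rand()
        if this_validation_loss < best_validation_loss:
            if this_validation_loss < best_validation_loss * improvement_threshold:
                patience = max(patience, iter * patience_increase)
            best_validation_loss = this_validation_loss
            best_iter = iter
    if patience <= iter:
        break                     # stop well before n_iter passes if stalled

print('stopped at minibatch %i, best validation loss %f (iteration %i)' %
      (iter, best_validation_loss, best_iter))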
diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/mlp.py
--- a/code_tutoriel/mlp.py	Sun Jan 24 22:33:33 2010 -0500
+++ b/code_tutoriel/mlp.py	Sun Jan 24 22:34:29 2010 -0500
@@ -71,18 +71,20 @@
         # other tutorials
 
         # `W1` is initialized with `W1_values` which is uniformly sampled
-        # from -1/sqrt(n_in) and 1/sqrt(n_in)
+        # from -sqrt(6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
         W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), high = numpy.sqrt(6./(n_in+n_hidden)), \
+              low = -numpy.sqrt(6./(n_in+n_hidden)), \
+              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
         # `W2` is initialized with `W2_values` which is uniformly sampled
-        # from -1/sqrt(n_hidden) and 1/sqrt(n_hidden)
+        # from -sqrt(6./(n_hidden+n_out)) and sqrt(6./(n_hidden+n_out))
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
         W2_values = numpy.asarray( numpy.random.uniform(
-              low = numpy.sqrt(6./(n_hidden+n_out)), high= numpy.sqrt(6./(n_hidden+n_out)),\
+              low = -numpy.sqrt(6./(n_hidden+n_out)), \
+              high = numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)
 
         self.W1 = theano.shared( value = W1_values )
@@ -161,14 +163,15 @@
     :param learning_rate: learning rate used (factor for the stochastic
     gradient)
 
-    :param n_iter: number of iterations to run the optimizer
-
     :param L1_reg: L1-norm's weight when added to the cost (see regularization)
 
     :param L2_reg: L2-norm's weight when added to the cost (see regularization)
-    """
+
+    :param n_iter: maximal number of iterations to run the optimizer
+
+    """
@@ -264,6 +267,7 @@
 
     best_params = None
     best_validation_loss = float('inf')
+    best_iter = 0
     test_score = 0.
     start_time = time.clock()
     # have a maximum of `n_iter` iterations through the entire dataset
@@ -300,9 +304,11 @@
                            improvement_threshold :
                         patience = max(patience, iter * patience_increase)
 
+                    # save best validation score and iteration number
                     best_validation_loss = this_validation_loss
+                    best_iter = iter
+
                     # test it on the test set
-
                     test_score = 0.
                     for x,y in test_batches:
                         test_score += test_model(x,y)
@@ -313,19 +319,15 @@
                            test_score*100.))
 
             if patience <= iter :
-                    break
+                break
 
     end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%,'
-           'with test performance %f %%') %
-          (best_validation_loss * 100., test_score*100.))
+    print(('Optimization complete. Best validation score of %f %% '
+           'obtained at iteration %i, with test performance %f %%') %
+          (best_validation_loss * 100., best_iter, test_score*100.))
 
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
-
-
-
-
 
 
 if __name__ == '__main__':
     sgd_optimization_mnist()
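The first mlp.py hunk documents the weight initialization used by the tutorial: each weight matrix is sampled uniformly from the range +/- sqrt(6/(fan_in + fan_out)). The short numpy sketch below illustrates that rule on its own; it is not part of the changeset, the layer sizes are illustrative, and 'float32' stands in for theano.config.floatX.

# Minimal sketch (not from the changeset) of the uniform weight initialization
# documented in the mlp.py hunk above.
import numpy

def init_weights(fan_in, fan_out, rng):
    bound = numpy.sqrt(6. / (fan_in + fan_out))
    # note the negative lower bound: sampling from [+bound, +bound] would
    # yield only non-negative weights
    return numpy.asarray(rng.uniform(low=-bound, high=bound,
                                     size=(fan_in, fan_out)),
                         dtype='float32')

rng = numpy.random.RandomState(1234)
W1 = init_weights(28 * 28, 500, rng)   # input -> hidden, MNIST-sized input
W2 = init_weights(500, 10, rng)        # hidden -> 10 output classes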