Mercurial > ift6266

--- a/baseline/conv_mlp/convolutional_mlp.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/baseline/conv_mlp/convolutional_mlp.py	Thu Mar 04 20:43:21 2010 -0500
@@ -26,7 +26,8 @@
 import theano.sandbox.softsign
 import pylearn.datasets.MNIST
 from pylearn.io import filetensor as ft
-from theano.sandbox import conv, downsample
+from theano.tensor.signal import downsample
+from theano.tensor.nnet import conv

 class LeNetConvPoolLayer(object):
--- a/baseline/log_reg/log_reg.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/baseline/log_reg/log_reg.py	Thu Mar 04 20:43:21 2010 -0500
@@ -35,11 +35,11 @@
 """
 __docformat__ = 'restructedtext en'

-import numpy, time, cPickle, gzip
+import numpy, time

 import theano
 import theano.tensor as T
-
+from ift6266 import datasets

 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -112,6 +112,8 @@
         # i.e., the mean log-likelihood across the minibatch.
         return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )

+    def MSE(self, y):  # y: integer label vector; returns the negated mean of |p_y_given_x[i, y[i]] - y[i]|**2 over the minibatch
+        return -T.mean(abs((self.p_y_given_x)[T.arange(y.shape[0]), y]-y)**2)  # fixed attribute typo: was self.p_t_given_x, which is never defined

     def errors( self, y ):
         """Return a float representing the number of errors in the minibatch
@@ -135,107 +137,12 @@
         else:
             raise NotImplementedError()

-def shared_dataset( data_xy ):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
-        shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are
-        # floats it doesn't make sense) therefore instead of returning
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets ous get around this issue
-        return shared_x, T.cast( shared_y, 'int32' )
-
-def load_data_pkl_gz( dataset ):
-    ''' Loads the dataset
-
-    :type dataset: string
-    :param dataset: the path to the dataset (here MNIST)
-    '''
-
-    #--------------------------------------------------------------------------------------------------------------------
-    # Load Data
-    #--------------------------------------------------------------------------------------------------------------------
-
-
-    print '... loading data'
-
-    # Load the dataset
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    test_set_x,  test_set_y  = shared_dataset( test_set )
-    valid_set_x, valid_set_y = shared_dataset( valid_set )
-    train_set_x, train_set_y = shared_dataset( train_set )
-
-    rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ]
-    return rval
-
-##def load_data_ft(      verbose = False,\
-##                                    data_path = '/data/lisa/data/nist/by_class/'\
-##                                    train_data = 'all/all_train_data.ft',\
-##                                    train_labels = 'all/all_train_labels.ft',\
-##                                    test_data = 'all/all_test_data.ft',\
-##                                    test_labels = 'all/all_test_labels.ft'):
-##
-##    train_data_file = open(data_path + train_data)
-##    train_labels_file = open(data_path + train_labels)
-##    test_labels_file = open(data_path + test_data)
-##    test_data_file = open(data_path + test_labels)
-##
-##    raw_train_data = ft.read( train_data_file)
-##    raw_train_labels = ft.read(train_labels_file)
-##    raw_test_data = ft.read( test_labels_file)
-##    raw_test_labels = ft.read( test_data_file)
-##
-##    f.close()
-##    g.close()
-##    i.close()
-##    h.close()
-##
-##
-##    test_set_x,  test_set_y  = shared_dataset(test_set)
-##    valid_set_x, valid_set_y = shared_dataset(valid_set)
-##    train_set_x, train_set_y = shared_dataset(train_set)
-##
-##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
-##    return rval
-##    #create a validation set the same size as the test size
-##    #use the end of the training array for this purpose
-##    #discard the last remaining so we get a %batch_size number
-##    test_size=len(raw_test_labels)
-##    test_size = int(test_size/batch_size)
-##    test_size*=batch_size
-##    train_size = len(raw_train_data)
-##    train_size = int(train_size/batch_size)
-##    train_size*=batch_size
-##    validation_size =test_size
-##    offset = train_size-test_size
-##    if verbose == True:
-##        print 'train size = %d' %train_size
-##        print 'test size = %d' %test_size
-##        print 'valid size = %d' %validation_size
-##        print 'offset = %d' %offset
-##
-##
-
 #--------------------------------------------------------------------------------------------------------------------
 # MAIN
 #--------------------------------------------------------------------------------------------------------------------

 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                    dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10,  \
+                    dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10,  \
                     patience = 5000, patience_increase = 2, improvement_threshold = 0.995):

     """
@@ -254,9 +161,8 @@
     :type batch_size: int
     :param batch_size:  size of the minibatch

-    :type dataset_name: string
-    :param dataset: the path of the MNIST dataset file from
-                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+    :type dataset: dataset
+    :param dataset: a dataset instance from ift6266.datasets

     :type image_size: int
     :param image_size: size of the input image in pixels (width * height)
@@ -275,17 +181,6 @@


     """
-    datasets = load_data_pkl_gz( dataset_name )
-
-    train_set_x, train_set_y = datasets[0]
-    valid_set_x, valid_set_y = datasets[1]
-    test_set_x , test_set_y   = datasets[2]
-
-    # compute number of minibatches for training, validation and testing
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches  = test_set_x.value.shape[0]  / batch_size
-
     #--------------------------------------------------------------------------------------------------------------------
     # Build actual model
     #--------------------------------------------------------------------------------------------------------------------
@@ -308,17 +203,11 @@

     # compiling a Theano function that computes the mistakes that are made by
     # the model on a minibatch
-    test_model = theano.function( inputs = [ index ],
-            outputs = classifier.errors( y ),
-            givens = {
-                x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    test_model = theano.function( inputs = [ x, y ],
+            outputs = classifier.errors( y ))

-    validate_model = theano.function( inputs = [ index ],
-            outputs = classifier.errors( y ),
-            givens = {
-                x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    validate_model = theano.function( inputs = [ x, y ],
+            outputs = classifier.errors( y ))

     # compute the gradient of cost with respect to theta = ( W, b )
     g_W = T.grad( cost = cost, wrt = classifier.W )
@@ -331,12 +220,9 @@
     # compiling a Theano function `train_model` that returns the cost, but in
     # the same time updates the parameter of the model based on the rules
     # defined in `updates`
-    train_model = theano.function( inputs = [ index ],
+    train_model = theano.function( inputs = [ x, y ],
             outputs = cost,
-            updates = updates,
-            givens = {
-                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+            updates = updates)

     #--------------------------------------------------------------------------------------------------------------------
     # Train model
@@ -349,38 +235,38 @@
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency  = min( n_train_batches, patience * 0.5 )
+    validation_frequency  = patience * 0.5
                                   # go through this many
                                   # minibatche before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch

-    best_params             = None
+    best_params          = None
     best_validation_loss = float('inf')
-    test_score                 = 0.
-    start_time                  = time.clock()
+    test_score           = 0.
+    start_time           = time.clock()

     done_looping = False
-    n_epochs       = nb_max_examples / train_set_x.value.shape[0]
-    epoch             = 0
+    n_iters      = nb_max_examples / batch_size
+    epoch        = 0
+    iter        = 0

-    while ( epoch < n_epochs ) and ( not done_looping ):
+    while ( iter < n_iters ) and ( not done_looping ):

       epoch = epoch + 1
-      for minibatch_index in xrange( n_train_batches ):
+      for x, y in dataset.train(batch_size):

-        minibatch_avg_cost = train_model( minibatch_index )
+        minibatch_avg_cost = train_model( x, y )
         # iteration number
-        iter = epoch * n_train_batches + minibatch_index
+        iter += 1

-        if ( iter + 1 ) % validation_frequency == 0:
+        if iter % validation_frequency == 0:
             # compute zero-one loss on validation set
-            validation_losses     = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
+            validation_losses     = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ]
             this_validation_loss = numpy.mean( validation_losses )

-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                 ( epoch, minibatch_index + 1,n_train_batches, \
-                  this_validation_loss*100. ) )
+            print('epoch %i, iter %i, validation error %f %%' % \
+                 ( epoch, iter, this_validation_loss*100. ) )


             # if we got the best validation score until now
@@ -393,12 +279,12 @@
                 best_validation_loss = this_validation_loss
                 # test it on the test set

-                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                 test_score  = numpy.mean(test_losses)

-                print(('     epoch %i, minibatch %i/%i, test error of best '
+                print(('     epoch %i, iter %i, test error of best '
                        'model %f %%') % \
-                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+                  (epoch, iter, test_score*100.))

         if patience <= iter :
                 done_looping = True
--- a/deep/convolutional_dae/stacked_convolutional_dae.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/convolutional_dae/stacked_convolutional_dae.py	Thu Mar 04 20:43:21 2010 -0500
@@ -7,44 +7,10 @@

 from theano.tensor.signal import downsample
 from theano.tensor.nnet import conv
-import gzip
-import cPickle
-
-
-class LogisticRegression(object):
-
-    def __init__(self, input, n_in, n_out):
-
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                            dtype = theano.config.floatX) )
-
-        self.b = theano.shared( value=numpy.zeros((n_out,),
-                                            dtype = theano.config.floatX) )
-
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
-

-        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
-
-        self.params = [self.W, self.b]
-
-    def negative_log_likelihood(self, y):
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-    def MSE(self, y):
-        return -T.mean(abs((self.p_y_given_x)[T.arange(y.shape[0]),y]-y)**2)
+from ift6266 import datasets

-    def errors(self, y):
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                ('y', target.type, 'y_pred', self.y_pred.type))
-
-
-        if y.dtype.startswith('int'):
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
+from ift6266.baseline.log_reg.log_reg import LogisticRegression

 class SigmoidalLayer(object):
     def __init__(self, rng, input, n_in, n_out):
@@ -65,8 +31,9 @@

 class dA_conv(object):

-  def __init__(self, corruption_level = 0.1, input = None, shared_W = None,\
-                   shared_b = None, filter_shape = None, image_shape = None, poolsize = (2,2)):
+  def __init__(self, input, filter_shape, corruption_level = 0.1,
+               shared_W = None, shared_b = None, image_shape = None,
+               poolsize = (2,2)):

     theano_rng = RandomStreams()

@@ -80,13 +47,11 @@
         self.W = shared_W
         self.b = shared_b
     else:
-        initial_W = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(fan_in+fan_out)), \
-              high = numpy.sqrt(6./(fan_in+fan_out)), \
+        initial_W = numpy.asarray( numpy.random.uniform(
+              low = -numpy.sqrt(6./(fan_in+fan_out)),
+              high = numpy.sqrt(6./(fan_in+fan_out)),
               size = filter_shape), dtype = theano.config.floatX)
-        initial_b = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX)
-
-
+        initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
         self.W = theano.shared(value = initial_W, name = "W")
         self.b = theano.shared(value = initial_b, name = "b")

@@ -101,9 +66,8 @@

     self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x

-    conv1_out = conv.conv2d(self.tilde_x, self.W, \
-                             filter_shape=filter_shape, \
-                                image_shape=image_shape, border_mode='valid')
+    conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape,
+                            image_shape=image_shape, border_mode='valid')


     self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x'))
@@ -111,19 +75,15 @@

     da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\
                        filter_shape[3] ]
-    da_image_shape = [ image_shape[0],filter_shape[0],image_shape[2]-filter_shape[2]+1, \
-                         image_shape[3]-filter_shape[3]+1 ]
     initial_W_prime =  numpy.asarray( numpy.random.uniform( \
               low = -numpy.sqrt(6./(fan_in+fan_out)), \
               high = numpy.sqrt(6./(fan_in+fan_out)), \
               size = da_filter_shape), dtype = theano.config.floatX)
     self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime")

-    #import pdb;pdb.set_trace()
-
-    conv2_out = conv.conv2d(self.y, self.W_prime, \
-                               filter_shape = da_filter_shape, image_shape = da_image_shape ,\
-                                border_mode='full')
+    conv2_out = conv.conv2d(self.y, self.W_prime,
+                            filter_shape = da_filter_shape,
+                            border_mode='full')

     self.z =  (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale

@@ -134,19 +94,16 @@
     self.cost = T.mean(self.L)

     self.params = [ self.W, self.b, self.b_prime ]
-
-

 class LeNetConvPoolLayer(object):
-    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)):
-        assert image_shape[1]==filter_shape[1]
+    def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)):
         self.input = input

         W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX)
-        self.W = theano.shared(value = W_values)
+        self.W = theano.shared(value=W_values)

-        b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX)
-        self.b = theano.shared(value= b_values)
+        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
+        self.b = theano.shared(value=b_values)

         conv_out = conv.conv2d(input, self.W,
                 filter_shape=filter_shape, image_shape=image_shape)
@@ -168,67 +125,60 @@


 class SdA():
-    def __init__(self, input, n_ins_conv, n_ins_mlp, train_set_x, train_set_y, batch_size, \
-                     conv_hidden_layers_sizes, mlp_hidden_layers_sizes, corruption_levels, \
-                     rng, n_out, pretrain_lr, finetune_lr):
-
+    def __init__(self, input, n_ins_mlp, conv_hidden_layers_sizes,
+                 mlp_hidden_layers_sizes, corruption_levels, rng, n_out,
+                 pretrain_lr, finetune_lr):
+
         self.layers = []
         self.pretrain_functions = []
         self.params = []
         self.conv_n_layers = len(conv_hidden_layers_sizes)
         self.mlp_n_layers = len(mlp_hidden_layers_sizes)
-
-        index = T.lscalar() # index to a [mini]batch
+
         self.x = T.dmatrix('x') # the data is presented as rasterized images
         self.y = T.ivector('y') # the labels are presented as 1D vector of

-
-
         for i in xrange( self.conv_n_layers ):
-
             filter_shape=conv_hidden_layers_sizes[i][0]
             image_shape=conv_hidden_layers_sizes[i][1]
             max_poolsize=conv_hidden_layers_sizes[i][2]

             if i == 0 :
-                layer_input=self.x.reshape((batch_size,1,28,28))
+                layer_input=self.x.reshape((self.x.shape[0], 1, 32, 32))
             else:
                 layer_input=self.layers[-1].output
-
-            layer = LeNetConvPoolLayer(rng, input=layer_input, \
-                                image_shape=image_shape, \
-                                filter_shape=filter_shape,poolsize=max_poolsize)
-            print 'Convolutional layer '+str(i+1)+' created'
-
+
+            layer = LeNetConvPoolLayer(rng, input=layer_input,
+                                       image_shape=image_shape,
+                                       filter_shape=filter_shape,
+                                       poolsize=max_poolsize)
+            print 'Convolutional layer', str(i+1), 'created'
+
             self.layers += [layer]
             self.params += layer.params
-
-            da_layer = dA_conv(corruption_level = corruption_levels[0],\
-                                  input = layer_input, \
-                                  shared_W = layer.W, shared_b = layer.b,\
-                                  filter_shape = filter_shape , image_shape = image_shape )
-
-
+
+            da_layer = dA_conv(corruption_level = corruption_levels[0],
+                               input = layer_input,
+                               shared_W = layer.W, shared_b = layer.b,
+                               filter_shape = filter_shape,
+                               image_shape = image_shape )
+
             gparams = T.grad(da_layer.cost, da_layer.params)
-
+
             updates = {}
             for param, gparam in zip(da_layer.params, gparams):
-                    updates[param] = param - gparam * pretrain_lr
-
-
-            update_fn = theano.function([index], da_layer.cost, \
-                                        updates = updates,
-                                        givens = {
-                    self.x : train_set_x[index*batch_size:(index+1)*batch_size]} )
-
+                updates[param] = param - gparam * pretrain_lr
+
+            update_fn = theano.function([self.x], da_layer.cost, updates = updates)
+
             self.pretrain_functions += [update_fn]
-
+
         for i in xrange( self.mlp_n_layers ):
             if i == 0 :
                 input_size = n_ins_mlp
             else:
                 input_size = mlp_hidden_layers_sizes[i-1]
-
+
             if i == 0 :
                 if len( self.layers ) == 0 :
                     layer_input=self.x
@@ -236,72 +186,43 @@
                     layer_input = self.layers[-1].output.flatten(2)
             else:
                 layer_input = self.layers[-1].output
-
+
             layer = SigmoidalLayer(rng, layer_input, input_size,
                                         mlp_hidden_layers_sizes[i] )
-
+
             self.layers += [layer]
             self.params += layer.params

-
-            print 'MLP layer '+str(i+1)+' created'
+            print 'MLP layer', str(i+1), 'created'

         self.logLayer = LogisticRegression(input=self.layers[-1].output, \
                                                      n_in=mlp_hidden_layers_sizes[-1], n_out=n_out)
         self.params += self.logLayer.params
-
+
         cost = self.logLayer.negative_log_likelihood(self.y)
+
+        gparams = T.grad(cost, self.params)

-        gparams = T.grad(cost, self.params)
         updates = {}
-
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
-
-        self.finetune = theano.function([index], cost,
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size],
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
-
+
+        self.finetune = theano.function([self.x, self.y], cost, updates = updates)
+
+        self.errors = self.logLayer.errors(self.y)

-        self.errors = self.logLayer.errors(self.y)
-
-
-
 def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 2, \
                             pretrain_lr = 0.01, training_epochs = 1000, \
-                            dataset='mnist.pkl.gz'):
-
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-
-    def shared_dataset(data_xy):
-        data_x, data_y = data_xy
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-        return shared_x, T.cast(shared_y, 'int32')
-
-
-    test_set_x, test_set_y = shared_dataset(test_set)
-    valid_set_x, valid_set_y = shared_dataset(valid_set)
-    train_set_x, train_set_y = shared_dataset(train_set)
-
+                            dataset=datasets.nist_digits):
+
     batch_size = 500 # size of the minibatch

-
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches = test_set_x.value.shape[0] / batch_size
-
     # allocate symbolic variables for the data
     index = T.lscalar() # index to a [mini]batch
     x = T.matrix('x') # the data is presented as rasterized images
     y = T.ivector('y') # the labels are presented as 1d vector of
-                           # [int] labels
-    layer0_input = x.reshape((batch_size,1,28,28))
+    # [int] labels
+    layer0_input = x.reshape((x.shape[0],1,32,32))


     # Setup the convolutional layers with their DAs(add as many as you want)
@@ -310,45 +231,34 @@
     ker1=2
     ker2=2
     conv_layers=[]
-    conv_layers.append([[ker1,1,5,5], [batch_size,1,28,28], [2,2] ])
-    conv_layers.append([[ker2,ker1,5,5], [batch_size,ker1,12,12], [2,2] ])
+    conv_layers.append([[ker1,1,5,5], None, [2,2] ])
+    conv_layers.append([[ker2,ker1,5,5], None, [2,2] ])

     # Setup the MLP layers of the network
     mlp_layers=[500]

-    network = SdA(input = layer0_input, n_ins_conv = 28*28, n_ins_mlp = ker2*4*4, \
-                      train_set_x = train_set_x, train_set_y = train_set_y, batch_size = batch_size,
-                      conv_hidden_layers_sizes = conv_layers,  \
-                      mlp_hidden_layers_sizes = mlp_layers, \
-                      corruption_levels = corruption_levels , n_out = 10, \
-                      rng = rng , pretrain_lr = pretrain_lr , finetune_lr = learning_rate )
+    network = SdA(input = layer0_input, n_ins_mlp = ker2*4*4,
+                  conv_hidden_layers_sizes = conv_layers,
+                  mlp_hidden_layers_sizes = mlp_layers,
+                  corruption_levels = corruption_levels , n_out = 10,
+                  rng = rng , pretrain_lr = pretrain_lr ,
+                  finetune_lr = learning_rate )

-    test_model = theano.function([index], network.errors,
-             givens = {
-                network.x: test_set_x[index*batch_size:(index+1)*batch_size],
-                network.y: test_set_y[index*batch_size:(index+1)*batch_size]})
+    test_model = theano.function([network.x, network.y], network.errors)

-    validate_model = theano.function([index], network.errors,
-           givens = {
-                network.x: valid_set_x[index*batch_size:(index+1)*batch_size],
-                network.y: valid_set_y[index*batch_size:(index+1)*batch_size]})
-
-
-
     start_time = time.clock()
     for i in xrange(len(network.layers)-len(mlp_layers)):
         for epoch in xrange(pretraining_epochs):
-            for batch_index in xrange(n_train_batches):
-                c = network.pretrain_functions[i](batch_index)
-            print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch),c
+            for x, y in dataset.train(batch_size):
+                c = network.pretrain_functions[i](x)
+            print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch), c

     patience = 10000 # look as this many examples regardless
     patience_increase = 2. # WAIT THIS MUCH LONGER WHEN A NEW BEST IS
                                   # FOUND
     improvement_threshold = 0.995 # a relative improvement of this much is

-    validation_frequency = min(n_train_batches, patience/2)
-
+    validation_frequency = patience/2

     best_params = None
     best_validation_loss = float('inf')
@@ -357,23 +267,21 @@

     done_looping = False
     epoch = 0
-
+    iter = 0
+
     while (epoch < training_epochs) and (not done_looping):
       epoch = epoch + 1
-      for minibatch_index in xrange(n_train_batches):
+      for x, y in dataset.train(batch_size):

-        cost_ij = network.finetune(minibatch_index)
-        iter = epoch * n_train_batches + minibatch_index
-
-        if (iter+1) % validation_frequency == 0:
+        cost_ij = network.finetune(x, y)
+        iter += 1
+
+        if iter % validation_frequency == 0:
+            validation_losses = [test_model(xv, yv) for xv, yv in dataset.valid(batch_size)]
+            this_validation_loss = numpy.mean(validation_losses)
+            print('epoch %i, iter %i, validation error %f %%' % \
+                   (epoch, iter, this_validation_loss*100.))

-            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
-            this_validation_loss = numpy.mean(validation_losses)
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                   (epoch, minibatch_index+1, n_train_batches, \
-                    this_validation_loss*100.))
-
-
             # if we got the best validation score until now
             if this_validation_loss < best_validation_loss:

@@ -381,35 +289,28 @@
                 if this_validation_loss < best_validation_loss * \
                        improvement_threshold :
                     patience = max(patience, iter * patience_increase)
-
+
                 # save best validation score and iteration number
                 best_validation_loss = this_validation_loss
                 best_iter = iter
-
+
                 # test it on the test set
-                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                 test_score = numpy.mean(test_losses)
-                print((' epoch %i, minibatch %i/%i, test error of best '
+                print((' epoch %i, iter %i, test error of best '
                       'model %f %%') %
-                             (epoch, minibatch_index+1, n_train_batches,
-                              test_score*100.))
-
-
+                             (epoch, iter, test_score*100.))
+
         if patience <= iter :
-                done_looping = True
-                break
-
+            done_looping = True
+            break
+
     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'
            'with test performance %f %%') %
                  (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

-
-
-
-
-
 if __name__ == '__main__':
     sgd_optimization_mnist()
--- a/deep/stacked_dae/mnist_sda.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/stacked_dae/mnist_sda.py	Thu Mar 04 20:43:21 2010 -0500
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 # coding: utf-8

+# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt
 # Parameterize call to sgd_optimization for MNIST

 import numpy
--- a/deep/stacked_dae/nist_sda.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Thu Mar 04 20:43:21 2010 -0500
@@ -21,33 +21,35 @@
 import jobman, jobman.sql
 from pylearn.io import filetensor

-from utils import produit_croise_jobs
+from utils import produit_cartesien_jobs

 from sgd_optimization import SdaSgdOptimizer

-SERIES_AVAILABLE = False
-try:
-    from scalar_series import *
-    SERIES_AVAILABLE = True
-except ImportError:
-    print "Could not import Series"
+from ift6266.utils.scalar_series import *
+
+##############################################################################
+# GLOBALS

 TEST_CONFIG = False

 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda2'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4'
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"

 REDUCE_TRAIN_TO = None
 MAX_FINETUNING_EPOCHS = 1000
-REDUCE_EVERY = 1000 # number of minibatches before taking means for valid error etc.
+# number of minibatches before taking means for valid error etc.
+REDUCE_EVERY = 1000
+
 if TEST_CONFIG:
     REDUCE_TRAIN_TO = 1000
     MAX_FINETUNING_EPOCHS = 2
     REDUCE_EVERY = 10

-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
-
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
 JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
         'hidden_layers_sizes': [300,800],
@@ -58,30 +60,36 @@
         'num_hidden_layers':[2,3]}

 # Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.01,
-                       'pretraining_lr':0.01,
-                       'pretraining_epochs_per_layer':1,
-                       'max_finetuning_epochs':1,
-                       'hidden_layers_sizes':1000,
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':20,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':300,
                        'corruption_levels':0.2,
                        'minibatch_size':20,
-                       'reduce_train_to':1000,
-                       'num_hidden_layers':1})
+                       #'reduce_train_to':300,
+                       'num_hidden_layers':2})

+'''
+Function called by jobman upon launching each job
+Its path is the one given when inserting jobs:
+ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint
+'''
 def jobman_entrypoint(state, channel):
+    # record mercurial versions of each package
     pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     channel.save()

     workingdir = os.getcwd()

     print "Will load NIST"
-    sys.stdout.flush()

-    nist = NIST(20)
+    nist = NIST(minibatch_size=20)

     print "NIST loaded"
-    sys.stdout.flush()

+    # For test runs, we don't want to use the whole dataset so
+    # reduce it to fewer elements if asked to.
     rtt = None
     if state.has_key('reduce_train_to'):
         rtt = state['reduce_train_to']
@@ -89,7 +97,7 @@
         rtt = REDUCE_TRAIN_TO

     if rtt:
-        print "Reducing training set to ", rtt, " examples"
+        print "Reducing training set to "+str(rtt)+ " examples"
         nist.reduce_train_set(rtt)

     train,valid,test = nist.get_tvt()
@@ -98,17 +106,13 @@
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)

-    hls = state.hidden_layers_sizes
-    cl = state.corruption_levels
-    nhl = state.num_hidden_layers
-    state.hidden_layers_sizes = [hls] * nhl
-    state.corruption_levels = [cl] * nhl
+    # b,b',W for each hidden layer
+    # + b,W of last layer (logreg)
+    numparams = state.num_hidden_layers * 3 + 2
+    series_mux = None
+    series_mux = create_series(workingdir, numparams)

-    # b,b',W for each hidden layer + b,W of last layer (logreg)
-    numparams = nhl * 3 + 2
-    series_mux = None
-    if SERIES_AVAILABLE:
-        series_mux = create_series(workingdir, numparams)
+    print "Creating optimizer with state, ", state

     optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
@@ -120,11 +124,10 @@
     optimizer.finetune()
     channel.save()

-    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
-    channel.save()
-
     return channel.COMPLETE

+# These Series objects are used to save various statistics
+# during the training.
 def create_series(basedir, numparams):
     mux = SeriesMultiplexer()

@@ -146,8 +149,11 @@

     return mux

+# Perform insertion into the PostgreSQL DB based on combinations
+# of the hyperparameter values above
+# (see comment for produit_cartesien_jobs() to know how it works)
 def jobman_insert_nist():
-    jobs = produit_croise_jobs(JOB_VALS)
+    jobs = produit_cartesien_jobs(JOB_VALS)

     db = jobman.sql.db(JOBDB)
     for job in jobs:
@@ -233,35 +239,6 @@

     raw_input("Press any key")

-# hp for hyperparameters
-def sgd_optimization_nist(hp=None, dataset_dir='/data/lisa/data/nist'):
-    global DEFAULT_HP_NIST
-    hp = hp and hp or DEFAULT_HP_NIST
-
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20, reduce_train_to=100)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
-    print train[0][15]
-    print type(train[0][1])
-
-
-    print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
-
-    n_ins = 32*32
-    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
-
-    optimizer = SdaSgdOptimizer(dataset, hp, n_ins, n_outs, input_divider=255.0)
-    optimizer.train()
-
 if __name__ == '__main__':

     import sys
@@ -275,11 +252,9 @@
         jobman_insert_nist()

     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
-        chanmock = DD({'COMPLETE':0})
+        chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
         jobman_entrypoint(DEFAULT_HP_NIST, chanmock)

-    elif len(args) > 0 and args[0] == 'estimate':
-        estimate_total_time()
     else:
-        sgd_optimization_nist()
+        print "Bad arguments"
--- a/deep/stacked_dae/sgd_optimization.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Thu Mar 04 20:43:21 2010 -0500
@@ -6,6 +6,7 @@
 import numpy
 import theano
 import time
+import datetime
 import theano.tensor as T
 import sys

@@ -59,20 +60,27 @@
         # compute number of minibatches for training, validation and testing
         self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
         self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_test_batches  = self.test_set_x.value.shape[0]  / self.hp.minibatch_size
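+        # remove last batch in case it's incomplete (NOTE(review): integer division already drops a partial batch, so this may discard a full one -- confirm)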
+        self.n_test_batches  = (self.test_set_x.value.shape[0]  / self.hp.minibatch_size) - 1

     def init_classifier(self):
         print "Constructing classifier"

+        # we don't want to save arrays in DD objects, so
+        # we recreate those arrays here
+        nhl = self.hp.num_hidden_layers
+        layers_sizes = [self.hp.hidden_layers_sizes] * nhl
+        corruption_levels = [self.hp.corruption_levels] * nhl
+
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
                           train_set_x= self.train_set_x, \
                           train_set_y = self.train_set_y,\
                           batch_size = self.hp.minibatch_size, \
                           n_ins= self.n_ins, \
-                          hidden_layers_sizes = self.hp.hidden_layers_sizes, \
+                          hidden_layers_sizes = layers_sizes, \
                           n_outs = self.n_outs, \
-                          corruption_levels = self.hp.corruption_levels,\
+                          corruption_levels = corruption_levels,\
                           rng = self.rng,\
                           pretrain_lr = self.hp.pretraining_lr, \
                           finetune_lr = self.hp.finetuning_lr,\
@@ -85,7 +93,7 @@
         self.finetune()

     def pretrain(self):
-        print "STARTING PRETRAINING"
+        print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()

         start_time = time.clock()
@@ -101,6 +109,8 @@

                 print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
                 sys.stdout.flush()
+
+                self.series_mux.append("params", self.classifier.all_params)

         end_time = time.clock()

@@ -110,7 +120,7 @@
         sys.stdout.flush()

     def finetune(self):
-        print "STARTING FINETUNING"
+        print "STARTING FINETUNING, time = ", datetime.datetime.now()

         index   = T.lscalar()    # index to a [mini]batch
         minibatch_size = self.hp.minibatch_size
--- a/deep/stacked_dae/stacked_dae.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Thu Mar 04 20:43:21 2010 -0500
@@ -10,6 +10,15 @@

 from utils import update_locals

+# taken from LeDeepNet/daa.py
+# has a special case when taking log(0) (defined =0)
+# modified to not take the mean anymore
+from theano.tensor.xlogx import xlogx, xlogy0
+# it's target*log(output)
+def binary_cross_entropy(target, output, sum_axis=1):
+    XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output))
+    return -T.sum(XE, axis=sum_axis)
+
 class LogisticRegression(object):
     def __init__(self, input, n_in, n_out):
         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
@@ -128,7 +137,16 @@
     # Equation (4)
     # note : we sum over the size of a datapoint; if we are using minibatches,
     #        L will  be a vector, with one entry per example in minibatch
-    self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+    #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+    #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
+
+    # I added this epsilon to avoid getting log(0) and 1/0 in grad
+    # This means conceptually that there'd be no probability of 0, but that
+    # doesn't seem to me as important (maybe I'm wrong?).
+    eps = 0.00000001
+    eps_1 = 1-eps
+    self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
+                    + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
     # note : L is now a vector, where each element is the cross-entropy cost
     #        of the reconstruction of the corresponding example of the
     #        minibatch. We need to compute the average of all these to get
@@ -138,8 +156,6 @@
     self.params = [ self.W, self.b, self.b_prime ]


-
-
 class SdA(object):
     def __init__(self, train_set_x, train_set_y, batch_size, n_ins,
                  hidden_layers_sizes, n_outs,
@@ -147,6 +163,7 @@
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
+
         update_locals(self, locals())

         self.layers             = []
@@ -157,6 +174,17 @@
         self.all_params         = []
         self.n_layers           = len(hidden_layers_sizes)

+        print "Creating SdA with params:"
+        print "batch_size", batch_size
+        print "hidden_layers_sizes", hidden_layers_sizes
+        print "corruption_levels", corruption_levels
+        print "n_ins", n_ins
+        print "n_outs", n_outs
+        print "pretrain_lr", pretrain_lr
+        print "finetune_lr", finetune_lr
+        print "input_divider", input_divider
+        print "----"
+
         self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))

         if len(hidden_layers_sizes) < 1 :
--- a/deep/stacked_dae/utils.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/deep/stacked_dae/utils.py	Thu Mar 04 20:43:21 2010 -0500
@@ -1,14 +1,26 @@
 #!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement

 from jobman import DD

 # from pylearn codebase
+# useful in __init__(param1, param2, etc.) to save
+# values in self.param1, self.param2... just call
+# update_locals(self, locals())
 def update_locals(obj, dct):
     if 'self' in dct:
         del dct['self']
     obj.__dict__.update(dct)

-def produit_croise_jobs(val_dict):
+# from a dictionary of possible values for hyperparameters, e.g.
+# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]}
+# create a list of other dictionaries representing all the possible
+# combinations, thus in this example creating:
+# [{'learning_rate': 0.1, 'num_layers': 1}, ...]
+# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2))
+def produit_cartesien_jobs(val_dict):
     job_list = [DD()]
     all_keys = val_dict.keys()

@@ -24,9 +36,9 @@

     return job_list

-def test_produit_croise_jobs():
+def test_produit_cartesien_jobs():
     vals = {'a': [1,2], 'b': [3,4,5]}
-    print produit_croise_jobs(vals)
+    print produit_cartesien_jobs(vals)


 # taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python
--- a/scripts/launch_generate100.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/scripts/launch_generate100.py	Thu Mar 04 20:43:21 2010 -0500
@@ -3,10 +3,12 @@
 import os
 dir1 = "/data/lisa/data/ift6266h10/"

+mach = "brams0c.iro.umontreal.ca,brams02.iro.umontreal.ca,brams03.iro.umontreal.ca,maggie22.iro.umontreal.ca"
+
 for i,s in enumerate(['valid','test']):
     for j,c in enumerate([0.3,0.5,0.7,1]):
         l = str(c).replace('.','')
-        os.system("dbidispatch --condor --os=fc9 --machine=brams0c.iro.umontreal.ca ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j))
+        os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j))

 for i in range(100):
-    os.system("dbidispatch --condor --os=fc9 --machine=brams0c.iro.umontreal.ca ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i))
+    os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i))
--- a/test.py	Thu Mar 04 09:43:23 2010 -0500
+++ b/test.py	Thu Mar 04 20:43:21 2010 -0500
@@ -1,8 +1,7 @@
 import doctest, sys, pkgutil

-def runTests(options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1):
+def runTests():
     import ift6266
-    predefs = ift6266.__dict__
     for (_, name, ispkg) in pkgutil.walk_packages(ift6266.__path__, ift6266.__name__+'.'):
         if not ispkg:
             if name.startswith('ift6266.scripts.') or \
@@ -11,9 +10,21 @@
                         'ift6266.data_generation.transformations.testmod',
                         'ift6266.data_generation.transformations.gimp_script']:
                 continue
-            print "Testing:", name
-            __import__(name)
-            doctest.testmod(sys.modules[name], extraglobs=predefs, optionflags=options)
+            test(name)
+
+def test(name):
+    """Import module `name` and run its doctests, with the ift6266 package namespace as extra globals."""
+    import ift6266
+    options = doctest.ELLIPSIS | doctest.DONT_ACCEPT_TRUE_FOR_1  # bit flags: combine with |; `or` kept only ELLIPSIS
+    print "Testing:", name
+    __import__(name)
+    doctest.testmod(sys.modules[name], extraglobs=ift6266.__dict__, optionflags=options)

 if __name__ == '__main__':
-    runTests()
+    if len(sys.argv) > 1:
+        for mod in sys.argv[1:]:
+            if mod.endswith('.py'):
+                mod = mod[:-3]
+            test(mod)
+    else:
+        runTests()