diff deep/stacked_dae/stacked_dae.py @ 275:7b4507295eba

merge
author Xavier Glorot <glorotxa@iro.umontreal.ca>
date Mon, 22 Mar 2010 10:20:10 -0400
parents acb942530923 c8fe09a65039
--- a/deep/stacked_dae/stacked_dae.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/stacked_dae.py	Mon Mar 22 10:20:10 2010 -0400
@@ -127,13 +127,13 @@
     #        this will produce an array of 0s and 1s where 1 has a 
     #        probability of 1 - ``corruption_level`` and 0 with
     #        ``corruption_level``
-    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level) * self.x
+    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level, dtype=theano.config.floatX) * self.x
     # Equation (2)
     # note  : y is stored as an attribute of the class so that it can be 
     #         used later when stacking dAs. 
     self.y   = T.nnet.sigmoid(T.dot(self.tilde_x, self.W      ) + self.b)
     # Equation (3)
-    self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+    #self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
     # Equation (4)
     # note : we sum over the size of a datapoint; if we are using minibatches,
     #        L will  be a vector, with one entry per example in minibatch
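
The dtype added to the binomial call above matters because the legacy Theano RandomStreams binomial defaults to int64; multiplying an int64 mask by a float32 input upcasts tilde_x to float64, which defeats float32/GPU execution. A minimal sketch of the corruption step, assuming the same RandomStreams API used in this file (seed and corruption level are placeholder values):

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

x = T.matrix('x')
theano_rng = RandomStreams(1234)       # arbitrary seed
corruption_level = 0.1                 # illustrative value
# keep_mask is 1 with probability 1 - corruption_level and 0 otherwise;
# casting it to floatX keeps the product in float32 when floatX == float32
keep_mask = theano_rng.binomial(x.shape, 1, 1 - corruption_level,
                                dtype=theano.config.floatX)
tilde_x = keep_mask * x
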
@@ -141,17 +141,20 @@
     #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
 
     # bypassing z to avoid running to log(0)
-    #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime)
-    #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \
-    #                + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 )
+    z_a = T.dot(self.y, self.W_prime) + self.b_prime
+    log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
+    # log(1-sigmoid(z_a))
+    log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
+    self.L = -T.sum( self.x * (log_sigmoid) \
+                    + (1.0-self.x) * (log_1_sigmoid), axis=1 )
 
     # I added this epsilon to avoid getting log(0) and 1/0 in grad
     # This means conceptually that there'd be no probability of 0, but that
     # doesn't seem to me as important (maybe I'm wrong?).
-    eps = 0.00000001
-    eps_1 = 1-eps
-    self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
-                    + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
+    #eps = 0.00000001
+    #eps_1 = 1-eps
+    #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
+    #                + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
     # note : L is now a vector, where each element is the cross-entropy cost 
     #        of the reconstruction of the corresponding example of the 
     #        minibatch. We need to compute the average of all these to get 
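
The replacement loss above works on the pre-sigmoid activation z_a so that the logarithms never see an exact 0 or 1:

    log sigmoid(a)      = -log(1 + exp(-a))
    log(1 - sigmoid(a)) = log(exp(-a) / (1 + exp(-a))) = -a - log(1 + exp(-a))

which is exactly what log_sigmoid and log_1_sigmoid compute (the T.log(1.) term is just 0). For strongly negative a, exp(-a) can still overflow in float32; a slightly more robust variant, assuming Theano's T.nnet.softplus (softplus(a) = log(1 + exp(a))) is acceptable here, would be:

log_sigmoid = -T.nnet.softplus(-z_a)    # log(sigmoid(z_a))  = -log(1 + exp(-z_a))
log_1_sigmoid = -T.nnet.softplus(z_a)   # log(1 - sigmoid(z_a)), since 1 - sigmoid(a) = sigmoid(-a)
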
@@ -162,9 +165,9 @@
 
 
 class SdA(object):
-    def __init__(self, train_set_x, train_set_y, batch_size, n_ins, 
+    def __init__(self, batch_size, n_ins, 
                  hidden_layers_sizes, n_outs, 
-                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+                 corruption_levels, rng, pretrain_lr, finetune_lr):
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
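
With this signature change the SdA no longer receives the training set or an input_divider at construction time; minibatches are supplied when the compiled functions are called (see the pretraining and finetuning hunks below). A hypothetical construction call using placeholder values, with keyword names taken from the parameter list above:

import numpy

rng = numpy.random.RandomState(1234)        # placeholder seed
sda = SdA(batch_size=20, n_ins=32*32,       # placeholder sizes
          hidden_layers_sizes=[1000, 1000],
          n_outs=62,
          corruption_levels=[0.2, 0.2],
          rng=rng,
          pretrain_lr=0.01, finetune_lr=0.1)  # placeholder learning rates
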
@@ -187,17 +190,14 @@
         print "n_outs", n_outs
         print "pretrain_lr", pretrain_lr
         print "finetune_lr", finetune_lr
-        print "input_divider", input_divider
         print "----"
 
-        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
-
         if len(hidden_layers_sizes) < 1 :
             raiseException (' You must have at least one hidden layer ')
 
 
         # allocate symbolic variables for the data
-        index   = T.lscalar()    # index to a [mini]batch 
+        #index   = T.lscalar()    # index to a [mini]batch 
         self.x  = T.matrix('x')  # the data is presented as rasterized images
         self.y  = T.ivector('y') # the labels are presented as 1D vector of 
                                  # [int] labels
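
Dropping shared_divider and the index variable is consistent with that change: normalization and minibatch slicing now happen on the caller's side, so nothing in the constructor refers to a dataset. Note also that the unchanged context line raiseException (' You must have at least one hidden layer ') would itself fail with a NameError if the check ever fired; presumably the intent is:

if len(hidden_layers_sizes) < 1:
    raise Exception('You must have at least one hidden layer')
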
@@ -244,10 +244,15 @@
                 updates[param] = param - gparam * pretrain_lr
             
             # create a function that trains the dA
-            update_fn = theano.function([index], dA_layer.cost, \
-                  updates = updates,
-                  givens = { 
-                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            update_fn = theano.function([self.x], dA_layer.cost, \
+                  updates = updates)#,
+            #     givens = { 
+            #         self.x : ensemble})
+            # collect this function into a list
+            #update_fn = theano.function([index], dA_layer.cost, \
+            #      updates = updates,
+            #      givens = { 
+            #         self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
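Because the compiled pretraining function now declares self.x as an explicit input instead of indexing a shared dataset through givens, the caller slices (and, if needed, divides/normalizes) each minibatch itself and passes it in as an array. A hypothetical calling pattern; get_minibatch, n_train_batches and layer_idx are placeholders, not names from this file:

for i in xrange(n_train_batches):
    # numpy array of shape (batch_size, n_ins), already normalized
    batch_x = get_minibatch(i)
    cost = sda.pretrain_functions[layer_idx](batch_x)
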
@@ -270,11 +275,11 @@
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
             
-        self.finetune = theano.function([index], cost, 
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+        self.finetune = theano.function([self.x,self.y], cost, 
+                updates = updates)#,
+        #        givens = {
+        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
         # minibatch given by self.x and self.y
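
The finetuning function changes the same way: it now takes the (x, y) minibatch pair directly. The trade-off is flexibility against speed, since the data no longer lives in a shared variable and is typically copied to the device on every call when running on GPU. A hypothetical call, mirroring the pretraining loop above:

# batch_x: float array of shape (batch_size, n_ins);
# batch_y: int32 label vector of length batch_size (self.y is a T.ivector)
cost = sda.finetune(batch_x, batch_y)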