changeset 791:166a89917669

changed behavior of DAAig and StackedDAAig, added a cost scaling parameter
author Xavier Glorot <glorotxa@iro.umontreal.ca>
date Fri, 10 Jul 2009 16:56:52 -0400
parents d98117100166
children 961dc1a7921b
files pylearn/algorithms/sandbox/DAA_inputs_groups.py
diffstat 1 files changed, 123 insertions(+), 102 deletions(-)
--- a/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Fri Jul 10 12:16:20 2009 -0400
+++ b/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Fri Jul 10 16:56:52 2009 -0400
@@ -10,8 +10,8 @@
 
 from pylearn.algorithms.logistic_regression import LogRegN
 
-# used to initialize containers
-class ScratchPad:
+# Initialize containers:
+class CreateContainer:
     pass
 
 # regularisation utils:-------------------------------------------
@@ -19,8 +19,7 @@
     if type == 'l1':
         return T.sum(T.abs_(param))
     if type == 'l2':
-        return T.sum(param*param)#faster...
-        return T.sum(T.pow(param,2))
+        return T.sum(param*param)
     raise NotImplementedError('Only l1 and l2 regularization are currently implemented')
 
 def get_reg_cost(params, type):
@@ -33,14 +32,13 @@
 def sigmoid_act(x):
     return theano.tensor.nnet.sigmoid(x)
 
+# tanh's argument is scaled by 1/2 to have the same gradient as sigmoid [sigmoid(x)=(tanh(x/2.0)+1)/2.0]
 def tanh_act(x):
     return theano.tensor.tanh(x/2.0)
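
The identity in the comment above can be checked numerically; a quick sanity check of sigmoid(x) = (tanh(x/2.0)+1)/2.0 (illustrative sketch, not part of the patch):

    import numpy
    x = numpy.linspace(-5, 5, 101)
    sig = 1.0 / (1.0 + numpy.exp(-x))
    assert numpy.allclose(sig, (numpy.tanh(x / 2.0) + 1.0) / 2.0)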
 
 # costs utils:---------------------------------------------------
-
 # To avoid numerical instability in the cost and gradient of the cross-entropy, we compute it
 # with the following functions directly from the activation:
-
 def sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis):
     XE = -target * T.log(1 + T.exp(-output_act)) + (1 - target) * (- T.log(1 + T.exp(output_act)))
     return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
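
Computing the cross-entropy from the activation rather than from the output avoids log(0) when the sigmoid saturates, using log(sigmoid(a)) = -log(1+exp(-a)) and log(1-sigmoid(a)) = -log(1+exp(a)). A small numpy comparison (illustrative sketch, not part of the patch):

    import numpy
    a = numpy.array([-40.0, 0.0, 40.0])   # pre-sigmoid activations
    t = numpy.array([1.0, 0.5, 0.0])      # targets
    sig = 1.0 / (1.0 + numpy.exp(-a))
    naive = t * numpy.log(sig) + (1 - t) * numpy.log(1 - sig)          # -inf at a = 40
    stable = -t * numpy.log1p(numpy.exp(-a)) - (1 - t) * numpy.log1p(numpy.exp(a))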
@@ -60,25 +58,30 @@
 def quadratic(target, output, act, axis = 1):
     return pylearn.algorithms.cost.quadratic(target, output, axis)
 
-
-
 # DAAig module----------------------------------------------------------------
 class DAAig(module.Module):
-    """De-noising Auto-encoder
+    """De-noising Auto-encoder with inputs groups and missing values
     """
     
     def __init__(self, input = None, auxinput = None,
                 in_size=None, auxin_size= None, n_hid=1,
-                regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                reconstruction_cost_function='cross_entropy', interface = True,
-                ignore_missing=None, reconstruct_missing=False,
-                corruption_pattern=None,
-                **init):
+                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy',
+                scale_cost = False, interface = True, ignore_missing=None, reconstruct_missing=False,
+                corruption_pattern=None, **init):
         """
+        :param input: WRITEME
+        :param auxinput: WRITEME
+        :param in_size: WRITEME
+        :param auxin_size: WRITEME
+        :param n_hid: WRITEME
         :param regularize: WRITEME
         :param tie_weights: WRITEME
         :param hid_fn: WRITEME
-        :param reconstruction_cost: Should return one cost per example (row)
+        :param rec_fn: WRITEME
+        :param reconstruction_cost_function: WRITEME
+        :param scale_cost: WRITEME
+        :param interface: WRITEME
         :param ignore_missing: if not None, the input will be scanned in order
             to detect missing values, and these values will be replaced. Also,
             the reconstruction cost's gradient will be computed only on non
@@ -93,7 +96,7 @@
             in the current implementation, auxiliary inputs cannot be used when
             this option is True.
         :param corruption_pattern: if not None, may specify a particular way to
-        corrupt the input with missing values. Valid choices are:
+            corrupt the input with missing values. Valid choices are:
             - 'by_pair': consider that features are given as pairs, and corrupt
             (or not) the whole pair instead of considering them independently.
             Elements in a pair are not consecutive, instead they are assumed to
@@ -108,6 +111,12 @@
         print '\t\tin_size = ', in_size
         print '\t\tauxin_size = ', auxin_size
         print '\t\tn_hid = ', n_hid
+        print '\t\tregularize = ', regularize
+        print '\t\ttie_weights = ', tie_weights
+        print '\t\thid_fn = ', hid_fn
+        print '\t\trec_fn = ', rec_fn
+        print '\t\treconstruction_cost_function = ', reconstruction_cost_function
+        print '\t\tscale_cost = ', scale_cost
         
         super(DAAig, self).__init__()
         self.random = T.RandomStreams()
@@ -122,11 +131,14 @@
         self.ignore_missing = ignore_missing
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
-        
+        self.scale_cost = scale_cost
         
         assert hid_fn in ('sigmoid_act','tanh_act')
         self.hid_fn = eval(hid_fn)
-        self.hid_name = hid_fn
+        
+        assert rec_fn in ('sigmoid_act','tanh_act')
+        self.rec_fn = eval(rec_fn)
+        self.rec_name = rec_fn
         
         assert reconstruction_cost_function in ('cross_entropy','quadratic')
         self.reconstruction_cost_function = eval(reconstruction_cost_function)
@@ -134,13 +146,13 @@
         
         ### DECLARE MODEL VARIABLES and default
         self.input = input
+        self.noisy_input = None
         if self.ignore_missing is not None and self.input is not None:
             no_missing = FillMissing(self.ignore_missing)(self.input)
             self.input = no_missing[0]  # With missing values replaced.
             self.input_missing_mask = no_missing[1] # Missingness pattern.
         else:
             self.input_missing_mask = None
-        self.noisy_input = None
         self.auxinput = auxinput
         self.idx_list = T.ivector('idx_list') if self.auxinput is not None else None
         self.noisy_idx_list, self.noisy_auxinput = None, None
@@ -160,6 +172,7 @@
         #hyper-parameters
         if self.interface:
             self.lr = T.scalar('lr')
+        
         self.noise_level = T.scalar('noise_level')
         self.noise_level_group = T.scalar('noise_level_group')
         
@@ -174,29 +187,31 @@
             self.noisy_input = self.corrupt_input()
         if self.auxinput is not None:
             self.noisy_idx_list , self.noisy_auxinput = \
-                scannoise(self.idx_list, self.auxinput,self.noise_level,
-                        self.noise_level_group)
+                scannoise(self.idx_list, self.auxinput,self.noise_level, self.noise_level_group)
         
-        self.noise = ScratchPad()
-        self.clean = ScratchPad()
+        self.noise = CreateContainer()
+        self.clean = CreateContainer()
         
         self.define_behavioural(self.clean, self.input, self.idx_list, self.auxinput)
         self.define_behavioural(self.noise, self.noisy_input, self.noisy_idx_list, self.noisy_auxinput)
         
         self.define_regularization()  # call before cost
-        self.define_cost(self.clean)
-        self.define_cost(self.noise)
+        self.define_cost(self.noise)  # the cost is only needed for the noise (not used for the clean part)
         self.define_params()
         if self.interface:
             self.define_gradients()
             self.define_interface()
-        
+     
     def define_behavioural(self, container, input, idx_list, auxinput):
         self.define_propup(container, input, idx_list , auxinput)
         container.hidden = self.hid_fn(container.hidden_activation)
         self.define_propdown(container, idx_list , auxinput)
-        container.rec = self.hid_fn(container.rec_activation)
-       
+        if self.input is not None:
+            container.rec_in = self.rec_fn(container.rec_activation_in)
+        if (self.auxinput is not None):
+            container.rec_aux = self.rec_fn(container.rec_activation_aux)
+        container.rec = self.rec_fn(container.rec_activation)
+    
     def define_propup(self, container, input, idx_list, auxinput):
         if self.input is not None:
             container.hidden_activation = self.filter_up(input, self.wenc, self.benc)
@@ -205,36 +220,33 @@
         else:
             if self.auxinput is not None:
                 container.hidden_activation = scandotenc(idx_list,auxinput,self.wauxenc) + self.benc
-        
-    # DEPENDENCY: define_propup
+    
     def define_propdown(self, container, idx_list, auxinput):
         if self.input is not None:
-            rec_activation1 = self.filter_down(container.hidden,self.wdec,self.bdec)
+            container.rec_activation_in = self.filter_down(container.hidden,self.wdec,self.bdec)
         if self.auxinput is not None:
-            rec_activation2 = scandotdec(idx_list,auxinput,container.hidden,self.wauxdec) +\
+            container.rec_activation_aux = scandotdec(idx_list,auxinput,container.hidden,self.wauxdec) +\
                     scanbiasdec(idx_list,auxinput,self.bauxdec)
         
+        if (self.ignore_missing is not None and self.input is not None and not self.reconstruct_missing):
+            # Apply mask to gradient to ensure we do not backpropagate on the
+            # cost computed on missing inputs (that have been imputed).
+            container.rec_activation_in = mask_gradient(container.rec_activation_in,
+                    self.input_missing_mask)
+        
         if (self.input is not None) and (self.auxinput is not None):
-            container.rec_activation = T.join(1,rec_activation1,rec_activation2)
+            container.rec_activation = T.join(1,container.rec_activation_in,container.rec_activation_aux)
         else:
             if self.input is not None:
-                container.rec_activation = rec_activation1
-            else:
-                container.rec_activation = rec_activation2
-
-        if (self.ignore_missing is not None and self.input is not None and not
-                self.reconstruct_missing):
-            # Apply mask to gradient to ensure we do not backpropagate on the
-            # cost computed on missing inputs (that have been imputed).
-            container.rec_activation = mask_gradient(container.rec_activation,
-                    self.input_missing_mask)
-  
+                container.rec_activation = container.rec_activation_in
+            if (self.auxinput is not None):
+                container.rec_activation = container.rec_activation_aux
+    
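
Moving the mask_gradient call before the join applies the gradient mask to the input reconstruction only, which is consistent with auxiliary inputs being unavailable when ignore_missing is used (see the docstring above). Conceptually, mask_gradient keeps the forward value but blocks backpropagation through imputed entries; a minimal sketch of the idea with a generic stop-gradient primitive (hypothetical helper, not pylearn's actual implementation):

    # forward value unchanged; no gradient flows through imputed entries
    # (hypothetical convention: missing_mask == 1 marks an imputed entry)
    def mask_gradient_sketch(x, missing_mask, stop_gradient):
        return (1 - missing_mask) * x + missing_mask * stop_gradient(x)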
     def filter_up(self, vis, w, b=None):
         out = T.dot(vis, w)
         return out + b if b is not None else out
     filter_down = filter_up
     
-    # TODO: fix regularization type (outside parameter ?)
     def define_regularization(self):
         self.reg_coef = T.scalar('reg_coef')
         if self.auxinput is not None:
@@ -255,20 +267,38 @@
         self.regularization = self.reg_coef * get_reg_cost(listweights,'l2')
         self.regularizationenc = self.reg_coef * get_reg_cost(listweightsenc,'l2')
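
The penalty is the l2 cost from get_reg_cost, summed over the collected weight matrices and scaled by reg_coef; regularizationenc covers the encoder weights only, for use in the stacked global cost. In plain numpy (illustrative only):

    import numpy
    weights = [numpy.ones((4, 3)), numpy.ones((3, 2))]   # stand-ins for wenc, wauxenc, ...
    reg_coef = 0.01
    l2_penalty = reg_coef * sum((w * w).sum() for w in weights)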
     
-    
-    # DEPENDENCY: define_behavioural, define_regularization
     def define_cost(self, container):
         if self.reconstruction_cost_function_name == 'cross_entropy':
-            container.reconstruction_cost = self.reconstruction_costs(container.rec_activation)
+            if (self.input is not None):
+                container.reconstruction_cost_in = \
+                    self.reconstruction_cost_function(self.input,container.rec_activation_in,self.rec_name)
+            if (self.auxinput is not None):
+                container.reconstruction_cost_aux = \
+                    self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput),container.rec_activation_aux,\
+                    self.rec_name)
         else:
-            container.reconstruction_cost = self.reconstruction_costs(container.rec)
+            if (self.input is not None):
+                container.reconstruction_cost_in = \
+                    self.reconstruction_cost_function(self.input,container.rec_in,self.rec_name)
+            if (self.auxinput is not None):
+                container.reconstruction_cost_aux = \
+                    self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput),container.rec_aux,\
+                    self.rec_name)
         # TOTAL COST
+        if (self.input is not None) and (self.auxinput is not None):
+            container.reconstruction_cost = (T.constant(min(1, 1+self.scale_cost)) * container.reconstruction_cost_in +
+                T.constant(min(1, 1-self.scale_cost)) * container.reconstruction_cost_aux)
+        else:
+            if self.input is not None:
+                container.reconstruction_cost = container.reconstruction_cost_in
+            if (self.auxinput is not None):
+                container.reconstruction_cost = container.reconstruction_cost_aux
+        
         if self.regularize: # when stacked, regularization is not merged into the cost here but in the StackedDAAig module
             container.cost = container.reconstruction_cost + self.regularization
         else:
             container.cost = container.reconstruction_cost
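
The new scale_cost parameter blends the two partial costs when both an input and an auxiliary input are present: the weights are min(1, 1+scale_cost) and min(1, 1-scale_cost), so 0 weights both costs equally, positive values shrink the auxiliary term, and negative values shrink the input term. In plain Python (illustrative only):

    def cost_weights(scale_cost):
        # (weight on input cost, weight on auxiliary cost)
        return min(1, 1 + scale_cost), min(1, 1 - scale_cost)

    cost_weights(0.0)    # (1, 1)    -> equal weighting
    cost_weights(0.5)    # (1, 0.5)  -> emphasize the main input
    cost_weights(-0.5)   # (0.5, 1)  -> emphasize the auxiliary input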
     
-    # DEPENDENCY: define_cost
     def define_params(self):
         if not hasattr(self,'params'):
             self.params = []
@@ -286,14 +316,11 @@
             if self.auxinput is not None:
                 self.params += self.wauxdec
     
-    # DEPENDENCY: define_cost, define_gradients
     def define_gradients(self):
         self.gradients = T.grad(self.noise.cost, self.params)
         self.updates = dict((p, p - self.lr * g) for p, g in \
                 zip(self.params, self.gradients))
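
These updates are plain stochastic gradient descent on the noisy reconstruction cost: each parameter moves one step against its gradient, scaled by the learning rate. The same step in plain numpy (illustrative only):

    import numpy
    lr = 0.1
    params = [numpy.ones(3)]            # stand-ins for wenc, benc, ...
    gradients = [0.5 * numpy.ones(3)]
    for p, g in zip(params, gradients):
        p -= lr * g                     # p <- p - lr * dC/dp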
     
-    
-    # DEPENDENCY: define_behavioural, define_regularization, define_cost, define_gradients
     def define_interface(self):
         # declare function to interface with module (if not stacked)
         if self.input is None:
@@ -310,7 +337,6 @@
             self.auxnoisify = theano.Method(listin, self.noisy_auxinput)
         self.reconstruction = theano.Method(listin, self.clean.rec)
         self.representation = theano.Method(listin, self.clean.hidden)
-        self.validate = theano.Method(listin, [self.clean.cost, self.clean.rec])
     
     def corrupt_input(self):
         if self.corruption_pattern is None:
@@ -327,17 +353,6 @@
                     % self.corruption_pattern)
         return mask * self.input
     
-    def reconstruction_costs(self, rec):
-        if (self.input is not None) and (self.auxinput is not None):
-            return self.reconstruction_cost_function(T.join(1,self.input,scaninputs(self.idx_list,self.auxinput)),\
-                    rec, self.hid_name)
-        if self.input is not None:
-            return self.reconstruction_cost_function(self.input, rec, self.hid_name)
-        if self.auxinput is not None:
-            return self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput), rec, self.hid_name)
-        # All cases should be covered above. If not, something is wrong!
-        assert False
-    
     def _instance_initialize(self, obj, lr = 1 , reg_coef = 0, noise_level = 0 , noise_level_group = 0,
                             seed=1, alloc=True, **init):
         super(DAAig, self)._instance_initialize(obj, **init)
@@ -365,14 +380,12 @@
             self.inf = 1/numpy.sqrt(sum(self.auxin_size)+self.in_size)
         self.hif = 1/numpy.sqrt(self.n_hid)
         
-        
         if alloc:
             if self.input is not None:
                 wencshp = (self.in_size, self.n_hid)
                 wdecshp = tuple(reversed(wencshp))
                 print 'wencshp = ', wencshp
                 print 'wdecshp = ', wdecshp
-                
                 obj.wenc = self.R.uniform(size=wencshp, low = -self.inf, high = self.inf)
                 if not(self.tie_weights):
                     obj.wdec = self.R.uniform(size=wdecshp, low=-self.hif, high=self.hif)
@@ -383,7 +396,6 @@
                 wauxdecshp = [tuple(reversed(i)) for i in wauxencshp]
                 print 'wauxencshp = ', wauxencshp
                 print 'wauxdecshp = ', wauxdecshp
-                
                 obj.wauxenc = [self.R.uniform(size=i, low = -self.inf, high = self.inf) for i in wauxencshp]
                 if not(self.tie_weights):
                     obj.wauxdec = [self.R.uniform(size=i, low=-self.hif, high=self.hif) for i in wauxdecshp]
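
All weight matrices are drawn uniformly in a +/- 1/sqrt(fan-in) range: encoder weights use the input size, decoder weights the hidden size. The same recipe in plain numpy (hypothetical sizes, illustrative only):

    import numpy
    rng = numpy.random.RandomState(1)
    in_size, n_hid = 784, 100
    inf, hif = 1 / numpy.sqrt(in_size), 1 / numpy.sqrt(n_hid)
    wenc = rng.uniform(low=-inf, high=inf, size=(in_size, n_hid))
    wdec = rng.uniform(low=-hif, high=hif, size=(n_hid, in_size))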
@@ -400,20 +412,21 @@
 class StackedDAAig(module.Module):
     def __init__(self, depth = 1, input = T.dmatrix('input'), auxinput = [None],
                 in_size = None, auxin_size = [None], n_hid = [1],
-                regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                reconstruction_cost_function='cross_entropy',
+                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy', scale_cost=False,
                 n_out = 2, target = None, debugmethod = False, totalupdatebool=False,
                 ignore_missing=None, reconstruct_missing=False,
                 corruption_pattern=None,
                 **init):
         
         super(StackedDAAig, self).__init__()
-        print '\t**** StackedDAAig.__init__ ****'
-        print '\tinput = ', input
-        print '\tauxinput = ', auxinput
-        print '\tin_size = ', in_size
-        print '\tauxin_size = ', auxin_size
-        print '\tn_hid = ', n_hid
+        
+        # utils
+        def listify(param,depth):
+            if type(param) is list:
+                return param if len(param)==depth else [param[0]]*depth
+            else:
+                return [param]*depth
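
listify broadcasts per-layer hyper-parameters: a scalar is repeated depth times, a list with one entry per layer is kept as is, and a list of any other length is collapsed to its first element repeated. For example (illustrative only):

    listify('tanh_act', 3)     # ['tanh_act', 'tanh_act', 'tanh_act']
    listify([100, 50, 25], 3)  # [100, 50, 25] (already one entry per layer)
    listify([100], 3)          # [100, 100, 100]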
         
         # save parameters
         self.depth = depth
@@ -421,11 +434,13 @@
         self.auxinput = auxinput
         self.in_size = in_size
         auxin_size = auxin_size
-        self.n_hid = n_hid
+        self.n_hid = listify(n_hid,depth)
         self.regularize = regularize
-        self.tie_weights = tie_weights
-        self.hid_fn = hid_fn
-        self.reconstruction_cost_function = reconstruction_cost_function
+        self.tie_weights = listify(tie_weights,depth)
+        self.hid_fn = listify(hid_fn,depth)
+        self.rec_fn = listify(rec_fn,depth)
+        self.reconstruction_cost_function = listify(reconstruction_cost_function,depth)
+        self.scale_cost = listify(scale_cost,depth)
         self.n_out = n_out
         self.target = target if target is not None else T.lvector('target')
         self.debugmethod = debugmethod
@@ -434,6 +449,21 @@
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
         
+        print '\t**** StackedDAAig.__init__ ****'
+        print '\tdepth = ', self.depth
+        print '\tinput = ', self.input
+        print '\tauxinput = ', self.auxinput
+        print '\tin_size = ', self.in_size
+        print '\tauxin_size = ', auxin_size
+        print '\tn_hid = ', self.n_hid
+        print '\tregularize = ', self.regularize
+        print '\ttie_weights = ', self.tie_weights
+        print '\thid_fn = ', self.hid_fn
+        print '\trec_fn = ', self.rec_fn
+        print '\treconstruction_cost_function = ', self.reconstruction_cost_function
+        print '\tscale_cost = ', self.scale_cost
+        print '\tn_out = ', self.n_out
+        
         # init for model construction
         inputprec = input
         in_sizeprec = in_size
@@ -448,14 +478,11 @@
         self.globalupdate = [None] * (self.depth+1) # update wrt the layer cost backpropagated until the input layer
         if self.totalupdatebool:
             self.totalupdate = [None] * (self.depth+1) # update wrt all the layers' costs backpropagated until the input layer
-        #
-        self.classify = None
         
-        #others methods
+        # optional methods
         if self.debugmethod:
             self.representation = [None] * (self.depth)
             self.reconstruction = [None] * (self.depth)
-            self.validate = [None] * (self.depth)
             self.noisyinputs = [None] * (self.depth)
             self.compute_localcost = [None] * (self.depth+1)
             self.compute_localgradients = [None] * (self.depth+1)
@@ -464,7 +491,6 @@
             if self.totalupdatebool:
                 self.compute_totalcost = [None] * (self.depth+1)
                 self.compute_totalgradients = [None] * (self.depth+1)
-        #
         
         # some theano Variables we want to keep track on
         if self.regularize:
@@ -481,7 +507,6 @@
         paramstot = []
         paramsenc = []
         self.inputs = [None] * (self.depth+1)
-        
         if self.input is not None:
             self.inputs[0] = [self.input]
         else:
@@ -490,20 +515,19 @@
         offset = 0
         for i in range(self.depth):
             
+            dict_params = dict(input = inputprec, in_size = in_sizeprec, auxin_size = auxin_size[i],
+                    n_hid = self.n_hid[i], regularize = False, tie_weights = self.tie_weights[i], hid_fn = self.hid_fn[i],
+                    rec_fn = self.rec_fn[i], reconstruction_cost_function = self.reconstruction_cost_function[i],
+                    scale_cost = self.scale_cost[i], interface = False, ignore_missing = self.ignore_missing,
+                    reconstruct_missing = self.reconstruct_missing,corruption_pattern = self.corruption_pattern)
             if auxin_size[i] is None:
                 offset +=1
-                param = [inputprec, None, in_sizeprec, auxin_size[i], self.n_hid[i],\
-                    False, self.tie_weights, self.hid_fn, self.reconstruction_cost_function,False]
+                dict_params.update({'auxinput' : None})
             else:
-                param = [inputprec, self.auxinput[i-offset], in_sizeprec, auxin_size[i], self.n_hid[i],\
-                    False, self.tie_weights, self.hid_fn, self.reconstruction_cost_function,False]
-
-            dict_params = dict(ignore_missing = self.ignore_missing,
-                    reconstruct_missing = self.reconstruct_missing,
-                    corruption_pattern = self.corruption_pattern)
+                dict_params.update({'auxinput' : self.auxinput[i-offset]})
             
             print '\tLayer init= ', i+1
-            self.daaig[i] = DAAig(*param, **dict_params)
+            self.daaig[i] = DAAig(**dict_params)
             
             # method input, outputs and parameters update
             if i:
@@ -533,7 +557,6 @@
                     self.regularizationenccost[i] = self.regularizationenccost[i-1]+self.daaig[i-1].regularizationenc
                 else:
                     self.regularizationenccost[i] = 0
-                
                 self.localcost[i] += self.daaig[i].regularization
                 self.globalcost[i] += self.regularizationenccost[i]
                 if self.totalupdatebool:
@@ -556,11 +579,10 @@
             self.globalupdate[i] = theano.Method(self.inputs[i],self.globalcost[i],global_grads)
             if self.totalupdatebool:
                 self.totalupdate[i] = theano.Method(self.inputs[i],self.totalcost[i],total_grads)
-            #
+            
             if self.debugmethod:
                 self.representation[i] = theano.Method(self.inputs[i],self.daaig[i].clean.hidden_activation)
                 self.reconstruction[i] = theano.Method(self.inputs[i],self.daaig[i].clean.rec)
-                self.validate[i] =theano.Method(self.inputs[i], [self.daaig[i].clean.cost, self.daaig[i].clean.rec])
                 self.noisyinputs[i] =theano.Method(self.inputs[i], noisyout)
                 self.compute_localcost[i] = theano.Method(self.inputs[i],self.localcost[i])
                 self.compute_localgradients[i] = theano.Method(self.inputs[i],self.localgradients[i])
@@ -569,13 +591,12 @@
                 if self.totalupdatebool:
                     self.compute_totalcost[i] = theano.Method(self.inputs[i],self.totalcost[i])
                     self.compute_totalgradients[i] = theano.Method(self.inputs[i],self.totalgradients[i])
-            #
             
             paramsenc += self.daaig[i].paramsenc
             inputprec = self.daaig[i].clean.hidden
             in_sizeprec = self.n_hid[i]
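
At the end of each iteration the layer's clean hidden representation becomes the next layer's input, which is what implements the greedy layer-wise stacking. The chaining pattern in a toy sketch (stub class, illustrative only):

    # each layer consumes the previous layer's hidden width
    class ToyLayer(object):
        def __init__(self, in_size, n_hid):
            self.in_size, self.n_hid = in_size, n_hid

    in_sizeprec, n_hid = 784, [100, 50, 25]
    layers = []
    for size in n_hid:
        layers.append(ToyLayer(in_sizeprec, size))
        in_sizeprec = size   # next layer's input width is this layer's code width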
         
-        # supervised layer
+        # supervised layer------------------------------------------------------------------------
         print '\tLayer supervised init'
         self.inputs[-1] = copy.copy(self.inputs[-2])+[self.target]
         self.daaig[-1] = LogRegN(in_sizeprec,self.n_out,sigmoid_act(self.daaig[-2].clean.hidden_activation),self.target)
@@ -608,7 +629,7 @@
         self.globalupdate[-1] = theano.Method(self.inputs[-1],self.globalcost[-1],global_grads)
         if self.totalupdatebool:
             self.totalupdate[-1] = theano.Method(self.inputs[-1],self.totalcost[-1],total_grads)
-
+        
         totallocal_grads={}
         for k in range(self.depth):
             totallocal_grads.update(dict((j, j - self.unsup_lr * g) for j,g in
@@ -616,9 +637,9 @@
         totallocal_grads.update(dict((j, j - self.sup_lr * g) for j,g in zip(self.daaig[-1].params,self.localgradients[-1])))
         self.totallocalupdate = theano.Method(self.inputs[-1],self.localcost,totallocal_grads)
         
+        # interface for the user
         self.classify = theano.Method(self.inputs[-2],self.daaig[-1].argmax_standalone)
         self.NLL = theano.Method(self.inputs[-1],self.daaig[-1]._xent)
-
         
         if self.debugmethod:
             self.compute_localcost[-1] = theano.Method(self.inputs[-1],self.localcost[-1])