changeset 819:7dfecf11cbf4

...
author dumitru@deepnets.mtv.corp.google.com
date Wed, 02 Sep 2009 14:23:50 -0700
parents f4729745bb58 (current diff) db2c26a2c97c (diff)
children 2333cd78f574
files pylearn/algorithms/sandbox/DAA_inputs_groups.py pylearn/sandbox/scan_inputs_groups.py
diffstat 2 files changed, 144 insertions(+), 54 deletions(-)
--- a/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Wed Sep 02 14:22:02 2009 -0700
+++ b/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Wed Sep 02 14:23:50 2009 -0700
@@ -6,9 +6,12 @@
 from theano.compile import module
 
 from pylearn.sandbox.scan_inputs_groups import scaninputs, scandotdec, scandotenc, scannoise, scanbiasdec, \
-        scanmaskenc,scanmaskdec, FillMissing, mask_gradient
+        scanmaskenc,scanmaskdec, FillMissing, mask_gradient, blockgrad
 
 from pylearn.algorithms.logistic_regression import LogRegN
+import pylearn.algorithms.cost
+
+import time
 
 from pylearn.io import filetensor
 import os
@@ -19,7 +22,14 @@
     print 'save ndarray to file: ', save_dir + fname
     file_handle = open(os.path.join(save_dir, fname), 'w')
     filetensor.write(file_handle, mat)
-    file_handle.close()
+    writebool = False
+    while not writebool:
+        try:
+            file_handle.close()
+            writebool = True
+        except:
+            print 'save model error'
+            time.sleep((numpy.random.randint(10)+2)*10)
 
 def load_mat(fname, save_dir=''):
     print 'loading ndarray from file: ', save_dir + fname
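The close() retry loop above guards save_mat against transient filesystem errors (for instance when many jobs write to a shared volume) by sleeping a random interval and trying again indefinitely. A bounded variant of the same pattern, as a standalone sketch (the helper name and the retry limit are illustrative, not part of this changeset):

import time
import numpy

def close_with_retry(file_handle, max_tries=10):
    # same random back-off as save_mat above, but give up after max_tries attempts;
    # the original uses a bare except, IOError is the failure this actually guards against
    for attempt in range(max_tries):
        try:
            file_handle.close()
            return True
        except IOError:
            print 'save model error, retrying (%d/%d)' % (attempt + 1, max_tries)
            time.sleep((numpy.random.randint(10) + 2) * 10)
    return False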
@@ -83,27 +93,42 @@
 def tanh_act(x):
     return theano.tensor.tanh(x/2.0)
 
+def softsign_act(x):
+    return x/(1.0 + theano.tensor.abs(x))
+
 # costs utils:---------------------------------------------------
 # in order to fix numerical instability of the cost and gradient calculation for the cross entropy we calculate it
 # with the following functions directly from the activation:
+# XS (the negative entropy of the target) is subtracted so the cost becomes the KL divergence, which matters for global updates
+
 def sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis):
-    XE =-target * T.log(1 + T.exp(-output_act)) + (1 - target) * (- T.log(1 + T.exp(output_act)))
-    return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
+    XE = target * (- T.log(1 + T.exp(-output_act))) + (1 - target) * (- T.log(1 + T.exp(output_act)))
+    XS = T.xlogx.xlogx(target) + T.xlogx.xlogx(1-target)
+    return -T.mean(T.sum(XE-XS, axis=sum_axis),axis=mean_axis)
 
 def tanh_cross_entropy(target, output_act, mean_axis, sum_axis):
-    XE =-(target+1)/2.0 * T.log(1 + T.exp(- output_act)) + \
+    XE = (target+1)/2.0 * (- T.log(1 + T.exp(- output_act))) + \
             (1 - (target+1)/2.0) * (- T.log(1 + T.exp(output_act)))
-    return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
+    XS = T.xlogx.xlogx((target+1)/2.0) + T.xlogx.xlogx(1-(target+1)/2.0)
+    return -T.mean(T.sum(XE-XS, axis=sum_axis),axis=mean_axis)
+
+def softsign_cross_entropy(target, output_act, mean_axis, sum_axis):
+    newact = ((output_act/(1.0 + theano.tensor.abs(output_act)))+1)/2.0
+    XE = (target+1)/2.0 * T.log(newact) + (1 - (target+1)/2.0) * T.log(1 - newact)
+    XS = T.xlogx.xlogx((target+1)/2.0) + T.xlogx.xlogx(1-(target+1)/2.0)
+    return -T.mean(T.sum(XE-XS, axis=sum_axis),axis=mean_axis)
 
 def cross_entropy(target, output_act, act, mean_axis=0, sum_axis=1):
     if act == 'sigmoid_act':
         return sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis)
     if act == 'tanh_act':
         return tanh_cross_entropy(target, output_act, mean_axis, sum_axis)
+    if act == 'softsign_act':
+        return softsign_cross_entropy(target, output_act, mean_axis, sum_axis)
     assert False
 
-def quadratic(target, output, act, axis = 1):
-    return pylearn.algorithms.cost.quadratic(target, output, axis)
+def quadratic(target, output, act, mean_axis = 0):
+    return T.sum(pylearn.algorithms.cost.quadratic(target, output, mean_axis))
 
 # DAAig module----------------------------------------------------------------
 class DAAig(module.Module):
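In the cost functions above, XE is computed directly from the pre-activation through log(1 + exp(.)), so the cross-entropy stays finite even when the reconstruction saturates, and XS (the negative entropy of the target) is subtracted so that the summed cost is the KL divergence between target and reconstruction, which is exactly zero for a perfect reconstruction. A standalone numpy check of the sigmoid case (values are illustrative):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

def xlogx(x):
    # 0 * log(0) is taken to be 0, as in T.xlogx.xlogx
    return x * numpy.log(numpy.where(x > 0, x, 1.0))

target = numpy.array([0.0, 0.25, 0.5, 0.9])
act = numpy.array([-3.0, 0.5, 2.0, 4.0])     # output_act, i.e. the pre-activation

# naive cross-entropy from sigmoid(act): loses precision / overflows for large |act|
ce_naive = -(target * numpy.log(sigmoid(act)) + (1 - target) * numpy.log(1 - sigmoid(act)))
# stable form used in sigmoid_cross_entropy above
XE = target * (-numpy.log1p(numpy.exp(-act))) + (1 - target) * (-numpy.log1p(numpy.exp(act)))
print numpy.allclose(ce_naive, -XE)          # True: same cross-entropy

# subtracting XS turns the cost into a KL divergence: zero at a perfect reconstruction
t = numpy.array([0.1, 0.25, 0.5, 0.9])
a_perfect = numpy.log(t / (1 - t))           # logit(t), so sigmoid(a_perfect) == t
XE = t * (-numpy.log1p(numpy.exp(-a_perfect))) + (1 - t) * (-numpy.log1p(numpy.exp(a_perfect)))
XS = xlogx(t) + xlogx(1 - t)
print numpy.allclose(-(XE - XS), 0.0)        # True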
@@ -112,10 +137,10 @@
     
     def __init__(self, input = None, auxinput = None,
                 in_size=None, auxin_size= None, n_hid=1,
-                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                regularize = False, tie_weights = False, tie_weights_aux = None, hid_fn = 'tanh_act',
                 rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy',
                 interface = True, ignore_missing=None, reconstruct_missing=False,
-                corruption_pattern=None, **init):
+                corruption_pattern=None, blockgrad = False, **init):
         """
         :param input: WRITEME
         :param auxinput: WRITEME
@@ -152,18 +177,6 @@
         missing inputs will be backpropagated. Otherwise, it will not.
         :todo: Default noise level for all daa levels
         """
-        print '\t\t**** DAAig.__init__ ****'
-        print '\t\tinput = ', input
-        print '\t\tauxinput = ', auxinput
-        print '\t\tin_size = ', in_size
-        print '\t\tauxin_size = ', auxin_size
-        print '\t\tn_hid = ', n_hid
-        print '\t\tregularize = ', regularize
-        print '\t\ttie_weights = ', tie_weights
-        print '\t\thid_fn = ', hid_fn
-        print '\t\trec_fn = ', rec_fn
-        print '\t\treconstruction_cost_function = ', reconstruction_cost_function
-        
         super(DAAig, self).__init__()
         self.random = T.RandomStreams()
         
@@ -173,15 +186,17 @@
         self.n_hid = n_hid
         self.regularize = regularize
         self.tie_weights = tie_weights
+        self.tie_weights_aux = tie_weights_aux if tie_weights_aux is not None else tie_weights
         self.interface = interface
         self.ignore_missing = ignore_missing
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
+        self.blockgrad = blockgrad
         
-        assert hid_fn in ('sigmoid_act','tanh_act')
+        assert hid_fn in ('sigmoid_act','tanh_act','softsign_act')
         self.hid_fn = eval(hid_fn)
         
-        assert rec_fn in ('sigmoid_act','tanh_act')
+        assert rec_fn in ('sigmoid_act','tanh_act','softsign_act')
         self.rec_fn = eval(rec_fn)
         self.rec_name = rec_fn
         
@@ -189,6 +204,19 @@
         self.reconstruction_cost_function = eval(reconstruction_cost_function)
         self.reconstruction_cost_function_name = reconstruction_cost_function
         
+        print '\t\t**** DAAig.__init__ ****'
+        print '\t\tinput = ', input
+        print '\t\tauxinput = ', auxinput
+        print '\t\tin_size = ', self.in_size
+        print '\t\tauxin_size = ', self.auxin_size
+        print '\t\tn_hid = ', self.n_hid
+        print '\t\tregularize = ', self.regularize
+        print '\t\ttie_weights = ', self.tie_weights
+        print '\t\ttie_weights_aux = ', self.tie_weights_aux
+        print '\t\thid_fn = ', hid_fn
+        print '\t\trec_fn = ', rec_fn
+        print '\t\treconstruction_cost_function = ', reconstruction_cost_function
+        
         ### DECLARE MODEL VARIABLES and default
         self.input = input
         if self.ignore_missing is not None and self.input is not None:
@@ -212,7 +240,8 @@
         
         if self.auxinput is not None:
             self.wauxenc = [T.dmatrix('wauxenc%s'%i) for i in range(len(auxin_size))]
-            self.wauxdec = [self.wauxenc[i].T if tie_weights else T.dmatrix('wauxdec%s'%i) for i in range(len(auxin_size))]
+            self.wauxdec = [self.wauxenc[i].T if self.tie_weights_aux else T.dmatrix('wauxdec%s'%i)
+                    for i in range(len(auxin_size))]
             self.bauxdec = [T.dvector('bauxdec%s'%i) for i in range(len(auxin_size))]
         
         #hyper-parameters
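tie_weights_aux (which defaults to tie_weights when not given) now controls the auxiliary decoder separately from the main one: when tied, wauxdec is simply the transpose of wauxenc and contributes no extra parameters; otherwise an independent decoding matrix is declared. A standalone numpy sketch of what tying means for the reconstruction path (sizes and names are illustrative):

import numpy

rng = numpy.random.RandomState(0)
auxin_size, n_hid = 20, 5
wauxenc = rng.uniform(-0.1, 0.1, size=(auxin_size, n_hid))

tie_weights_aux = True
# tied: decoder is the encoder transposed (shared parameters); untied: a separate matrix
wauxdec = wauxenc.T if tie_weights_aux else rng.uniform(-0.1, 0.1, size=(n_hid, auxin_size))

x = rng.uniform(size=(3, auxin_size))               # a small minibatch
hidden = numpy.tanh(numpy.dot(x, wauxenc) / 2.0)    # tanh_act, as defined earlier in this file
rec = numpy.tanh(numpy.dot(hidden, wauxdec) / 2.0)
print rec.shape                                     # (3, 20): back in the auxiliary input space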
@@ -330,8 +359,8 @@
         tmpbool = (self.reconstruction_cost_function_name == 'cross_entropy')
         if (self.input is not None):
             container.reconstruction_cost_in = \
-                self.reconstruction_cost_function(self.input, container.rec_activation_in \
-                if tmpbool else container.rec_in, self.rec_name)
+                self.reconstruction_cost_function(blockgrad(self.input) if self.blockgrad else self.input,\
+                        container.rec_activation_in if tmpbool else container.rec_in, self.rec_name)
         if (self.auxinput is not None):
             container.reconstruction_cost_aux = \
                 self.reconstruction_cost_function(scaninputs(self.idx_list, self.auxinput), container.rec_activation_aux \
@@ -370,6 +399,7 @@
         if not(self.tie_weights):
             if self.input is not None:
                 self.params += [self.wdec]
+        if not(self.tie_weights_aux):
             if self.auxinput is not None:
                 self.params += self.wauxdec
     
@@ -391,8 +421,8 @@
         self.update = theano.Method(listin, self.noise.cost, self.updates)
         self.compute_cost = theano.Method(listin, self.noise.cost)
         self.noisify = theano.Method(listin, listout)
-        self.recactivation = theano.Method(listin, self.clean.rec_activation)
-        self.reconstruction = theano.Method(listin, self.clean.rec)
+        self.recactivation = theano.Method(listin, self.noise.rec_activation)
+        self.reconstruction = theano.Method(listin, self.noise.rec)
         self.activation = theano.Method(listin, self.clean.hidden_activation)
         self.representation = theano.Method(listin, self.clean.hidden)
     
@@ -441,12 +471,12 @@
                 wauxdecshp = [tuple(reversed(i)) for i in wauxencshp]
                 obj.bauxdec = [numpy.zeros(i) for i in self.auxin_size]
                 obj.wauxenc = [self.R.uniform(size=i, low = -self.inf, high = self.inf) for i in wauxencshp]
-                if not(self.tie_weights):
+                if not(self.tie_weights_aux):
                     obj.wauxdec = [copy.copy(obj.wauxenc[i].T) for i in range(len(wauxdecshp))] if tieinit else\
                             [self.R.uniform(size=i, low=-self.hif, high=self.hif) for i in wauxdecshp]
                 if orthoinit:
                     obj.wauxenc = [orthogonalinit(w) for w in obj.wauxenc]
-                    if not(self.tie_weights):
+                    if not(self.tie_weights_aux):
                         obj.wauxdec = [orthogonalinit(w,0) for w in obj.wauxdec]
                 print 'wauxencshp = ', wauxencshp
                 print 'wauxdecshp = ', wauxdecshp
@@ -462,11 +492,11 @@
 class StackedDAAig(module.Module):
     def __init__(self, depth = 1, input = T.dmatrix('input'), auxinput = [None],
                 in_size = None, auxin_size = [None], n_hid = [1],
-                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                regularize = False, tie_weights = False, tie_weights_aux = None, hid_fn = 'tanh_act',
                 rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy',
                 n_out = 2, target = None, debugmethod = False, totalupdatebool=False,
                 ignore_missing=None, reconstruct_missing=False,
-                corruption_pattern=None,
+                corruption_pattern=None, blockgrad = False, act_reg = 'sigmoid_act',
                 **init):
         
         super(StackedDAAig, self).__init__()
@@ -487,6 +517,7 @@
         self.n_hid = listify(n_hid,depth)
         self.regularize = regularize
         tie_weights = listify(tie_weights,depth)
+        tie_weights_aux = listify(tie_weights_aux,depth)
         hid_fn = listify(hid_fn,depth)
         rec_fn = listify(rec_fn,depth)
         reconstruction_cost_function = listify(reconstruction_cost_function,depth)
@@ -497,6 +528,10 @@
         self.ignore_missing = ignore_missing
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
+        self.blockgrad = blockgrad
+        
+        assert act_reg in ('sigmoid_act','tanh_act','softsign_act')
+        self.act_reg = eval(act_reg)
         
         print '\t**** StackedDAAig.__init__ ****'
         print '\tdepth = ', self.depth
@@ -507,9 +542,13 @@
         print '\tn_hid = ', self.n_hid
         print '\tregularize = ', self.regularize
         print '\ttie_weights = ', tie_weights
+        print '\ttie_weights_aux = ', tie_weights_aux
         print '\thid_fn = ', hid_fn
         print '\trec_fn = ', rec_fn
+        print '\tact_reg = ', act_reg
         print '\treconstruction_cost_function = ', reconstruction_cost_function
+        print '\tblockgrad = ', blockgrad
         print '\tn_out = ', self.n_out
         
         # init for model construction
@@ -529,7 +568,7 @@
         
         # facultative methods
         if self.debugmethod:
-            self.activation = [None] * (self.depth)
+            self.activation = [None] * (self.depth+1)
             self.representation = [None] * (self.depth)
             self.recactivation = [None] * (self.depth)
             self.reconstruction = [None] * (self.depth)
@@ -570,10 +609,12 @@
         for i in range(self.depth):
             
             dict_params = dict(input = inputprec, in_size = in_sizeprec, auxin_size = auxin_size[i],
-                    n_hid = self.n_hid[i], regularize = False, tie_weights = tie_weights[i], hid_fn = hid_fn[i],
+                    n_hid = self.n_hid[i], regularize = False, tie_weights = tie_weights[i],
+                    tie_weights_aux = tie_weights_aux[i], hid_fn = hid_fn[i],
                     rec_fn = rec_fn[i], reconstruction_cost_function = reconstruction_cost_function[i],
                     interface = False, ignore_missing = self.ignore_missing,
-                    reconstruct_missing = self.reconstruct_missing,corruption_pattern = self.corruption_pattern)
+                    reconstruct_missing = self.reconstruct_missing,corruption_pattern = self.corruption_pattern,
+                    blockgrad=self.blockgrad)
             if auxin_size[i] is None:
                 offset +=1
                 dict_params.update({'auxinput' : None})
@@ -606,7 +647,7 @@
             if self.regularize:
                 self.regularizationenccost[i] = self.regularizationenccost[i-1]+self.daaig[i-1].regularizationenc if i else 0
                 self.localcost[i] += self.daaig[i].regularization
-                self.globalcost[i] += self.regularizationenccost[i]
+                self.globalcost[i] += self.regularizationenccost[i] + self.daaig[i].regularization
                 if self.totalupdatebool:
                     self.totalcost[i] += self.daaig[i].regularization
             
@@ -634,8 +675,8 @@
             if self.debugmethod:
                 self.activation[i] = theano.Method(self.inputs[i],self.daaig[i].clean.hidden_activation)
                 self.representation[i] = theano.Method(self.inputs[i],self.daaig[i].clean.hidden)
-                self.recactivation[i] = theano.Method(self.inputs[i],self.daaig[i].clean.rec_activation)
-                self.reconstruction[i] = theano.Method(self.inputs[i],self.daaig[i].clean.rec)
+                self.recactivation[i] = theano.Method(self.inputs[i],self.daaig[i].noise.rec_activation)
+                self.reconstruction[i] = theano.Method(self.inputs[i],self.daaig[i].noise.rec)
                 self.noisyinputs[i] =theano.Method(self.inputs[i], noisyout)
                 self.compute_localcost[i] = theano.Method(self.inputs[i],self.localcost[i])
                 self.compute_localgradients[i] = theano.Method(self.inputs[i],self.localgradients[i])
@@ -654,7 +695,7 @@
         # supervised layer------------------------------------------------------------------------
         print '\tLayer supervised init'
         self.inputs[-1] = copy.copy(self.inputs[-2])+[self.target]
-        self.daaig[-1] = LogRegN(in_sizeprec,self.n_out,sigmoid_act(self.daaig[-2].clean.hidden_activation),self.target)
+        self.daaig[-1] = LogRegN(in_sizeprec,self.n_out,self.act_reg(self.daaig[-2].clean.hidden_activation),self.target)
         paramstot += self.daaig[-1].params
         
         self.localcost[-1] = self.daaig[-1].regularized_cost \
@@ -694,6 +735,7 @@
         self.NLL = theano.Method(self.inputs[-1],self.daaig[-1]._xent)
         
         if self.debugmethod:
+            self.activation[-1] = theano.Method(self.inputs[-2],self.daaig[-1].linear_output)
             self.compute_localcost[-1] = theano.Method(self.inputs[-1],self.localcost[-1])
             self.compute_localgradients[-1] = theano.Method(self.inputs[-1],self.localgradients[-1])
             self.compute_globalcost[-1] = theano.Method(self.inputs[-1],self.globalcost[-1])
@@ -748,11 +790,12 @@
                 save_mat('wenc%s.ft'%(i) ,inst.daaig[i].wenc, save_dir)
                 save_mat('bdec%s.ft'%(i) ,inst.daaig[i].bdec, save_dir)
             
-            if not self.daaig[i].tie_weights:
+            if not self.daaig[i].tie_weights_aux:
                 if self.daaig[i].auxinput is not None:
                     for j in range(len(inst.daaig[i].wauxdec)):
                         save_mat('wauxdec%s_%s.ft'%(i,j) ,inst.daaig[i].wauxdec[j], save_dir)
-                
+            
+            if not self.daaig[i].tie_weights:
                 if self.daaig[i].input is not None:
                     save_mat('wdec%s.ft'%(i) ,inst.daaig[i].wdec, save_dir)
         i=i+1
@@ -778,7 +821,7 @@
                 inst.daaig[i].wenc = load_mat('wenc%s.ft'%(i),save_dir)/coefenc[i]
                 inst.daaig[i].bdec = load_mat('bdec%s.ft'%(i),save_dir)/coefdec[i]
             
-            if not self.daaig[i].tie_weights:
+            if not self.daaig[i].tie_weights_aux:
                 if self.daaig[i].auxinput is not None:
                     for j in range(len(inst.daaig[i].wauxdec)):
                         if 'wauxdec%s_%s.ft'%(i,j) in os.listdir(save_dir):
@@ -786,7 +829,8 @@
                         else:
                             print "WARNING: no decoding 'wauxdec%s_%s.ft' file use 'wauxenc%s_%s.ft' instead"%(i,j,i,j)
                             inst.daaig[i].wauxdec[j] = numpy.transpose(load_mat('wauxenc%s_%s.ft'%(i,j),save_dir)/coefdec[i])
-                
+            
+            if not self.daaig[i].tie_weights:
                 if self.daaig[i].input is not None:
                     if 'wdec%s.ft'%(i) in os.listdir(save_dir):
                         inst.daaig[i].wdec = load_mat('wdec%s.ft'%(i),save_dir)/coefdec[i]
@@ -808,11 +852,28 @@
         return numpy.mean(numpy.median(abs(inst.recactivation[layer](*inputs)),1))
     
     def _instance_error(self,inst,inputs,target):
-        return numpy.sum(inst.classify(*inputs) != target) / float(len(target)) *100.0
+        return numpy.sum(inst.classify(*inputs) != target) / float(len(target))*100.0
     
     def _instance_nll(self,inst,inputs,target):
         return numpy.sum(inst.NLL(*(inputs+[target]))) / float(len(target))
     
+    #try--------------------------------------------------------------------
+    def _instance_rescalwsaturation(self,inst,inputs):
+        sat = [None]*(self.depth+1)
+        for i in range(self.depth+1):
+            sat[i] = inst.hidsaturation(i,inputs[min(i,self.depth-1)])
+        
+        for i in range(self.depth-1):
+            if sat[i+1] > max(sat[:i+1]):
+                inst.daaig[i+1].wenc = inst.daaig[i+1].wenc/sat[i+1]*max(sat[:i+1])
+                inst.daaig[i+1].benc = inst.daaig[i+1].benc/sat[i+1]*max(sat[:i+1])
+                sat[i+1] = max(sat[:i+1])
+        if sat[-1]>max(sat[:-1]):
+            inst.daaig[-1].w = inst.daaig[-1].w/sat[-1]*max(sat[:-1])
+            inst.daaig[-1].b = inst.daaig[-1].b/sat[-1]*max(sat[:-1])
+    
+    #-----------------------------------------------------------------------
+    
     def _instance_unsupgrad(self,inst,inputs,layer,param_name):
         inst.noiseseed(0)
         gradin = inst.compute_localgradients_in[layer](*inputs)
@@ -844,14 +905,14 @@
     
     def _instance_unsupupdate(self,inst,data,layer='all',typeup = 'local',printcost = False):
         cost = [None]*self.depth
-        if typeup is 'totallocal':
+        if typeup == 'totallocal':
             cost[-1] = inst.totallocalupdate(*data)
         else: 
-            if typeup is 'total':
-                if layer is 'all':
-                    cost[-1] = inst.totalupdate[-1](*data)
+            if typeup == 'total':
+                if layer == 'all':
+                    cost[-1] = inst.totalupdate[-1](*data[-1])
                 else:
-                    cost[layer] = inst.totalupdate[layer](*data)
+                    cost[layer] = inst.totalupdate[layer](*data[layer])
             else:
                 if layer is 'all':
                     for i in range(self.depth):
@@ -859,19 +920,23 @@
                             cost[i] = inst.localupdate[i](*data[i])
                         if typeup == 'global':
                             cost[i] = inst.globalupdate[i](*data[i])
+                            for j in range(i):
+                                dummy = inst.localupdate[j](*data[j])
                 else:
                     if typeup == 'local':
-                        cost[layer] = inst.localupdate[i](*data)
+                        cost[layer] = inst.localupdate[layer](*data[layer])
                     if typeup == 'global':
-                        cost[layer] = inst.globalupdate[i](*data)
+                        cost[layer] = inst.globalupdate[layer](*data[layer])
+                        for j in range(layer):
+                            dummy = inst.localupdate[j](*data[j])
         if printcost:
             print cost
         return cost
     
     def _instance_supupdate(self,inst,data,typeup = 'global',printcost = False):
-        if typeup is 'local':
+        if typeup == 'local':
             cost = inst.localupdate[-1](*data)
-        if typeup is 'global':
+        if typeup == 'global':
             cost = inst.globalupdate[-1](*data)
         if printcost:
             print cost
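The update helpers above now compare the typeup and layer strings with == instead of is: identity comparison on strings only works by accident when both objects happen to be interned, which CPython does not guarantee for strings built at run time, so the old tests could silently fail and skip every branch. A standalone illustration:

a = 'totallocal'
b = ''.join(['total', 'local'])   # same value, constructed at run time
print a == b                      # True: value equality
print a is b                      # typically False: two distinct string objects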
--- a/pylearn/sandbox/scan_inputs_groups.py	Wed Sep 02 14:22:02 2009 -0700
+++ b/pylearn/sandbox/scan_inputs_groups.py	Wed Sep 02 14:23:50 2009 -0700
@@ -71,6 +71,31 @@
         if nbias != 1: raise TypeError('not vector', bias_list[i])
     return bias_list
 
+
+# block grad Op------------------------------------
+class BlockGrad(Op):
+    """This Op block the gradient of a variable"""
+    def make_node(self, x):
+        x = T.as_tensor_variable(x)
+        if x.ndim == 1:
+            return Apply(self, [x], [T.dvector()])
+        else:
+            return Apply(self, [x], [T.dmatrix()])
+    
+    def perform(self, node, x, (out,)):
+        out[0] = x[0].copy()
+    
+    def grad(self, x, (gx,)):
+        return [gx*0]
+    
+    def __hash__(self):
+        return hash(BlockGrad)^77612
+    
+    def __str__(self):
+        return "BlockGrad"
+
+blockgrad = BlockGrad()
+
 # Encoding scan dot product------------------------------------
 class ScanDotEnc(Op):
     """This Op takes an index list (as tensor.ivector), a list of matrices representing