changeset 231:38beb81f4e8b

Automated merge with ssh://projects@lgcm.iro.umontreal.ca/hg/pylearn
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Tue, 27 May 2008 13:46:03 -0400
parents 17c5d080964b (diff) 4d1bd2513e06 (current diff)
children c047238e5b3f 9e96fe8b955c
files dataset.py test_dataset.py
diffstat 9 files changed, 375 insertions(+), 17 deletions(-)
--- a/__init__.py	Fri May 23 10:21:52 2008 -0400
+++ b/__init__.py	Tue May 27 13:46:03 2008 -0400
@@ -1,2 +1,5 @@
 import filetensor
 import nnet_ops
+
+from lookup_list import LookupList
+
--- a/_test_dataset.py	Fri May 23 10:21:52 2008 -0400
+++ b/_test_dataset.py	Tue May 27 13:46:03 2008 -0400
@@ -1,6 +1,8 @@
 from dataset import *
 from math import *
 import unittest
+import sys
+import numpy as N
 
 def _sum_all(a):
     s=a
@@ -92,6 +94,90 @@
         print b('x+y')
         
 
+
+
+# to be used with any new dataset
+class T_dataset_tester(object):
+    """
+    This class's goal is to test any newly created dataset.
+    Tests are (will be!) designed to check the normal behaviour
+    of a dataset, as defined in dataset.py.
+    """
+
+
+    def __init__(self,ds,runall=True) :
+        """if interested in only a subset of test, init with runall=False"""
+        self.ds = ds
+        
+        if runall :
+            self.test1_basicstats(ds)
+            self.test2_slicing(ds)
+            self.test3_fields_iterator_consistency(ds)
+
+    def test1_basicstats(self,ds) :
+        """print basics stats on a dataset, like length"""
+
+        print 'len(ds) = ',len(ds)
+        print 'num fields = ', len(ds.fieldNames())
+        print 'types of field: ',
+        for k in ds.fieldNames() :
+            print type(ds[0](k)[0]),
+        print ''
+
+    def test2_slicing(self,ds) :
+        """test if slicing works properly"""
+        print 'testing slicing...',
+        sys.stdout.flush()
+        
+        middle = len(ds) / 2
+        tenpercent = int(len(ds) * .1)
+        set1 = ds[:middle+tenpercent]
+        set2 = ds[middle-tenpercent:]
+        for k in range(tenpercent + tenpercent -1):
+            for k2 in ds.fieldNames() :
+                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
+                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
+                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
+                else :
+                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
+        assert tenpercent > 1
+        set3 = ds[middle-tenpercent:middle+tenpercent:2]
+        for k2 in ds.fieldNames() :
+            if type(set2[2](k2)[0]) == N.ndarray :
+                for k3 in range(len(set2[2](k2)[0])) :
+                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
+            else :
+                assert set2[2](k2)[0] == set3[1](k2)[0]
+
+        print 'done'
+
+
+    def test3_fields_iterator_consistency(self,ds) :
+        """ check if the number of iterator corresponds to the number of fields"""
+        print 'testing fields/iterator consistency...',
+        sys.stdout.flush()
+
+        # basic test
+        maxsize = min(len(ds)-1,100)
+        for iter in ds[:maxsize] :
+            assert len(iter) == len(ds.fieldNames())
+        if len(ds.fieldNames()) == 1 :
+            print 'done'
+            return
+
+        # with minibatches iterator
+        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
+        for iter in ds2 :
+            assert len(iter) == 2
+
+        print 'done'
+
+
+
+
+
+###################################################################
+# main
 if __name__ == '__main__':
     unittest.main()
     
--- a/dataset.py	Fri May 23 10:21:52 2008 -0400
+++ b/dataset.py	Tue May 27 13:46:03 2008 -0400
@@ -442,7 +442,9 @@
         rows=None
         # or a slice
         if type(i) is slice:
+            #print 'i=',i
             if not i.start: i=slice(0,i.stop,i.step)
+            if not i.stop: i=slice(i.start,len(self),i.step)
             if not i.step: i=slice(i.start,i.stop,1)
             if i.step is 1:
                 return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
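The added i.stop default above mirrors the existing handling of i.start and i.step. As a standalone sketch (plain Python, independent of the DataSet class), the normalization now amounts to:

    # sketch of the slice normalization performed in __getitem__
    def normalize_slice(i, length):
        if not i.start: i = slice(0, i.stop, i.step)
        if not i.stop:  i = slice(i.start, length, i.step)   # new: default stop to len(self)
        if not i.step:  i = slice(i.start, i.stop, 1)
        return i

    print normalize_slice(slice(None, None, None), 10)  # slice(0, 10, 1)
    print normalize_slice(slice(3, None, 2), 10)        # slice(3, 10, 2)
    # note that a stop of 0 is also treated as missing by the truthiness test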
@@ -662,10 +664,16 @@
         and a values_hstack(fieldnames,fieldvalues) functions behaving with the same
         semantics as the DataSet methods of the same name (but without the self argument).
         """
+
         self._fields=fields_lookuplist
         assert len(fields_lookuplist)>0
         self.length=len(fields_lookuplist[0])
         for field in fields_lookuplist[1:]:
+            if self.length != len(field) :
+                print 'self.length = ',self.length
+                print 'len(field) = ', len(field)
+                print 'self._fields.keys() = ', self._fields.keys()
+                print 'field=',field
             assert self.length==len(field)
         self.values_vstack=values_vstack
         self.values_hstack=values_hstack
@@ -694,8 +702,13 @@
         return True
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        #@TODO bug somewhere here, fieldnames doesn't seem to be handled well
         class Iterator(object):
-            def __init__(self,ds):
+            def __init__(self,ds,fieldnames):
+                # tbm: added the next two lines to handle fieldnames
+                if fieldnames is None: fieldnames = ds._fields.keys()
+                self.fieldnames = fieldnames
+
                 self.ds=ds
                 self.next_example=offset
                 assert minibatch_size > 0
@@ -706,13 +719,21 @@
             def next(self):
                 upper = self.next_example+minibatch_size
                 assert upper<=self.ds.length
-                minibatch = Example(self.ds._fields.keys(),
-                                    [field[self.next_example:upper]
-                                     for field in self.ds._fields])
+                #minibatch = Example(self.ds._fields.keys(),
+                #                    [field[self.next_example:upper]
+                #                     for field in self.ds._fields])
+                # tbm: modified to use fieldnames
+                values = []
+                for f in self.fieldnames :
+                    #print 'we have field',f,'in fieldnames'
+                    values.append( self.ds._fields[f][self.next_example:upper] )
+                minibatch = Example(self.fieldnames,values)
+                #print minibatch
                 self.next_example+=minibatch_size
                 return minibatch
 
-        return Iterator(self)
+        # tbm: added fieldnames to handle a subset of the fieldnames
+        return Iterator(self,fieldnames)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
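A short usage sketch of the fieldnames handling added above. Here ds stands for an instance of the fields-lookuplist dataset this method belongs to (the class statement is outside the hunk), so the field names and sizes are placeholders:

    # request a subset of fields, two examples per minibatch
    first_two = ds.fieldNames()[:2]
    it = ds.minibatches_nowrap(first_two, minibatch_size=2, n_batches=1, offset=0)
    mb = it.next()            # an Example keyed by the two requested field names
    assert len(mb) == 2
    # passing fieldnames=None falls back to all of ds._fields.keys()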
@@ -966,7 +987,14 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                self.fields_columns[fieldname]=[fieldcolumns]
+                if 0:
+                    #I changed this because it didn't make sense to me,
+                    # and it made it more difficult to write my learner.
+                    # If it breaks stuff, let's talk about it.
+                    # - James 22/05/2008
+                    self.fields_columns[fieldname]=[fieldcolumns]
+                else:
+                    self.fields_columns[fieldname]=fieldcolumns
             elif type(fieldcolumns) is slice:
                 start,step=None,None
                 if not fieldcolumns.start:
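The practical effect of James's change above, in plain numpy terms (assuming fields_columns is ultimately used to index the data array's columns, as the bounds check against data_array.shape[1] suggests): keeping the bare integer selects a 1-D column, whereas wrapping it in a list kept a trailing axis of size 1:

    import numpy
    data = numpy.arange(12).reshape(6, 2)
    col = 1
    print data[:, [col]].shape   # (6, 1) -- old behaviour, fieldcolumns wrapped in a list
    print data[:, col].shape     # (6,)   -- new behaviour, the integer is kept as-is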
@@ -1165,6 +1193,9 @@
       Note that the expected semantics of the function differs in minibatch mode
       (it takes minibatches of inputs and produces minibatches of outputs, as
       documented in the class comment).
+
+      TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+      (for the newly created dataset)?
       """
       self.input_dataset=input_dataset
       self.function=function
@@ -1207,6 +1238,7 @@
                   return all_outputs
               return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
+
       return ApplyFunctionIterator(self)
 
   def __iter__(self): # only implemented for increased efficiency
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/denoising_aa.py	Tue May 27 13:46:03 2008 -0400
@@ -0,0 +1,216 @@
+"""
+A denoising auto-encoder
+"""
+
+import theano
+from theano.formula import *
+from learner import *
+from theano import tensor as t
+from nnet_ops import *
+import math, sys, copy
+from misc import *
+from theano.tensor_random import binomial
+
+def hiding_corruption_formula(seed,average_fraction_hidden):
+    """
+    Return a formula for the corruption process, in which a random
+    subset of the input numbers are hidden (mapped to 0). 
+
+    @param seed: seed of the random generator
+    @type seed: anything that numpy.random.RandomState accepts
+    
+    @param average_fraction_hidden: the probability with which each
+                                    input number is hidden (set to 0).
+    @type average_fraction_hidden: 0 <= real number <= 1
+    """
+    class HidingCorruptionFormula(Formulas):
+        x = t.matrix()
+        corrupted_x = x * binomial(seed,x,1,1-average_fraction_hidden) # keep each input with probability 1-average_fraction_hidden
+
+    return HidingCorruptionFormula()
+
+def squash_affine_formula(squash_function=sigmoid):
+    """
+    Simply does: squash_function(b + xW)
+    By convention prefix the parameters by _
+    """
+    class SquashAffineFormula(Formulas):
+        x = t.matrix() # of dimensions minibatch_size x n_inputs
+        _b = t.row() # of dimensions 1 x n_outputs
+        _W = t.matrix() # of dimensions n_inputs x n_outputs
+        a = _b + t.dot(x,_W) # of dimensions minibatch_size x n_outputs
+        y = squash_function(a)
+    return SquashAffineFormula()
+
+def gradient_descent_update_formula():
+    class GradientDescentUpdateFormula(Formula):
+        param = t.matrix()
+        learning_rate = t.scalar()
+        cost = t.column() # cost of each example in a minibatch
+        param_update = t.add_inplace(param, -learning_rate*t.sgrad(cost))
+    return GradientDescentUpdateFormula()
+    
+def probabilistic_classifier_loss_formula():
+    class ProbabilisticClassifierLossFormula(Formulas):
+        a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output
+        target_class = t.ivector() # dimension (minibatch_size)
+        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py
+    return ProbabilisticClassifierLossFormula()
+
+def binomial_cross_entropy_formula():
+    class BinomialCrossEntropyFormula(Formulas):
+        a = t.matrix() # pre-sigmoid activations, minibatch_size x dim
+        p = sigmoid(a) # model prediction
+        q = t.matrix() # target binomial probabilities, minibatch_size x dim
+        # using the identity softplus(a) - softplus(-a) = a,
+        # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a)
+        nll = -t.sum(q*a - softplus(a))
+    # next line was missing... hope it's all correct above
+    return BinomialCrossEntropyFormula()
+
+def squash_affine_autoencoder_formula(hidden_squash=t.tanh,
+                                      reconstruction_squash=sigmoid,
+                                      share_weights=True,
+                                      reconstruction_nll_formula=binomial_cross_entropy_formula(),
+                                      update_formula=gradient_descent_update_formula):
+    if share_weights:
+        autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a') + \
+                      squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c') + \
+                      reconstruction_nll_formula
+    else:
+        autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a',_W='_W1') + \
+                      squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c',_W='_W2') + \
+                      reconstruction_nll_formula
+    autoencoder = autoencoder + [update_formula().rename(cost = 'nll',
+                                                         param = p)
+                                 for p in autoencoder.get_all('_.*')]
+    return autoencoder
+
+    
+# @todo: try other corruption formulae. The above is the default one.
+# The one used in the ICML paper was slightly different (it hid a fixed number of inputs).
+
+class DenoisingAutoEncoder(LearningAlgorithm):
+    
+    def __init__(self,n_inputs,n_hidden_per_layer,
+                 learning_rate=0.1,
+                 max_n_epochs=100,
+                 L1_regularizer=0,
+                 init_range=1.,
+                 corruption_formula = hiding_corruption_formula(),
+                 autoencoder = squash_affine_autoencoder_formula(),
+                 minibatch_size=None,linker = "c|py"):
+        for name,val in locals().items():
+            if val is not self: setattr(self,name,val)
+        self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x')
+        
+    def __call__(self, training_set=None):
+        """ Allocate and optionnaly train a model"""
+        model = DenoisingAutoEncoderModel(self)
+        if training_set:
+            print 'DenoisingAutoEncoder(): what do I do if training_set????'
+            # copied from mlp_factory_approach:
+            if len(training_set) == sys.maxint:
+                raise NotImplementedError('Learning from infinite streams is not supported')
+            nval = int(self.validation_portion * len(training_set))
+            nmin = len(training_set) - nval
+            assert nmin >= 0
+            minset = training_set[:nmin] #real training set for minimizing loss
+            valset = training_set[nmin:] #validation set for early stopping
+            best = model
+            for stp in self.early_stopper():
+                model.update(
+                    minset.minibatches(['input', 'target'], minibatch_size=min(32,
+                        len(training_set))))
+                #print 'mlp.__call__(), we did an update'
+                if stp.set_score:
+                    stp.score = model(valset, ['loss_01'])
+                    if (stp.score < stp.best_score):
+                        best = copy.copy(model)
+            model = best
+            # end of the copy from mlp_factory_approach
+ 
+        return model
+
+            
+    def compile(self, inputs, outputs):
+        return theano.function(inputs,outputs,unpack_single=False,linker=self.linker)
+    
+class DenoisingAutoEncoderModel(LearnerModel):
+    def __init__(self,learning_algorithm,params):
+        self.learning_algorithm=learning_algorithm
+        self.params=params
+        v = learning_algorithm.v
+        self.update_fn = learning_algorithm.compile(learning_algorithm.denoising_autoencoder_formula.inputs,
+                                                    learning_algorithm.denoising_autoencoder_formula.outputs)
+
+    def update(self, training_set, train_stats_collector=None):
+        
+        print "don't update, you crazy frog!"
+
+# old stuff
+
+#         self._learning_rate = t.scalar('learning_rate') # this is the symbol
+#         self.L1_regularizer = L1_regularizer
+#         self._L1_regularizer = t.scalar('L1_regularizer')
+#         self._input = t.matrix('input') # n_examples x n_inputs
+#         self._W = t.matrix('W')
+#         self._b = t.row('b')
+#         self._c = t.row('b')
+#         self._regularization_term = self._L1_regularizer * t.sum(t.abs(self._W))
+#         self._corrupted_input = corruption_process(self._input)
+#         self._hidden = t.tanh(self._b + t.dot(self._input, self._W.T))
+#         self._reconstruction_activations =self._c+t.dot(self._hidden,self._W)
+#         self._nll,self._output = crossentropy_softmax_1hot(Print("output_activations")(self._output_activations),self._target_vector)
+#         self._output_class = t.argmax(self._output,1)
+#         self._class_error = t.neq(self._output_class,self._target_vector)
+#         self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0]
+#         OnlineGradientTLearner.__init__(self)
+            
+#     def attributeNames(self):
+#         return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"]
+
+#     def parameterAttributes(self):
+#         return ["b1","W1", "b2", "W2"]
+    
+#     def updateMinibatchInputFields(self):
+#         return ["input","target"]
+    
+#     def updateEndOutputAttributes(self):
+#         return ["regularization_term"]
+
+#     def lossAttribute(self):
+#         return "minibatch_criterion"
+    
+#     def defaultOutputFields(self, input_fields):
+#         output_fields = ["output", "output_class",]
+#         if "target" in input_fields:
+#             output_fields += ["class_error", "nll"]
+#         return output_fields
+        
+#     def allocate(self,minibatch):
+#         minibatch_n_inputs  = minibatch["input"].shape[1]
+#         if not self._n_inputs:
+#             self._n_inputs = minibatch_n_inputs
+#             self.b1 = numpy.zeros((1,self._n_hidden))
+#             self.b2 = numpy.zeros((1,self._n_outputs))
+#             self.forget()
+#         elif self._n_inputs!=minibatch_n_inputs:
+#             # if the input changes dimension on the fly, we resize and forget everything
+#             self.forget()
+            
+#     def forget(self):
+#         if self._n_inputs:
+#             r = self._init_range/math.sqrt(self._n_inputs)
+#             self.W1 = numpy.random.uniform(low=-r,high=r,
+#                                            size=(self._n_hidden,self._n_inputs))
+#             r = self._init_range/math.sqrt(self._n_hidden)
+#             self.W2 = numpy.random.uniform(low=-r,high=r,
+#                                            size=(self._n_outputs,self._n_hidden))
+#             self.b1[:]=0
+#             self.b2[:]=0
+#             self._n_epochs=0
+
+#     def isLastEpoch(self):
+#         self._n_epochs +=1
+#         return self._n_epochs>=self._max_n_epochs
--- a/learner.py	Fri May 23 10:21:52 2008 -0400
+++ b/learner.py	Tue May 27 13:46:03 2008 -0400
@@ -1,6 +1,7 @@
+
 
 from exceptions import *
-
+from dataset import AttributesHolder
 
 class LearningAlgorithm(object):
     """
--- a/mlp_factory_approach.py	Fri May 23 10:21:52 2008 -0400
+++ b/mlp_factory_approach.py	Tue May 27 13:46:03 2008 -0400
@@ -1,10 +1,24 @@
+"""
+
+
+
+This file is deprecated. I'm continuing development in hpu/models.py.
+
+Get that project like this: hg clone ssh://user@lgcm/../bergstrj/hpu
+
+
+
+
+
+"""
 import copy, sys
 import numpy
 
 import theano
 from theano import tensor as t
 
-from tlearn import dataset, nnet_ops, stopper
+from pylearn import dataset, nnet_ops, stopper
+
 
 def _randshape(*shape): 
     return (numpy.random.rand(*shape) -0.5) * 0.001
@@ -30,17 +44,19 @@
         """Update this model from more training data."""
         params = self.params
         #TODO: why should we have to unpack target like this?
+        # tbm: this creates a problem...
         for input, target in input_target:
-            self.update_fn(input, target[:,0], *params)
+            rval= self.update_fn(input, target[:,0], *params)
+            #print rval[0]
 
-    def __call__(self, testset, fieldnames=['output_class']):
+    def __call__(self, testset, fieldnames=['output_class'],input='input',target='target'):
         """Apply this model (as a function) to new data"""
         #TODO: cache fn between calls
-        assert 'input' == testset.fieldNames()[0]
+        assert input == testset.fieldNames()[0] # why first one???
         assert len(testset.fieldNames()) <= 2
         v = self.algo.v
         outputs = [getattr(v, name) for name in fieldnames]
-        inputs = [v.input] + ([v.target] if 'target' in testset else [])
+        inputs = [v.input] + ([v.target] if target in testset else [])
         inputs.extend(v.params)
         theano_fn = _cache(self._fn_cache, (tuple(inputs), tuple(outputs)),
                 lambda: self.algo._fn(inputs, outputs))
@@ -102,7 +118,7 @@
         # prefer caching in _Model.__call__
         return theano.function(inputs, outputs, unpack_single=False, linker=self.linker)
 
-    def __call__(self, trainset=None, iparams=None):
+    def __call__(self, trainset=None, iparams=None, input='input', target='target'):
         """Allocate and optionally train a model"""
         if iparams is None:
             iparams = [_randshape(self.nhid, self.nclass), _randshape(self.nclass)]\
@@ -119,8 +135,9 @@
             best = rval
             for stp in self.early_stopper():
                 rval.update(
-                    trainset.minibatches(['input', 'target'], minibatch_size=min(32,
+                    minset.minibatches([input, target], minibatch_size=min(32,
                         len(trainset))))
+                #print 'mlp.__call__(), we did an update'
                 if stp.set_score:
                     stp.score = rval(valset, ['loss_01'])
                     if (stp.score < stp.best_score):
@@ -154,7 +171,7 @@
                 , linker='c&py'
                 , early_stopper = lambda:stopper.NStages(100,1))
 
-        model1 = learn_algo(training_set1)
+        model1 = learn_algo(training_set1,input='input',target='target')
 
         model2 = learn_algo(training_set2)
 
--- a/nnet_ops.py	Fri May 23 10:21:52 2008 -0400
+++ b/nnet_ops.py	Tue May 27 13:46:03 2008 -0400
@@ -44,7 +44,7 @@
         return ScalarSoftplus.static_impl(x)
     def grad(self, (x,), (gz,)):
         return [gz * scalar_sigmoid(x)]
-    def c_code(self, name, node, (x,), (z,), sub):
+    def c_code(self, node, name, (x,), (z,), sub):
         if node.inputs[0].type in [scalar.float32, scalar.float64]:
             return """%(z)s =
                 %(x)s < -30.0 
--- a/statscollector.py	Fri May 23 10:21:52 2008 -0400
+++ b/statscollector.py	Tue May 27 13:46:03 2008 -0400
@@ -77,7 +77,7 @@
         total_loss = regularizer+t.examplewise_sum(nll)
         avg_nll = t.examplewise_mean(nll)
         avg_class_error = t.examplewise_mean(class_error)
-        for name,val in locals(): val.name = name
+        for name,val in locals().items(): val.name = name
         return StatsCollector([regularizer],[nll,class_error],[total_loss,avg_nll,avg_class_error])
     
 
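The statscollector.py fix above comes down to a basic dict fact: iterating a dict yields keys only, so the old loop tried to unpack each variable name as a (name, val) pair. A standalone illustration:

    d = {'avg_nll': 1.5, 'total_loss': 2.0}
    for k in d:
        print k               # 'avg_nll', 'total_loss' -- just the names
    for name, val in d.items():
        print name, val       # what the StatsCollector code actually needs
    # "for name, val in d:" raises ValueError: too many values to unpack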
--- a/stopper.py	Fri May 23 10:21:52 2008 -0400
+++ b/stopper.py	Tue May 27 13:46:03 2008 -0400
@@ -75,6 +75,9 @@
 
     E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score'
     def next(self):
+
+        #print "ICML08 stopper, we're doing a next"
+
         if self.set_score: #left over from last time
             if self.score is None:
                 raise Exception(ICML08Stopper.E_set_score)