Mercurial > pylearn

--- a/_test_dataset.py	Fri May 23 17:11:39 2008 -0400
+++ b/_test_dataset.py	Fri May 23 17:12:12 2008 -0400
@@ -1,6 +1,8 @@
 from dataset import *
 from math import *
 import unittest
+import sys
+import numpy as N

 def _sum_all(a):
     s=a
@@ -92,6 +94,90 @@
         print b('x+y')


+
+
+# to be used with any new dataset
+class T_dataset_tester(object):
+    """
+    Smoke-test harness for any newly created dataset: it checks the
+    normal behaviours of a dataset, as defined in dataset.py (so far:
+    basic stats, slicing, and fields/iterator consistency).
+    """
+
+
+    def __init__(self,ds,runall=True) :
+        """Store ds; run every test on it unless runall=False (to run only a subset by hand)."""
+        self.ds = ds
+
+        if runall :
+            self.test1_basicstats(ds)
+            self.test2_slicing(ds)
+            self.test3_fields_iterator_consistency(ds)
+
+    def test1_basicstats(self,ds) :
+        """Print basic stats on a dataset: its length, its number of fields and the field types."""
+        # The type reported for each field name k is the type of ds[0](k)[0].
+        print 'len(ds) = ',len(ds)
+        print 'num fields = ', len(ds.fieldNames())
+        print 'types of field: ',
+        for k in ds.fieldNames() :
+            print type(ds[0](k)[0]),
+        print ''
+
+    def test2_slicing(self,ds) :
+        """Check that overlapping slices and a stride-2 slice of ds agree on their common examples."""
+        print 'testing slicing...',
+        sys.stdout.flush()
+        # set1 and set2 both cover ds[middle-tenpercent:middle+tenpercent]
+        middle = len(ds) / 2
+        tenpercent = int(len(ds) * .1)
+        set1 = ds[:middle+tenpercent]
+        set2 = ds[middle-tenpercent:]
+        for k in range(tenpercent + tenpercent -1):
+            for k2 in ds.fieldNames() :
+                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
+                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
+                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
+                else :
+                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
+        assert tenpercent > 1 # so that the stride-2 slice set3 has an element at index 1
+        set3 = ds[middle-tenpercent:middle+tenpercent:2]
+        for k2 in ds.fieldNames() :
+            if type(set2[2](k2)[0]) == N.ndarray :
+                for k3 in range(len(set2[2](k2)[0])) :
+                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
+            else :
+                assert set2[2](k2)[0] == set3[1](k2)[0]
+
+        print 'done'
+
+
+    def test3_fields_iterator_consistency(self,ds) :
+        """Check that iterated examples and minibatches have as many entries as fields requested."""
+        print 'testing fields/iterator consistency...',
+        sys.stdout.flush()
+
+        # basic test: every example of ds[:maxsize] has one entry per field of ds
+        maxsize = min(len(ds)-1,100)
+        for iter in ds[:maxsize] :
+            assert len(iter) == len(ds.fieldNames())
+        if len(ds.fieldNames()) == 1 :
+            print 'done'
+            return
+
+        # with the minibatches iterator, restricted to the first two fields
+        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
+        for iter in ds2 :
+            assert len(iter) == 2
+
+        print 'done'
+
+
+
+
+
+###################################################################
+# main
 if __name__ == '__main__':
     unittest.main()
--- a/dataset.py	Fri May 23 17:11:39 2008 -0400
+++ b/dataset.py	Fri May 23 17:12:12 2008 -0400
@@ -245,8 +245,7 @@
             if n_batches is not None:
                 ds_nbatches = min(n_batches,ds_nbatches)
             if fieldnames:
-                if not dataset.hasFields(*fieldnames):
-                    raise ValueError('field not present', fieldnames)
+                assert dataset.hasFields(*fieldnames)
             else:
                 self.fieldnames=dataset.fieldNames()
             self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
@@ -670,6 +669,11 @@
         assert len(fields_lookuplist)>0
         self.length=len(fields_lookuplist[0])
         for field in fields_lookuplist[1:]:
+            if self.length != len(field) :
+                print 'self.length = ',self.length
+                print 'len(field) = ', len(field)
+                print 'self._fields.keys() = ', self._fields.keys()
+                print 'field=',field
             assert self.length==len(field)
         self.values_vstack=values_vstack
         self.values_hstack=values_hstack
@@ -698,8 +702,13 @@
         return True

     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        #@TODO bug somewhere here, fieldnames doesn't seem to be well handled
         class Iterator(object):
-            def __init__(self,ds):
+            def __init__(self,ds,fieldnames):
+                # tbm: added two next lines to handle fieldnames
+                if fieldnames is None: fieldnames = ds._fields.keys()
+                self.fieldnames = fieldnames
+
                 self.ds=ds
                 self.next_example=offset
                 assert minibatch_size > 0
@@ -710,13 +719,21 @@
             def next(self):
                 upper = self.next_example+minibatch_size
                 assert upper<=self.ds.length
-                minibatch = Example(self.ds._fields.keys(),
-                                    [field[self.next_example:upper]
-                                     for field in self.ds._fields])
+                #minibatch = Example(self.ds._fields.keys(),
+                #                    [field[self.next_example:upper]
+                #                     for field in self.ds._fields])
+                # tbm: modif to use fieldnames
+                values = []
+                for f in self.fieldnames :
+                    #print 'we have field',f,'in fieldnames'
+                    values.append( self.ds._fields[f][self.next_example:upper] )
+                minibatch = Example(self.fieldnames,values)
+                #print minibatch
                 self.next_example+=minibatch_size
                 return minibatch

-        return Iterator(self)
+        # tbm: added fieldnames to handle subset of fieldnames
+        return Iterator(self,fieldnames)

     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -970,16 +987,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-
-                if 0:
-                    #I changed this because it didn't make sense to me,
-                    # and it made it more difficult to write my learner.
-                    # If it breaks stuff, let's talk about it.
-                    # - James 22/05/2008
-                    self.fields_columns[fieldname]=[fieldcolumns]
-                else:
-                    self.fields_columns[fieldname]=fieldcolumns
-
+                self.fields_columns[fieldname]=[fieldcolumns]
             elif type(fieldcolumns) is slice:
                 start,step=None,None
                 if not fieldcolumns.start:
--- a/denoising_aa.py	Fri May 23 17:11:39 2008 -0400
+++ b/denoising_aa.py	Fri May 23 17:12:12 2008 -0400
@@ -31,6 +31,7 @@

 def squash_affine_formula(squash_function=sigmoid):
     """
+    Simply does: squash_function(b + xW)
     By convention prefix the parameters by _
     """
     class SquashAffineFormula(Formulas):
@@ -53,7 +54,7 @@
     class ProbabilisticClassifierLossFormula(Formulas):
         a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output
         target_class = t.ivector() # dimension (minibatch_size)
-        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class)
+        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py
     return ProbabilisticClassifierLossFormula()

 def binomial_cross_entropy_formula():
@@ -64,6 +65,8 @@
         # using the identity softplus(a) - softplus(-a) = a,
         # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a)
         nll = -t.sum(q*a - softplus(-a))
+    # next line was missing... hope it's all correct above
+    return BinomialCrossEntropyFormula()

 def squash_affine_autoencoder_formula(hidden_squash=t.tanh,
                                       reconstruction_squash=sigmoid,
@@ -102,9 +105,33 @@
         self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x')

     def __call__(self, training_set=None):
+        """Allocate a model; if training_set is non-empty, train it with early stopping and return the best one."""
         model = DenoisingAutoEncoderModel(self)
         if training_set:
-            print 'what do I do if training set????'
+            print 'DenoisingAutoEncoder(): what do I do if training_set????'
+            # copied from mlp_factory_approach; NOTE(review): needs `sys` and `copy` imported by this module - confirm
+            if len(training_set) == sys.maxint:
+                raise NotImplementedError('Learning from infinite streams is not supported')
+            nval = int(self.validation_portion * len(training_set))
+            nmin = len(training_set) - nval
+            assert nmin >= 0
+            minset = training_set[:nmin] #real training set for minimizing loss
+            valset = training_set[nmin:] #validation set for early stopping
+            best = model
+            for stp in self.early_stopper():
+                model.update(
+                    minset.minibatches([input, target], minibatch_size=min(32,
+                        len(training_set))))
+                # FIXME(review): `target` is not bound in this method and `input` is the builtin, unless set at module level
+                if stp.set_score:
+                    stp.score = model(valset, ['loss_01'])
+                    if (stp.score < stp.best_score):
+                        best = copy.copy(model)
+            model = best
+            # end of the copy from mlp_factory_approach
+
+        return model
+

     def compile(self, inputs, outputs):
         return theano.function(inputs,outputs,unpack_single=False,linker=self.linker)
--- a/mlp_factory_approach.py	Fri May 23 17:11:39 2008 -0400
+++ b/mlp_factory_approach.py	Fri May 23 17:12:12 2008 -0400
@@ -17,7 +17,7 @@
 import theano
 from theano import tensor as t

-import dataset, nnet_ops, stopper
+from pylearn import dataset, nnet_ops, stopper


 def _randshape(*shape):
@@ -44,18 +44,19 @@
         """Update this model from more training data."""
         params = self.params
         #TODO: why should we have to unpack target like this?
+        # tbm : creates problem...
         for input, target in input_target:
             rval= self.update_fn(input, target[:,0], *params)
             #print rval[0]

-    def __call__(self, testset, fieldnames=['output_class']):
+    def __call__(self, testset, fieldnames=['output_class'],input='input',target='target'):
         """Apply this model (as a function) to new data"""
         #TODO: cache fn between calls
-        assert 'input' == testset.fieldNames()[0]
+        assert input == testset.fieldNames()[0] # why first one???
         assert len(testset.fieldNames()) <= 2
         v = self.algo.v
         outputs = [getattr(v, name) for name in fieldnames]
-        inputs = [v.input] + ([v.target] if 'target' in testset else [])
+        inputs = [v.input] + ([v.target] if target in testset else [])
         inputs.extend(v.params)
         theano_fn = _cache(self._fn_cache, (tuple(inputs), tuple(outputs)),
                 lambda: self.algo._fn(inputs, outputs))
--- a/test_dataset.py	Fri May 23 17:11:39 2008 -0400
+++ b/test_dataset.py	Fri May 23 17:12:12 2008 -0400
@@ -491,5 +491,4 @@
     test_ArrayDataSet()
     test_CachedDataSet()
     test_ApplyFunctionDataSet()
-
 #test pmat.py