diff _test_dataset.py @ 292:174374d59405

merge
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 06 Jun 2008 15:56:18 -0400
parents 58e17421c69c 8e923cb2e8fc
children 4bfdda107a17
line wrap: on
line diff
--- a/_test_dataset.py	Thu Jun 05 18:43:16 2008 -0400
+++ b/_test_dataset.py	Fri Jun 06 15:56:18 2008 -0400
@@ -1,183 +1,442 @@
+#!/bin/env python
 from dataset import *
 from math import *
-import unittest
-import sys
-import numpy as N
+import numpy,unittest
+from misc import *
+
+def have_raised(to_eval, **var):
+    # True iff evaluating the expression string raises; the expression reads its inputs as var['name'].
+    try:
+        eval(to_eval)
+    except :
+        return True
+    return False
+
+def have_raised2(f, *args, **kwargs):
+    # True iff calling f(*args, **kwargs) raises; the value returned by the call is discarded.
+    try:
+        f(*args, **kwargs)
+    except :
+        return True
+    return False
+
+def test1():  # print-driven smoke test: builds a random 10x4 ArrayDataSet and prints several views of it
+    print "test1"
+    global a,ds  # a and ds are bound as module globals; both are deleted again on the last line
+    a = numpy.random.rand(10,4)
+    print a
+    ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]})
+    print "len(ds)=",len(ds)
+    assert(len(ds)==10)
+    print "example 0 = ",ds[0]
+#    TODO: nothing is asserted about example 0 yet
+    print "x=",ds["x"]
+    print "x|y"
+    for x,y in ds("x","y"):
+        print x,y
+    minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4)
+    minibatch = minibatch_iterator.__iter__().next()  # Python 2 iterator protocol: take only the first minibatch
+    print "minibatch=",minibatch
+    for var in minibatch:
+        print "var=",var
+    print "take a slice and look at field y",ds[1:6:2]["y"]
+
+    del a,ds,x,y,minibatch_iterator,minibatch,var
 
-def _sum_all(a):
-    s=a
-    while isinstance(s,numpy.ndarray):
-        s=sum(s)
-    return s
-    
-class T_arraydataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
+def test_iterate_over_examples(array,ds):  # compares every supported way of iterating ds with the rows of array
+#     - indexing one example at a time, ds[i] (not in doc!!!)
+    i=0
+    for example in range(len(ds)):
+        assert (ds[example]['x']==array[example][:3]).all()
+        assert ds[example]['y']==array[example][3]
+        assert (ds[example]['z']==array[example][[0,2]]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
+
+#     - for example in dataset:
+    i=0
+    for example in ds:
+        assert len(example)==3
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (example['z']==array[i][0:3:2]).all()
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
 
+#     - for val1,val2,... in dataset:
+    i=0
+    for x,y,z in ds:
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,z,i
+
+#     - for example in dataset(field1, field2,field3, ...):
+    i=0
+    for example in ds('x','y','z'):
+        assert len(example)==3
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (example['z']==array[i][0:3:2]).all()
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
+    i=0
+    for example in ds('y','x'):
+        assert len(example)==2
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
 
-    def test_ctor_len(self):
-        n = numpy.random.rand(8,3)
-        a=ArrayDataSet(n)
-        self.failUnless(a.data is n)
-        self.failUnless(a.fields is None)
+#     - for val1,val2,val3 in dataset(field1, field2,field3):
+    i=0
+    for x,y,z in ds('x','y','z'):
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,z,i
+    i=0
+    for y,x in ds('y','x',):
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,i
 
-        self.failUnless(len(a) == n.shape[0])
-        self.failUnless(a[0].shape == (n.shape[1],))
+    def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished):
+        ##each of the first nb_field fields must be a full minibatch or the shorter last minibatch
+        for idx in range(nb_field):
+            test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished)
+        del idx
+    def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished):  # full, or a short batch that ends exactly at len_ds
+        assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)<minibatch_size)
+
+#     - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    i=0
+    mi=0
+    m=ds.minibatches(['x','z'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for minibatch in m:
+        assert isinstance(minibatch,DataSetFields)
+        assert len(minibatch)==2
+        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        if type(ds)==ArrayDataSet:
+            assert (minibatch[0][:,::2]==minibatch[1]).all()
+        else:
+            for j in xrange(len(minibatch[0])):
+                assert (minibatch[0][j][::2]==minibatch[1][j]).all()  # every other value of x (columns 0 and 2) must equal z
+        mi+=1
+        i+=len(minibatch[0])
+    assert i==len(ds)
+    assert mi==4  # 4 minibatches of size 3: holds for the 10-example datasets that test_all passes in
+    del minibatch,i,m,mi
 
-    def test_iter(self):
-        arr = numpy.random.rand(8,3)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
-        for i, example in enumerate(a):
-            self.failUnless(numpy.all( example['x'] == arr[i,:2]))
-            self.failUnless(numpy.all( example['y'] == arr[i,1:3]))
+    i=0
+    mi=0
+    m=ds.minibatches(['x','y'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for minibatch in m:
+        assert len(minibatch)==2
+        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        mi+=1
+        for id in range(len(minibatch[0])):
+            assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all()
+            i+=1
+    assert i==len(ds)
+    assert mi==4
+    del minibatch,i,id,m,mi
 
-    def test_zip(self):
-        arr = numpy.random.rand(8,3)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
-        for i, x in enumerate(a.zip("x")):
-            self.failUnless(numpy.all( x == arr[i,:2]))
+#     - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
+    i=0
+    mi=0
+    m=ds.minibatches(['x','z'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,z in m:
+        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
+        test_minibatch_field_size(z,m.minibatch_size,len(ds),mi)
+        for id in range(len(x)):
+            assert (x[id][::2]==z[id]).all()
+            i+=1
+        mi+=1
+    assert i==len(ds)
+    assert mi==4
+    del x,z,i,m,mi
+    i=0
+    mi=0
+    m=ds.minibatches(['x','y'], minibatch_size=3)
+    for x,y in m:
+        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
+        test_minibatch_field_size(y,m.minibatch_size,len(ds),mi)
+        mi+=1
+        for id in range(len(x)):
+            assert (numpy.append(x[id],y[id])==array[i]).all()
+            i+=1
+    assert i==len(ds)
+    assert mi==4
+    del x,y,i,id,m,mi
+
+#     - minibatches with an explicit n_batches and offset (not in doc)
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[i+4]).all()
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id,m
 
-    def test_minibatch_basic(self):
-        arr = numpy.random.rand(10,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        for i, mb in enumerate(a.minibatches(minibatch_size=2)): #all fields
-            self.failUnless(numpy.all( mb['x'] == arr[i*2:i*2+2,0:2]))
-            self.failUnless(numpy.all( mb['y'] == arr[i*2:i*2+2,1:4]))
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[i+4]).all()
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id,m
+
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()  # expected row index wraps modulo the number of rows
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id
+
+    #@todo: we can't do minibatch bigger than the size of the dataset???
+    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
+    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)
 
-    def test_getattr(self):
-        arr = numpy.random.rand(10,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        a_y = a.y
-        self.failUnless(numpy.all( a_y == arr[:,1:4]))
+def test_ds_iterator(array,iterator1,iterator2,iterator3):  # iterators yield (x,y), (y,z) and (x,y,z); each is compared with array's rows
+    expected=len(iterator1)
+    row=0
+    for x,y in iterator1:
+        assert (x==array[row][:3]).all()
+        assert y==array[row][3]
+        assert (numpy.append(x,y)==array[row]).all()
+        row+=1
+    assert row==expected
+    row=0
+    for y,z in iterator2:
+        assert y==array[row][3]
+        assert (z==array[row][0:3:2]).all()
+        row+=1
+    assert row==expected
+    row=0
+    for x,y,z in iterator3:
+        assert (x==array[row][:3]).all()
+        assert y==array[row][3]
+        assert (z==array[row][0:3:2]).all()
+        assert (numpy.append(x,y)==array[row]).all()
+        row+=1
+    assert row==expected
 
-    def test_minibatch_wraparound_even(self):
-        arr = numpy.random.rand(10,4)
-        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+def test_getitem(array,ds):  # checks ds[...] with a slice, an int and an index list, then hstack on field subsets
+    def test_ds(orig,ds,index):  # ds is expected to hold the examples orig[index[0]], orig[index[1]], ... in that order
+        i=0
+        assert len(ds)==len(index)
+        for x,z,y in ds('x','z','y'):
+            assert (orig[index[i]]['x']==array[index[i]][:3]).all()
+            assert (orig[index[i]]['x']==x).all()
+            assert orig[index[i]]['y']==array[index[i]][3]
+            assert orig[index[i]]['y']==y
+            assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all()
+            assert (orig[index[i]]['z']==z).all()
+            i+=1
+        del i
+        ds[0]  # from here on the results are discarded: these accesses only have to run without raising
+        if len(ds)>2:
+            ds[:1]
+            ds[1:1]
+            ds[1:1:1]
+        if len(ds)>5:
+            ds[[1,2,3]]
+        for x in ds:
+            pass
 
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+#ds[:n] returns a dataset with the n first examples.
+    ds2=ds[:3]
+    assert isinstance(ds2,DataSet)
+    test_ds(ds,ds2,index=[0,1,2])
+    del ds2
+
+#ds[i1:i2:s] returns a ds with the examples i1,i1+s,...i2-s.
+    ds2=ds[1:7:2]
+    assert isinstance(ds2,DataSet)
+    test_ds(ds,ds2,[1,3,5])
+    del ds2
+
+#ds[i] returns a single Example; the index len(ds) must raise, len(ds)-1 must not.
+    ds2=ds[5]
+    assert isinstance(ds2,Example)
+    assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
+    assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds)
+    del ds2
+
+#ds[[i1,i2,...in]] returns a ds with examples i1,i2,...in.
+    ds2=ds[[4,7,2,8]]
+    assert isinstance(ds2,DataSet)
+    test_ds(ds,ds2,[4,7,2,8])
+    del ds2
 
-        #print arr
-        for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)):
-            #print 'x' , x
-            self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2]))
+    #ds.<property> returns the value of a property associated with
+      #the name <property>. The following properties should be supported (not tested here):
+      #    - 'description': a textual description or name for the ds
+      #    - 'fieldtypes': a list of types (one per field)
 
-    def test_minibatch_wraparound_odd(self):
-        arr = numpy.random.rand(10,4)
-        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+    #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
+        #assert hstack([ds('x','y'),ds('z')])==ds
+        #hstack([ds('z','y'),ds('x')])==ds
+    assert have_raised2(hstack,[ds('x'),ds('x')])  # the same field name on both sides must be rejected
+    assert have_raised2(hstack,[ds('y','x'),ds('x')])
+    assert not have_raised2(hstack,[ds('x'),ds('y')])
+        
+    #        i=0
+    #        for example in hstack([ds('x'),ds('y'),ds('z')]):
+    #            example==ds[i]
+    #            i+=1
+    #        del i,example
+    #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
 
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-
-        for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)):
-            self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2]))
+def test_fields_fct(ds):  # checks how many fields ds.fields() yields and how many values each field holds
+    #@todo, fill correctly
+    assert len(ds.fields())==3
+    i=0
+    v=0
+    for field in ds.fields():
+        for field_value in field: # iterate over the values associated to that field for all the ds examples
+            v+=1
+        i+=1
+    assert i==3
+    assert v==3*len(ds)  # one value per example in each of the 3 fields
+    del i,v
+    
+    i=0
+    v=0
+    for field in ds('x','z').fields():
+        i+=1
+        for val in field:
+            v+=1
+    assert i==2
+    assert v==2*len(ds)
+    del i,v
+    
+    i=0
+    v=0
+    for field in ds.fields('x','y'):
+        i+=1
+        for val in field:
+            v+=1
+    assert i==2
+    assert v==2*len(ds)
+    del i,v
     
+    i=0
+    v=0
+    for field_examples in ds.fields():
+        for example_value in field_examples:
+            v+=1
+        i+=1
+    assert i==3
+    assert v==3*len(ds)
+    del i,v
+    
+    assert ds == ds.fields().examples()  # fields() followed by examples() must compare equal to the dataset
+    assert len(ds('x','y').fields()) == 2
+    assert len(ds('x','z').fields()) == 2
+    assert len(ds('y').fields()) == 1
 
-class T_renamingdataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
+    del field
+def test_all(array,ds):  # runs every module-level check on ds, using array as the expected contents
+    assert len(ds)==10  # the checks below hard-code counts and indices for a 10-example dataset
+
+    test_iterate_over_examples(array, ds)
+    test_getitem(array, ds)
+    test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z'))
+    test_fields_fct(ds)
+
+class T_DataSet(unittest.TestCase):  # one test method per dataset class; most are still placeholders
+    def test_ArrayDataSet(self):
+        #don't test stream
+        #tested only with float value
+        #don't always test with y
+        #don't test missing value
+        #don't test with tuple
+        #don't test properties
+        a2 = numpy.random.rand(10,4)
+        ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested; replaced on the next line, so only construction is exercised
+        ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        #assert ds==a? should this work?
+
+        test_all(a2,ds)
+
+        del a2, ds
+
+    def test_CachedDataSet(self):
+        a = numpy.random.rand(10,4)
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        ds2 = CachedDataSet(ds1)
+        ds3 = CachedDataSet(ds1,cache_all_upon_construction=True)
+
+        test_all(a,ds2)
+        test_all(a,ds3)
+
+        del a,ds1,ds2,ds3
 
 
-    def test_hasfield(self):
-        n = numpy.random.rand(3,8)
-        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
-        b=a.rename({'xx':'x','zz':'z'})
-        self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y'))
+    def test_DataSetFields(self):
+        raise NotImplementedError()  # placeholder: always raises until a real test is written
 
-class T_applyfunctiondataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
+    def test_ApplyFunctionDataSet(self):
+        a = numpy.random.rand(10,4)
+        a2 = a+1  # expected contents: both wrapped datasets below add 1 to every field
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+
+        ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False)
+        ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1),
+                                   ['x','y','z'],
+                                   minibatch_mode=True)
 
-    def test_function(self):
-        n = numpy.random.rand(3,8)
-        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
-        b=a.apply_function(lambda x,y: x+y,x+1, ['x','y'], ['x+y','x+1'], False,False,False)
-        print b.fieldNames()
-        print b('x+y')
-        
+        test_all(a2,ds2)
+        test_all(a2,ds3)
 
+        del a,ds1,ds2,ds3
 
-
-# to be used with a any new dataset
-class T_dataset_tester(object):
-    """
-    This class' goal is to test any new dataset that is created
-    Tests are (will be!) designed to check the normal behaviours
-    of a dataset, as defined in dataset.py
-    """
+    def test_FieldsSubsetDataSet(self):
+        raise NotImplementedError()
+    def test_MinibatchDataSet(self):
+        raise NotImplementedError()
+    def test_HStackedDataSet(self):
+        raise NotImplementedError()
+    def test_VStackedDataSet(self):
+        raise NotImplementedError()
+    def test_ArrayFieldsDataSet(self):
+        raise NotImplementedError()
 
 
-    def __init__(self,ds,runall=True) :
-        """if interested in only a subset of test, init with runall=False"""
-        self.ds = ds
-        
-        if runall :
-            self.test1_basicstats(ds)
-            self.test2_slicing(ds)
-            self.test3_fields_iterator_consistency(ds)
-
-    def test1_basicstats(self,ds) :
-        """print basics stats on a dataset, like length"""
-
-        print 'len(ds) = ',len(ds)
-        print 'num fields = ', len(ds.fieldNames())
-        print 'types of field: ',
-        for k in ds.fieldNames() :
-            print type(ds[0](k)[0]),
-        print ''
+if __name__=='__main__':  # when executed as a script, hand control to the unittest runner
+    unittest.main()
 
-    def test2_slicing(self,ds) :
-        """test if slicing works properly"""
-        print 'testing slicing...',
-        sys.stdout.flush()
-        
-        middle = len(ds) / 2
-        tenpercent = int(len(ds) * .1)
-        set1 = ds[:middle+tenpercent]
-        set2 = ds[middle-tenpercent:]
-        for k in range(tenpercent + tenpercent -1):
-            for k2 in ds.fieldNames() :
-                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
-                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
-                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
-                else :
-                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
-        assert tenpercent > 1
-        set3 = ds[middle-tenpercent:middle+tenpercent:2]
-        for k2 in ds.fieldNames() :
-            if type(set2[2](k2)[0]) == N.ndarray :
-                for k3 in range(len(set2[2](k2)[0])) :
-                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
-            else :
-                assert set2[2](k2)[0] == set3[1](k2)[0]
-
-        print 'done'
-
-
-    def test3_fields_iterator_consistency(self,ds) :
-        """ check if the number of iterator corresponds to the number of fields"""
-        print 'testing fields/iterator consistency...',
-        sys.stdout.flush()
-
-        # basic test
-        maxsize = min(len(ds)-1,100)
-        for iter in ds[:maxsize] :
-            assert len(iter) == len(ds.fieldNames())
-        if len(ds.fieldNames()) == 1 :
-            print 'done'
-            return
-
-        # with minibatches iterator
-        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
-        for iter in ds2 :
-            assert len(iter) == 2
-
-        print 'done'
-
-
-
-
-
-###################################################################
-# main
-if __name__ == '__main__':
-    unittest.main()
-