view _test_dataset.py @ 467:f3711bcc467e

Fixed a bug in how embeddings are read
author Joseph Turian <turian@iro.umontreal.ca>
date Mon, 20 Oct 2008 19:14:06 -0400
parents 18702ceb2096
children 82da179d95b2
line wrap: on
line source

#!/bin/env python
from dataset import *
from math import *
import numpy, unittest, sys
#from misc import *
from lookup_list import LookupList

def have_raised(to_eval, **var):
    """Tell whether evaluating the expression string ``to_eval`` raises.

    The string is passed to ``eval`` inside this function, so it can reach
    the keyword arguments through the dict named ``var`` (for instance
    ``"var['ds'][3]"`` when the caller passed ``ds=...``).

    Returns True when anything at all was raised, False otherwise.
    """
    # NOTE(review): the bare ``except`` is kept to preserve behaviour; it
    # also swallows KeyboardInterrupt and SystemExit.
    try:
        eval(to_eval)
    except:
        return True
    return False

def have_raised2(f, *args, **kwargs):
    """Tell whether calling ``f(*args, **kwargs)`` raises.

    The return value of ``f`` is discarded.  Returns True when anything at
    all was raised by the call, False when it completed normally.
    """
    # NOTE(review): the bare ``except`` is kept to preserve behaviour; it
    # also swallows KeyboardInterrupt and SystemExit.
    try:
        f(*args, **kwargs)
    except:
        return True
    else:
        return False

def test1():
    print "test1"
    global a,ds
    a = numpy.random.rand(10,4)
    print a
    ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]})
    print "len(ds)=",len(ds)
    assert(len(ds)==10)
    print "example 0 = ",ds[0]
#    assert
    print "x=",ds["x"]
    print "x|y"
    for x,y in ds("x","y"):
        print x,y
    minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4)
    minibatch = minibatch_iterator.__iter__().next()
    print "minibatch=",minibatch
    for var in minibatch:
        print "var=",var
    print "take a slice and look at field y",ds[1:6:2]["y"]

    del a,ds,x,y,minibatch_iterator,minibatch,var

def test_iterate_over_examples(array,ds):
#not in doc!!!
    i=0
    for example in range(len(ds)):
        wanted = array[example][:3]
        returned = ds[example]['x']
        if (wanted != returned).all():
            print 'returned:', returned
            print 'wanted:', wanted
        assert (ds[example]['x']==array[example][:3]).all()
        assert ds[example]['y']==array[example][3]
        assert (ds[example]['z']==array[example][[0,2]]).all()
        i+=1
    assert i==len(ds)
    del example,i

#     - for example in dataset:
    i=0
    for example in ds:
        assert len(example)==3
        assert (example['x']==array[i][:3]).all()
        assert example['y']==array[i][3]
        assert (example['z']==array[i][0:3:2]).all()
        assert (numpy.append(example['x'],example['y'])==array[i]).all()
        i+=1
    assert i==len(ds)
    del example,i

#     - for val1,val2,... in dataset:
    i=0
    for x,y,z in ds:
        assert (x==array[i][:3]).all()
        assert y==array[i][3]
        assert (z==array[i][0:3:2]).all()
        assert (numpy.append(x,y)==array[i]).all()
        i+=1
    assert i==len(ds)
    del x,y,z,i

#     - for example in dataset(field1, field2,field3, ...):
    i=0
    for example in ds('x','y','z'):
        assert len(example)==3
        assert (example['x']==array[i][:3]).all()
        assert example['y']==array[i][3]
        assert (example['z']==array[i][0:3:2]).all()
        assert (numpy.append(example['x'],example['y'])==array[i]).all()
        i+=1
    assert i==len(ds)
    del example,i
    i=0
    for example in ds('y','x'):
        assert len(example)==2
        assert (example['x']==array[i][:3]).all()
        assert example['y']==array[i][3]
        assert (numpy.append(example['x'],example['y'])==array[i]).all()
        i+=1
    assert i==len(ds)
    del example,i

#     - for val1,val2,val3 in dataset(field1, field2,field3):
    i=0
    for x,y,z in ds('x','y','z'):
        assert (x==array[i][:3]).all()
        assert y==array[i][3]
        assert (z==array[i][0:3:2]).all()
        assert (numpy.append(x,y)==array[i]).all()
        i+=1
    assert i==len(ds)
    del x,y,z,i
    i=0
    for y,x in ds('y','x',):
        assert (x==array[i][:3]).all()
        assert y==array[i][3]
        assert (numpy.append(x,y)==array[i]).all()
        i+=1
    assert i==len(ds)
    del x,y,i

    def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished):
        ##full minibatch or the last minibatch
        for idx in range(nb_field):
            test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished)
        del idx
    def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished):
        assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)<minibatch_size)

#     - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
    i=0
    mi=0
    size=3
    m=ds.minibatches(['x','z'], minibatch_size=size)
    assert hasattr(m,'__iter__')
    for minibatch in m:
        assert isinstance(minibatch,LookupList)
        assert len(minibatch)==2
        test_minibatch_size(minibatch,size,len(ds),2,mi)
        if type(ds)==ArrayDataSet:
            assert (minibatch[0][:,::2]==minibatch[1]).all()
        else:
            for j in xrange(len(minibatch[0])):
                (minibatch[0][j][::2]==minibatch[1][j]).all()
        mi+=1
        i+=len(minibatch[0])
    assert i==(len(ds)/size)*size
    assert mi==(len(ds)/size)
    del minibatch,i,m,mi,size

    i=0
    mi=0
    size=3
    m=ds.minibatches(['x','y'], minibatch_size=size)
    assert hasattr(m,'__iter__')
    for minibatch in m:
        assert isinstance(minibatch,LookupList)
        assert len(minibatch)==2
        test_minibatch_size(minibatch,size,len(ds),2,mi)
        mi+=1
        for id in range(len(minibatch[0])):
            assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all()
            i+=1
    assert i==(len(ds)/size)*size
    assert mi==(len(ds)/size)
    del minibatch,i,id,m,mi,size

#     - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
    i=0
    mi=0
    size=3
    m=ds.minibatches(['x','z'], minibatch_size=size)
    assert hasattr(m,'__iter__')
    for x,z in m:
        test_minibatch_field_size(x,size,len(ds),mi)
        test_minibatch_field_size(z,size,len(ds),mi)
        for id in range(len(x)):
            assert (x[id][::2]==z[id]).all()
            i+=1
        mi+=1
    assert i==(len(ds)/size)*size
    assert mi==(len(ds)/size)
    del x,z,i,m,mi,size

    i=0
    mi=0
    size=3
    m=ds.minibatches(['x','y'], minibatch_size=3)
    assert hasattr(m,'__iter__')
    for x,y in m:
        assert len(x)==size
        assert len(y)==size
        test_minibatch_field_size(x,size,len(ds),mi)
        test_minibatch_field_size(y,size,len(ds),mi)
        mi+=1
        for id in range(len(x)):
            assert (numpy.append(x[id],y[id])==array[i]).all()
            i+=1
    assert i==(len(ds)/size)*size
    assert mi==(len(ds)/size)
    del x,y,i,id,m,mi,size

#not in doc
    i=0
    size=3
    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=size,offset=4)
    assert hasattr(m,'__iter__')
    for x,y in m:
        assert len(x)==size
        assert len(y)==size
        for id in range(size):
            assert (numpy.append(x[id],y[id])==array[i+4]).all()
            i+=1
    assert i==size
    del x,y,i,id,m,size

    i=0
    size=3
    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=size,offset=4)
    assert hasattr(m,'__iter__')
    for x,y in m:
        assert len(x)==size
        assert len(y)==size
        for id in range(size):
            assert (numpy.append(x[id],y[id])==array[i+4]).all()
            i+=1
    assert i==2*size
    del x,y,i,id,m,size

    i=0
    size=3
    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=size,offset=4)
    assert hasattr(m,'__iter__')
    for x,y in m:
        assert len(x)==size
        assert len(y)==size
        for id in range(size):
            assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()
            i+=1
    assert i==2*size # should not wrap
    del x,y,i,id,size

    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)

def test_ds_iterator(array,iterator1,iterator2,iterator3):
    """Check three field views of a dataset against the reference ``array``.

    iterator1 must yield (x, y) pairs, iterator2 (y, z) pairs and iterator3
    (x, y, z) triples, where for row ``i`` of ``array`` x is columns 0..2,
    y is column 3 and z is columns 0 and 2.  Each view must yield exactly
    ``len(iterator1)`` items.
    """
    expected = len(iterator1)

    # (x, y) view
    seen = 0
    for x,y in iterator1:
        reference = array[seen]
        assert (x==reference[:3]).all()
        assert y==reference[3]
        assert (numpy.append(x,y)==reference).all()
        seen += 1
    assert seen==expected

    # (y, z) view
    seen = 0
    for y,z in iterator2:
        reference = array[seen]
        assert y==reference[3]
        assert (z==reference[0:3:2]).all()
        seen += 1
    assert seen==expected

    # (x, y, z) view
    seen = 0
    for x,y,z in iterator3:
        reference = array[seen]
        assert (x==reference[:3]).all()
        assert y==reference[3]
        assert (z==reference[0:3:2]).all()
        assert (numpy.append(x,y)==reference).all()
        seen += 1
    assert seen==expected
    
def test_getitem(array,ds):
    """Check the indexing forms of ``ds`` against the reference ``array``.

    Parameters:
      array -- 2-d numpy array of reference values (columns 0..2 are 'x',
               column 3 is 'y', columns 0 and 2 are 'z').
      ds    -- dataset under test, exposing the fields 'x', 'y' and 'z'.

    Covers ds[:n], ds[i:j], ds[i1:i2:s], ds[i], ds[[i1,...,in]] and the
    field-name clashes that hstack must reject.  Raises AssertionError on
    the first mismatch.
    """
    def test_ds(orig,ds,index):
        """Check that LookupList ``ds`` holds examples ``index`` of ``orig``."""
        assert isinstance(ds,LookupList)
        assert len(ds)==3
        assert len(ds[0])==len(index)
        for i,idx in enumerate(index):
            assert (orig[idx]['x']==array[idx][:3]).all()
            assert (orig[idx]['x']==ds['x'][i]).all()
            assert orig[idx]['y']==array[idx][3]
            # 'y' holds one value per example, so this comparison may yield
            # a plain Python bool, which has no .all() method (presumably
            # the cause of the occasional crash).  numpy.all accepts both
            # plain bools and numpy values.
            assert numpy.all(orig[idx]['y']==ds['y'][i])
            assert (orig[idx]['z']==array[idx][0:3:2]).all()
            assert (orig[idx]['z']==ds['z'][i]).all()
        # Smoke test: further indexing and iteration of the result only has
        # to run without raising.
        ds[0]
        if len(ds)>2:
            ds[:1]
            ds[1:1]
            ds[1:1:1]
        if len(ds)>5:
            ds[[1,2,3]]
        for x in ds:
            pass

#ds[:n] returns a LookupList with the n first examples.
    ds2=ds[:3]
    test_ds(ds,ds2,index=[0,1,2])
    del ds2

#ds[i:j] returns a LookupList with examples i,i+1,...,j-1.
    ds2=ds[1:3]
    test_ds(ds,ds2,index=[1,2])
    del ds2

#ds[i1:i2:s] returns a LookupList with the examples i1,i1+s,...i2-s.
    ds2=ds[1:7:2]
    test_ds(ds,ds2,[1,3,5])
    del ds2

#ds[i] returns the (i+1)-th example of the dataset.
    ds2=ds[5]
    assert isinstance(ds2,Example)
    # Indexing one past the end must raise; the last valid index must not.
    assert have_raised2(lambda: ds[len(ds)])  # index not defined
    assert not have_raised2(lambda: ds[len(ds)-1])
    del ds2

#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in.
    ds2=ds[[4,7,2,8]]
#    assert isinstance(ds2,DataSet)
    test_ds(ds,ds2,[4,7,2,8])
    del ds2

    #ds.<property># returns the value of a property associated with
      #the name <property>. The following properties should be supported:
      #    - 'description': a textual description or name for the ds
      #    - 'fieldtypes': a list of types (one per field)

    #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
        #assert hstack([ds('x','y'),ds('z')])==ds
        #hstack([ds('z','y'),ds('x')])==ds
    # Stacking views that share a field name must raise; disjoint field
    # names must be accepted.
    assert have_raised2(hstack,[ds('x'),ds('x')])
    assert have_raised2(hstack,[ds('y','x'),ds('x')])
    assert not have_raised2(hstack,[ds('x'),ds('y')])
        
    #        i=0
    #        for example in hstack([ds('x'),ds('y'),ds('z')]):
    #            example==ds[i]
    #            i+=1 
    #        del i,example
    #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????

def test_subset(array,ds):
    """Check ``ds.subset[...]`` against the reference ``array``.

    Parameters:
      array -- 2-d numpy array of reference values (columns 0..2 are 'x',
               column 3 is 'y', columns 0 and 2 are 'z').
      ds    -- dataset under test, exposing the fields 'x', 'y' and 'z'.

    Covers slice and index-list subsets, plus the field-name clashes that
    hstack must reject.  Raises AssertionError on the first mismatch.
    """
    def test_ds(orig,ds,index):
        """Check that dataset ``ds`` holds examples ``index`` of ``orig``."""
        # Check the dataset that was passed in.  This used to test the
        # enclosing function's ``ds2`` variable instead of the argument.
        assert isinstance(ds,DataSet)
        assert len(ds)==len(index)
        for i,(x,z,y) in enumerate(ds('x','z','y')):
            idx = index[i]
            assert (orig[idx]['x']==array[idx][:3]).all()
            assert (orig[idx]['x']==x).all()
            assert orig[idx]['y']==array[idx][3]
            assert orig[idx]['y']==y
            assert (orig[idx]['z']==array[idx][0:3:2]).all()
            assert (orig[idx]['z']==z).all()
        # Smoke test: further indexing and iteration of the subset only has
        # to run without raising.
        ds[0]
        if len(ds)>2:
            ds[:1]
            ds[1:1]
            ds[1:1:1]
        if len(ds)>5:
            ds[[1,2,3]]
        for x in ds:
            pass

#ds[:n] returns a dataset with the n first examples.
    ds2=ds.subset[:3]
    test_ds(ds,ds2,index=[0,1,2])

#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s.
    ds2=ds.subset[1:7:2]
    test_ds(ds,ds2,[1,3,5])

# #ds[i]
#     ds2=ds.subset[5]
#     assert isinstance(ds2,Example)
#     assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
#     assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds)
#     del ds2

#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in.
    ds2=ds.subset[[4,7,2,8]]
    test_ds(ds,ds2,[4,7,2,8])

#ds.<property># returns the value of a property associated with
  #the name <property>. The following properties should be supported:
  #    - 'description': a textual description or name for the ds
  #    - 'fieldtypes': a list of types (one per field)

#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
    #assert hstack([ds('x','y'),ds('z')])==ds
    #hstack([ds('z','y'),ds('x')])==ds
    # Stacking views that share a field name must raise; disjoint field
    # names must be accepted.
    assert have_raised2(hstack,[ds('x'),ds('x')])
    assert have_raised2(hstack,[ds('y','x'),ds('x')])
    assert not have_raised2(hstack,[ds('x'),ds('y')])
    
#        i=0
#        for example in hstack([ds('x'),ds('y'),ds('z')]):
#            example==ds[i]
#            i+=1 
#        del i,example
#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????

def test_fields_fct(ds):
    """Check the ``fields()`` views of a 3-field, 10-example dataset.

    Every fields view must contain the expected number of fields, and each
    field must yield one value per example when iterated.
    """
    #@todo, fill correctly
    def tally(fields):
        """Return (number of fields, total number of values over all fields)."""
        n_fields = 0
        n_values = 0
        for column in fields:
            n_fields += 1
            # iterate over the values associated to that field for all the ds examples
            n_values += sum(1 for _ in column)
        return n_fields, n_values

    assert len(ds.fields())==3
    assert tally(ds.fields()) == (3, 3*10)
    # fields of a two-field view of the dataset
    assert tally(ds('x','z').fields()) == (2, 2*10)
    # two fields selected directly through fields()
    assert tally(ds.fields('x','y')) == (2, 2*10)
    # a second full pass over all fields gives the same counts
    assert tally(ds.fields()) == (3, 3*10)

    # converting to fields and back to examples gives an equal dataset
    assert ds == ds.fields().examples()
    assert len(ds('x','y').fields()) == 2
    assert len(ds('x','z').fields()) == 2
    assert len(ds('y').fields()) == 1

def test_overrides(ds) :
    """ Test for examples that an override __getitem__ acts as the one in DataSet

    ``ds`` is wrapped in a subclass of its own type whose __getitem__ fetches
    each key twice, once through ``ds[key]`` and once through
    ``DataSet.__getitem__``, and asserts that both results are equal.
    Raises AssertionError on the first key for which they differ.
    """
    # str and unicode on Python 2; the same single type twice on Python 3.
    string_types = (str, type(u""))

    def ndarray_list_equal(nda,l) :
        """
        Compares if a ndarray is the same as the list. Do it by converting the list into
        a numpy matrix, if possible
        """
        try :
            l = numpy.asmatrix(l)
        except Exception :
            # not convertible, hence not equal
            return False
        return smart_equal(nda,l)

    def smart_equal(a1,a2) :
        """
        Handles numpy.ndarray, LookupList, strings, basic containers and scalars
        """
        if not isinstance(a1,type(a2)) and not isinstance(a2,type(a1)):
            #special case: matrix vs list of arrays
            if isinstance(a1,numpy.ndarray) :
                return ndarray_list_equal(a1,a2)
            elif isinstance(a2,numpy.ndarray) :
                return ndarray_list_equal(a2,a1)
            return False
        # compares 2 numpy.ndarray: same shape, then same content
        if isinstance(a1,numpy.ndarray):
            if a1.shape != a2.shape:
                return False
            return (a1==a2).all()
        # compares 2 lookuplists: same names, equal value under each name
        if isinstance(a1,LookupList) :
            if len(a1._names) != len(a2._names) :
                return False
            for k in a1._names :
                if k not in a2._names :
                    return False
                if not smart_equal(a1[k],a2[k]) :
                    return False
            return True
        # Strings are compared directly.  They have a __len__, and indexing
        # a one-character string gives the same string again, so the
        # container branch below would never terminate on them.
        if isinstance(a1,string_types):
            return a1 == a2
        # compares 2 basic containers, element by element
        if hasattr(a1,'__len__'):
            if len(a1) != len(a2) :
                return False
            for k in range(len(a1)) :
                if not smart_equal(a1[k],a2[k]):
                    return False
            return True
        # Basic equals.  Identity alone rejected equal scalars held in
        # distinct objects; it is kept as a shortcut so that anything
        # accepted before is still accepted.
        return a1 is a2 or a1 == a2

    def mask(ds) :
        """Wrap ``ds`` in a subclass of its type that cross-checks __getitem__."""
        class TestOverride(type(ds)):
            # The parent __init__ is not called: only the wrapped dataset is
            # stored and used.
            def __init__(self,ds) :
                self.ds = ds
            def __getitem__(self,key) :
                res1 = self.ds[key]
                res2 = DataSet.__getitem__(self.ds,key)
                assert smart_equal(res1,res2)
                return res1
        return TestOverride(ds)

    # test getitem, first with every integer index, then with a strided slice
    ds2 = mask(ds)
    for k in range(len(ds)):
        ds2[k]
    ds2[1:len(ds):3]
    
        

    


def test_all(array,ds):
    """Run the whole battery of dataset checks on ``ds``.

    ``array`` holds the reference values and ``ds`` must contain exactly 10
    examples.  The individual checks run in a fixed order and the first
    failing assertion aborts the battery.
    """
    n_examples = len(ds)
    assert n_examples==10
    test_iterate_over_examples(array, ds)
    test_overrides(ds)
    test_getitem(array, ds)
    test_subset(array, ds)
    # The three field views are only created once the checks above passed.
    xy_view = ds('x','y')
    yz_view = ds('y','z')
    xyz_view = ds('x','y','z')
    test_ds_iterator(array,xy_view,yz_view,xyz_view)
    test_fields_fct(ds)


class T_DataSet(unittest.TestCase):
    """Runs the shared ``test_all`` battery on several DataSet classes.

    Each implemented test wraps a random 10x4 array so that it exposes the
    fields 'x' (columns 0..2), 'y' (column 3) and 'z' (columns 0 and 2).
    Tests that are not written yet raise NotImplementedError.
    """

    def test_ArrayDataSet(self):
        # Not covered: streams, non-float values, missing values, tuple
        # field specs and dataset properties; 'y' is not always tested.
        reference = numpy.random.rand(10,4)
        # The dict-based spec is only constructed: the name is rebound right
        # away to the Example-based dataset, which is the one tested.
        ds = ArrayDataSet(reference,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested
        ds = ArrayDataSet(reference,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
        #assert ds==a? should this work?
        test_all(reference,ds)

    def test_CachedDataSet(self):
        reference = numpy.random.rand(10,4)
        source = ArrayDataSet(reference,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
        # Both caches are built before either one is tested.
        default_cache = CachedDataSet(source)
        full_cache = CachedDataSet(source,cache_all_upon_construction=True)
        test_all(reference,default_cache)
        test_all(reference,full_cache)

    def test_DataSetFields(self):
        raise NotImplementedError()

    def test_ApplyFunctionDataSet(self):
        original = numpy.random.rand(10,4)
        # Both functions add 1 to every field, so the expected values are
        # the original array plus 1.
        shifted = original+1
        source = ArrayDataSet(original,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested

        add_one = lambda x,y,z: (x+1,y+1,z+1)
        add_one_to_batch = lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1)
        per_example = ApplyFunctionDataSet(source,add_one,['x','y','z'],minibatch_mode=False)
        per_minibatch = ApplyFunctionDataSet(source,add_one_to_batch,
                                             ['x','y','z'],
                                             minibatch_mode=True)

        test_all(shifted,per_example)
        test_all(shifted,per_minibatch)

    def test_FieldsSubsetDataSet(self):
        reference = numpy.random.rand(10,4)
        # The extra field 'w' is dropped again by the subset.
        with_extra = ArrayDataSet(reference,Example(['x','y','z','w'],[slice(3),3,[0,2],0]))
        ds = FieldsSubsetDataSet(with_extra,['x','y','z'])
        test_all(reference,ds)

    def test_RenamedFieldsDataSet(self):
        reference = numpy.random.rand(10,4)
        suffixed = ArrayDataSet(reference,Example(['x1','y1','z1','w1'],[slice(3),3,[0,2],0]))
        ds = RenamedFieldsDataSet(suffixed,['x1','y1','z1'],['x','y','z'])
        test_all(reference,ds)

    # Placeholders for tests that still have to be written.
    def test_MinibatchDataSet(self):
        raise NotImplementedError()

    def test_HStackedDataSet(self):
        raise NotImplementedError()

    def test_VStackedDataSet(self):
        raise NotImplementedError()

    def test_ArrayFieldsDataSet(self):
        raise NotImplementedError()


class T_Exotic1(unittest.TestCase):
    """Tests dataset wrappers on a dataset whose 'input' field has no fixed size."""
    # NOTE: the nested class deliberately reuses the name DataSet; its base
    # is the DataSet brought in by the module-level imports.
    class DataSet(DataSet):
            """ Dummy dataset, where one field is an ndarray of variable size. """
            def __len__(self) :
                # fixed, arbitrary number of examples
                return 100
            def fieldNames(self) :
                return 'input','target','name'
            def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
                # Returns an iterator that synthesises its examples on the
                # fly from the running example index.
                class MultiLengthDataSetIterator(object):
                    def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
                        # n_batches is accepted but not stored or used.
                        if fieldnames is None: fieldnames = dataset.fieldNames()
                        # One Example is created here and reused for every
                        # minibatch; its initial values are placeholders that
                        # next() overwrites.
                        self.minibatch = Example(fieldnames,range(len(fieldnames)))
                        self.dataset, self.minibatch_size, self.current = dataset, minibatch_size, offset
                    def __iter__(self):
                            return self
                    def next(self):
                        # NOTE(review): never raises StopIteration and returns
                        # the same Example object on every call - presumably
                        # the caller bounds the iteration; confirm in DataSet.
                        for k in self.minibatch._names :
                            self.minibatch[k] = []
                        for ex in range(self.minibatch_size) :
                            # For example index c: 'input' is the array
                            # [0, ..., c] (length c+1), 'target' is c % 2 and
                            # 'name' is str(c).
                            if 'input' in self.minibatch._names:
                                self.minibatch['input'].append( numpy.array( range(self.current + 1) ) )
                            if 'target' in self.minibatch._names:
                                self.minibatch['target'].append( self.current % 2 )
                            if 'name' in self.minibatch._names:
                                self.minibatch['name'].append( str(self.current) )
                            self.current += 1
                        return self.minibatch
                return MultiLengthDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
    
    def test_ApplyFunctionDataSet(self):
        ds = T_Exotic1.DataSet()
        # The applied function keeps the last element of 'input', multiplies
        # 'target' by 10 and converts 'name' to an int.
        dsa = ApplyFunctionDataSet(ds,lambda x,y,z: (x[-1],y*10,int(z)),['input','target','name'],minibatch_mode=False) #broken!!!!!!
        for k in range(len(dsa)):
            res = dsa[k]
            # Only the first applied function (on 'input') is verified.
            self.failUnless(ds[k]('input')[0][-1] == res('input')[0] , 'problem in first applied function')
        # Strided slice: only has to run without raising.
        res = dsa[33:96:3]
          
    def test_CachedDataSet(self):
        ds = T_Exotic1.DataSet()
        dsc = CachedDataSet(ds)
        for k in range(len(dsc)) :
            # The cached 'input' must equal the one from the original dataset.
            self.failUnless(numpy.all( dsc[k]('input')[0] == ds[k]('input')[0] ) , (dsc[k],ds[k]) )
        # Full slice: only has to run without raising.
        res = dsc[:]

if __name__=='__main__':
    # Usage:
    #   no argument         -> run every test through unittest.main()
    #   --debug             -> run every test of this module in debug mode,
    #                          so exceptions propagate instead of being
    #                          collected
    #   --debug NAME [...]  -> run only the named T_DataSet tests, in debug mode
    debug=False
    tests = []
    if len(sys.argv)!=1:
        assert sys.argv[1]=="--debug"
        tests.extend(sys.argv[2:])
        if not tests:
            # Re-import this file under its module name to load all of its tests.
            module = __import__("_test_dataset")
            tests = unittest.TestLoader().loadTestsFromModule(module)
            tests.debug()
        else:
            unittest.TestSuite(map(T_DataSet, tests)).debug()
    else:
        unittest.main()