# HG changeset patch # User James Bergstra # Date 1212782178 14400 # Node ID 174374d5940555ad068fc9d4b13cf4dfa147b733 # Parent 4e6b550fe131bc3b5d7027da11b93b7d916d38b7# Parent 2d08f46d17d8b2661615e26ae5a838ac2d70edb2 merge diff -r 4e6b550fe131 -r 174374d59405 _nnet_ops.py --- a/_nnet_ops.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - -import unittest -import theano._test_tensor as TT -import numpy - -from nnet_ops import * - -class T_sigmoid(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test_elemwise(self): - TT.verify_grad(self, sigmoid, [numpy.random.rand(3,4)]) - -class T_softplus(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test_elemwise(self): - TT.verify_grad(self, softplus, [numpy.random.rand(3,4)]) - -class T_CrossentropySoftmax1Hot(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test0(self): - y_idx = [0,1,3] - class Dummy(object): - def make_node(self, a,b): - return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0:1] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - - def test1(self): - y_idx = [0,1,3] - class Dummy(object): - def make_node(self, a): - return crossentropy_softmax_1hot(a, y_idx)[0:1] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - - - -if __name__ == '__main__': - unittest.main() diff -r 4e6b550fe131 -r 174374d59405 _test_dataset.py --- a/_test_dataset.py Thu Jun 05 18:43:16 2008 -0400 +++ b/_test_dataset.py Fri Jun 06 15:56:18 2008 -0400 @@ -1,183 +1,442 @@ +#!/bin/env python from dataset import * from math import * -import unittest -import sys -import numpy as N +import numpy,unittest +from misc import * + +def have_raised(to_eval, **var): + have_thrown = False + try: + eval(to_eval) + except : + have_thrown = True + return have_thrown + +def have_raised2(f, *args, **kwargs): + have_thrown = False + try: + f(*args, **kwargs) + except : + have_thrown = True + return have_thrown + +def test1(): + print "test1" + global a,ds + a = numpy.random.rand(10,4) + print a + ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]}) + print "len(ds)=",len(ds) + assert(len(ds)==10) + print "example 0 = ",ds[0] +# assert + print "x=",ds["x"] + print "x|y" + for x,y in ds("x","y"): + print x,y + minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4) + minibatch = minibatch_iterator.__iter__().next() + print "minibatch=",minibatch + for var in minibatch: + print "var=",var + print "take a slice and look at field y",ds[1:6:2]["y"] + + del a,ds,x,y,minibatch_iterator,minibatch,var -def _sum_all(a): - s=a - while isinstance(s,numpy.ndarray): - s=sum(s) - return s - -class T_arraydataset(unittest.TestCase): - def setUp(self): - numpy.random.seed(123456) +def test_iterate_over_examples(array,ds): +#not in doc!!! + i=0 + for example in range(len(ds)): + assert (ds[example]['x']==array[example][:3]).all() + assert ds[example]['y']==array[example][3] + assert (ds[example]['z']==array[example][[0,2]]).all() + i+=1 + assert i==len(ds) + del example,i + +# - for example in dataset: + i=0 + for example in ds: + assert len(example)==3 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (example['z']==array[i][0:3:2]).all() + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i +# - for val1,val2,... 
in dataset: + i=0 + for x,y,z in ds: + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (z==array[i][0:3:2]).all() + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,z,i + +# - for example in dataset(field1, field2,field3, ...): + i=0 + for example in ds('x','y','z'): + assert len(example)==3 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (example['z']==array[i][0:3:2]).all() + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i + i=0 + for example in ds('y','x'): + assert len(example)==2 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i - def test_ctor_len(self): - n = numpy.random.rand(8,3) - a=ArrayDataSet(n) - self.failUnless(a.data is n) - self.failUnless(a.fields is None) +# - for val1,val2,val3 in dataset(field1, field2,field3): + i=0 + for x,y,z in ds('x','y','z'): + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (z==array[i][0:3:2]).all() + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,z,i + i=0 + for y,x in ds('y','x',): + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,i - self.failUnless(len(a) == n.shape[0]) - self.failUnless(a[0].shape == (n.shape[1],)) + def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished): + ##full minibatch or the last minibatch + for idx in range(nb_field): + test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished) + del idx + def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished): + assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)2: + ds[:1] + ds[1:1] + ds[1:1:1] + if len(ds)>5: + ds[[1,2,3]] + for x in ds: + pass - a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) +#ds[:n] returns a dataset with the n first examples. + ds2=ds[:3] + assert isinstance(ds2,DataSet) + test_ds(ds,ds2,index=[0,1,2]) + del ds2 + +#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s. + ds2=ds[1:7:2] + assert isinstance(ds2,DataSet) + test_ds(ds,ds2,[1,3,5]) + del ds2 + +#ds[i] + ds2=ds[5] + assert isinstance(ds2,Example) + assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined + assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) + del ds2 + +#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. + ds2=ds[[4,7,2,8]] + assert isinstance(ds2,DataSet) + test_ds(ds,ds2,[4,7,2,8]) + del ds2 - #print arr - for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)): - #print 'x' , x - self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2])) + #ds.# returns the value of a property associated with + #the name . The following properties should be supported: + # - 'description': a textual description or name for the ds + # - 'fieldtypes': a list of types (one per field) - def test_minibatch_wraparound_odd(self): - arr = numpy.random.rand(10,4) - arr2 = ArrayDataSet.Iterator.matcat(arr,arr) + #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? 
+ #assert hstack([ds('x','y'),ds('z')])==ds + #hstack([ds('z','y'),ds('x')])==ds + assert have_raised2(hstack,[ds('x'),ds('x')]) + assert have_raised2(hstack,[ds('y','x'),ds('x')]) + assert not have_raised2(hstack,[ds('x'),ds('y')]) + + # i=0 + # for example in hstack([ds('x'),ds('y'),ds('z')]): + # example==ds[i] + # i+=1 + # del i,example + #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? - a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) - - for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)): - self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2])) +def test_fields_fct(ds): + #@todo, fill correctly + assert len(ds.fields())==3 + i=0 + v=0 + for field in ds.fields(): + for field_value in field: # iterate over the values associated to that field for all the ds examples + v+=1 + i+=1 + assert i==3 + assert v==3*10 + del i,v + + i=0 + v=0 + for field in ds('x','z').fields(): + i+=1 + for val in field: + v+=1 + assert i==2 + assert v==2*10 + del i,v + + i=0 + v=0 + for field in ds.fields('x','y'): + i+=1 + for val in field: + v+=1 + assert i==2 + assert v==2*10 + del i,v + i=0 + v=0 + for field_examples in ds.fields(): + for example_value in field_examples: + v+=1 + i+=1 + assert i==3 + assert v==3*10 + del i,v + + assert ds == ds.fields().examples() + assert len(ds('x','y').fields()) == 2 + assert len(ds('x','z').fields()) == 2 + assert len(ds('y').fields()) == 1 -class T_renamingdataset(unittest.TestCase): - def setUp(self): - numpy.random.seed(123456) + del field +def test_all(array,ds): + assert len(ds)==10 + + test_iterate_over_examples(array, ds) + test_getitem(array, ds) + test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z')) + test_fields_fct(ds) + +class T_DataSet(unittest.TestCase): + def test_ArrayDataSet(self): + #don't test stream + #tested only with float value + #don't always test with y + #don't test missing value + #don't test with tuple + #don't test proterties + a2 = numpy.random.rand(10,4) + ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested + ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + #assert ds==a? should this work? 
+ + test_all(a2,ds) + + del a2, ds + + def test_CachedDataSet(self): + a = numpy.random.rand(10,4) + ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + ds2 = CachedDataSet(ds1) + ds3 = CachedDataSet(ds1,cache_all_upon_construction=True) + + test_all(a,ds2) + test_all(a,ds3) + + del a,ds1,ds2,ds3 - def test_hasfield(self): - n = numpy.random.rand(3,8) - a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)}) - b=a.rename({'xx':'x','zz':'z'}) - self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y')) + def test_DataSetFields(self): + raise NotImplementedError() -class T_applyfunctiondataset(unittest.TestCase): - def setUp(self): - numpy.random.seed(123456) + def test_ApplyFunctionDataSet(self): + a = numpy.random.rand(10,4) + a2 = a+1 + ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + + ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False) + ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1), + ['x','y','z'], + minibatch_mode=True) - def test_function(self): - n = numpy.random.rand(3,8) - a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)}) - b=a.apply_function(lambda x,y: x+y,x+1, ['x','y'], ['x+y','x+1'], False,False,False) - print b.fieldNames() - print b('x+y') - + test_all(a2,ds2) + test_all(a2,ds3) + del a,ds1,ds2,ds3 - -# to be used with a any new dataset -class T_dataset_tester(object): - """ - This class' goal is to test any new dataset that is created - Tests are (will be!) designed to check the normal behaviours - of a dataset, as defined in dataset.py - """ + def test_FieldsSubsetDataSet(self): + raise NotImplementedError() + def test_MinibatchDataSet(self): + raise NotImplementedError() + def test_HStackedDataSet(self): + raise NotImplementedError() + def test_VStackedDataSet(self): + raise NotImplementedError() + def test_ArrayFieldsDataSet(self): + raise NotImplementedError() - def __init__(self,ds,runall=True) : - """if interested in only a subset of test, init with runall=False""" - self.ds = ds - - if runall : - self.test1_basicstats(ds) - self.test2_slicing(ds) - self.test3_fields_iterator_consistency(ds) - - def test1_basicstats(self,ds) : - """print basics stats on a dataset, like length""" - - print 'len(ds) = ',len(ds) - print 'num fields = ', len(ds.fieldNames()) - print 'types of field: ', - for k in ds.fieldNames() : - print type(ds[0](k)[0]), - print '' +if __name__=='__main__': + unittest.main() - def test2_slicing(self,ds) : - """test if slicing works properly""" - print 'testing slicing...', - sys.stdout.flush() - - middle = len(ds) / 2 - tenpercent = int(len(ds) * .1) - set1 = ds[:middle+tenpercent] - set2 = ds[middle-tenpercent:] - for k in range(tenpercent + tenpercent -1): - for k2 in ds.fieldNames() : - if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray : - for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) : - assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3] - else : - assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0] - assert tenpercent > 1 - set3 = ds[middle-tenpercent:middle+tenpercent:2] - for k2 in ds.fieldNames() : - if type(set2[2](k2)[0]) == N.ndarray : - for k3 in range(len(set2[2](k2)[0])) : - assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3] - else : - assert set2[2](k2)[0] == set3[1](k2)[0] - - print 'done' - - - def test3_fields_iterator_consistency(self,ds) : - """ check if the 
number of iterator corresponds to the number of fields""" - print 'testing fields/iterator consistency...', - sys.stdout.flush() - - # basic test - maxsize = min(len(ds)-1,100) - for iter in ds[:maxsize] : - assert len(iter) == len(ds.fieldNames()) - if len(ds.fieldNames()) == 1 : - print 'done' - return - - # with minibatches iterator - ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2) - for iter in ds2 : - assert len(iter) == 2 - - print 'done' - - - - - -################################################################### -# main -if __name__ == '__main__': - unittest.main() - diff -r 4e6b550fe131 -r 174374d59405 _test_filetensor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_test_filetensor.py Fri Jun 06 15:56:18 2008 -0400 @@ -0,0 +1,116 @@ +from filetensor import * +import filetensor + +import unittest +import os + +class T(unittest.TestCase): + fname = '/tmp/some_mat' + + def setUp(self): + #TODO: test that /tmp/some_mat does not exist + try: + os.stat(self.fname) + except OSError: + return #assume file was not found + raise Exception('autotest file "%s" exists!' % self.fname) + + def tearDown(self): + os.remove(self.fname) + + def test_file(self): + gen = numpy.random.rand(1) + f = file(self.fname, 'w'); + write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = read(f, None, debug=False) #load from filename + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_filename(self): + gen = numpy.random.rand(1) + write(self.fname, gen) + mat = read(self.fname, None, debug=False) #load from filename + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def testNd(self): + """shape and values are stored correctly for tensors of rank 0 to 5""" + whole_shape = [5, 6, 7, 8, 9] + for i in xrange(5): + gen = numpy.asarray(numpy.random.rand(*whole_shape[:i])) + f = file(self.fname, 'w'); + write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = read(f, None, debug=False) #load from filename + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_dtypes(self): + """shape and values are stored correctly for all dtypes """ + for dtype in filetensor._dtype_magic: + gen = numpy.asarray( + numpy.random.rand(4, 5, 2, 1) * 100, + dtype=dtype) + f = file(self.fname, 'w'); + write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = read(f, None, debug=False) #load from filename + self.failUnless(gen.dtype == mat.dtype) + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_dtype_invalid(self): + gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype + f = file(self.fname, 'w') + passed = False + try: + write(f, gen) + except TypeError, e: + if e[0].startswith('Invalid ndarray dtype'): + passed = True + f.close() + self.failUnless(passed) + + +if __name__ == '__main__': + unittest.main() + + #a small test script, starts by reading sys.argv[1] + #print 'rval', rval.shape, rval.size + + if 0: + write(f, rval) + print '' + f.close() + f = file('/tmp/some_mat', 'r'); + rval2 = read(f) #load from file handle + print 'rval2', rval2.shape, rval2.size + + assert rval.dtype == rval2.dtype + assert rval.shape == rval2.shape + assert numpy.all(rval == rval2) + print 'ok' + + def _unused(): + f.seek(0,2) #seek to end + f_len = f.tell() + f.seek(f_data_start,0) #seek back to where we were + + if debug: print 'length:', f_len + + + f_data_bytes = (f_len - f_data_start) + + if debug: print 'data 
bytes according to header: ', dim_size * elsize + if debug: print 'data bytes according to file : ', f_data_bytes + + if debug: print 'reading data...' + sys.stdout.flush() + + def read_ndarray(f, dim, dtype): + return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim) + diff -r 4e6b550fe131 -r 174374d59405 _test_lookup_list.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_test_lookup_list.py Fri Jun 06 15:56:18 2008 -0400 @@ -0,0 +1,24 @@ +from lookup_list import * +import unittest + +class T_LookUpList(unittest.TestCase): + def test_LookupList(self): + #test only the example in the doc??? + example = LookupList(['x','y','z'],[1,2,3]) + example['x'] = [1, 2, 3] # set or change a field + x, y, z = example + x = example[0] + x = example["x"] + assert example.keys()==['x','y','z'] + assert example.values()==[[1,2,3],2,3] + assert example.items()==[('x',[1,2,3]),('y',2),('z',3)] + example.append_keyval('u',0) # adds item with name 'u' and value 0 + assert len(example)==4 # number of items = 4 here + example2 = LookupList(['v','w'], ['a','b']) + example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b']) + assert example+example2==example3 + self.assertRaises(AssertionError,example.__add__,example) + del example, example2, example3, x, y ,z + +if __name__=='__main__': + unittest.main() diff -r 4e6b550fe131 -r 174374d59405 _test_nnet_ops.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_test_nnet_ops.py Fri Jun 06 15:56:18 2008 -0400 @@ -0,0 +1,41 @@ + +import unittest +import theano._test_tensor as TT +import numpy + +from nnet_ops import * + +class T_sigmoid(unittest.TestCase): + def setUp(self): + numpy.random.seed(9999) + def test_elemwise(self): + TT.verify_grad(self, sigmoid, [numpy.random.rand(3,4)]) + +class T_softplus(unittest.TestCase): + def setUp(self): + numpy.random.seed(9999) + def test_elemwise(self): + TT.verify_grad(self, softplus, [numpy.random.rand(3,4)]) + +class T_CrossentropySoftmax1Hot(unittest.TestCase): + def setUp(self): + numpy.random.seed(9999) + def test0(self): + y_idx = [0,1,3] + class Dummy(object): + def make_node(self, a,b): + return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0:1] + TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), + numpy.random.rand(4)]) + + def test1(self): + y_idx = [0,1,3] + class Dummy(object): + def make_node(self, a): + return crossentropy_softmax_1hot(a, y_idx)[0:1] + TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) + + + +if __name__ == '__main__': + unittest.main() diff -r 4e6b550fe131 -r 174374d59405 autotest.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/autotest.py Fri Jun 06 15:56:18 2008 -0400 @@ -0,0 +1,54 @@ +import unittest, os, sys, traceback + +def test_root_dir(debugmode=False): + suite = None + filenames = os.listdir('.') + for filename in filenames: + if filename[-3:] == '.py' and filename.startswith('_test'): + #print >>sys.stderr, 'Loading', modname + modname = filename[0:-3] + + try: + module = __import__(modname) + except Exception, e: + print >>sys.stderr, "====================================================" + print >>sys.stderr, "Failed to load %s.py" % modname + print >>sys.stderr, "====================================================" + traceback.print_exc() + print >>sys.stderr, "====================================================" + continue + + tests = unittest.TestLoader().loadTestsFromModule(module) + if tests.countTestCases() > 0: + print >>sys.stderr, 'Testing', modname + if suite is None: + suite = tests + else: + suite.addTests(tests) + 
if suite is None: + print >>sys.stderr, "No suite found" + sys.exit(1) + if debugmode: + suite.debug() + else: + unittest.TextTestRunner(verbosity=1).run(suite) + +if __name__ == '__main__': + + def printUsage(): + print >>sys.stderr, "Bad argument: ",sys.argv + print >>sys.stderr, "only --debug is supported" + sys.exit(1) + debugparam="" + + if len(sys.argv)==2: + if sys.argv[1]=="--debug": + debugparam="--debug" + sys.argv.remove(debugparam) + else: + printUsage() + elif len(sys.argv)>2: + printUsage() + + test_root_dir(debugparam!="") + diff -r 4e6b550fe131 -r 174374d59405 dataset.py --- a/dataset.py Thu Jun 05 18:43:16 2008 -0400 +++ b/dataset.py Fri Jun 06 15:56:18 2008 -0400 @@ -161,17 +161,55 @@ numpy_vstack = lambda fieldname,values: numpy.vstack(values) numpy_hstack = lambda fieldnames,values: numpy.hstack(values) - def __init__(self,description=None,fieldtypes=None): - if description is None: - # by default return "(,,...)" - description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" - self.description=description - self.fieldtypes=fieldtypes + def __init__(self, description=None, fieldnames=None, fieldtypes=None): + """ + @type fieldnames: list of strings + @type fieldtypes: list of python types, same length as fieldnames + @type description: string + @param description: description/name for this dataset + """ + def default_desc(): + return type(self).__name__ \ + + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" + + #self.fieldnames = fieldnames + + self.fieldtypes = fieldtypes if fieldtypes is not None \ + else [None]*1 #len(fieldnames) + + self.description = default_desc() if description is None \ + else description self._attribute_names = ["description"] - if fieldtypes: - self._attribute_names.append("fieldtypes") + + attributeNames = property(lambda self: copy.copy(self._attribute_names)) + + def __contains__(self, fieldname): + return (fieldname in self.fieldNames()) \ + or (fieldname in self.attributeNames()) + + def __iter__(self): + """Supports the syntax "for i in dataset: ..." - def attributeNames(self): return self._attribute_names + Using this syntax, "i" will be an Example instance (or equivalent) with + all the fields of DataSet self. Every field of "i" will give access to + a field of a single example. Fields should be accessible via + i["fielname"] or i[3] (in the order defined by the elements of the + Example returned by this iterator), but the derived class is free + to accept any type of identifier, and add extra functionality to the iterator. + + The default implementation calls the minibatches iterator and extracts the first example of each field. + """ + return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) + + def __len__(self): + """ + len(dataset) returns the number of examples in the dataset. + By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). + Sub-classes which implement finite-length datasets should redefine this method. + Some methods only make sense for finite-length datasets. + """ + return None + class MinibatchToSingleExampleIterator(object): """ @@ -198,24 +236,6 @@ def next_index(self): return self.minibatch_iterator.next_index() - def __iter__(self): - """Supports the syntax "for i in dataset: ..." - - Using this syntax, "i" will be an Example instance (or equivalent) with - all the fields of DataSet self. Every field of "i" will give access to - a field of a single example. 
Fields should be accessible via - i["fielname"] or i[3] (in the order defined by the elements of the - Example returned by this iterator), but the derived class is free - to accept any type of identifier, and add extra functionality to the iterator. - - The default implementation calls the minibatches iterator and extracts the first example of each field. - """ - return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) - - def __contains__(self, fieldname): - return (fieldname in self.fieldNames()) \ - or (fieldname in self.attributeNames()) - class MinibatchWrapAroundIterator(object): """ An iterator for minibatches that handles the case where we need to wrap around the @@ -358,15 +378,6 @@ """ raise AbstractFunction() - def __len__(self): - """ - len(dataset) returns the number of examples in the dataset. - By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). - Sub-classes which implement finite-length datasets should redefine this method. - Some methods only make sense for finite-length datasets. - """ - return maxint - def is_unbounded(self): """ Tests whether a dataset is unbounded (e.g. a stream). diff -r 4e6b550fe131 -r 174374d59405 learner.py --- a/learner.py Thu Jun 05 18:43:16 2008 -0400 +++ b/learner.py Fri Jun 06 15:56:18 2008 -0400 @@ -110,7 +110,7 @@ """ raise AbstractFunction() -class LearnerModel(LearnedModel): +class LearnerModel(TrainedModel): """ LearnerModel is a base class for models returned by instances of a LearningAlgorithm subclass. It is only given here to define the expected semantics. diff -r 4e6b550fe131 -r 174374d59405 mlp.py --- a/mlp.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,240 +0,0 @@ -""" -A straightforward classicial feedforward -one-hidden-layer neural net, with L2 regularization. -This is one of the simplest example of L{Learner}, and illustrates -the use of theano. 
-""" - -from learner import * -from theano import tensor as t -from nnet_ops import * -import math -from misc import * - -def function(inputs, outputs, linker='c&py'): - return theano.function(inputs, outputs, unpack_single=False,linker=linker) - -def randshape(*shape): return (numpy.random.rand(*shape) -0.5) * 0.001 - -class ManualNNet(object): - def __init__(self, ninputs, nhid, nclass, lr, nepochs, - linker='c&yp', - hidden_layer=None): - class Vars: - def __init__(self, lr, l2coef=0.0): - lr = t.constant(lr) - l2coef = t.constant(l2coef) - input = t.matrix('input') # n_examples x n_inputs - target = t.ivector('target') # n_examples x 1 - W2 = t.matrix('W2') - b2 = t.vector('b2') - - if hidden_layer: - hid, hid_params, hid_ivals, hid_regularization = hidden_layer(input) - else: - W1 = t.matrix('W1') - b1 = t.vector('b1') - hid = t.tanh(b1 + t.dot(input, W1)) - hid_params = [W1, b1] - hid_regularization = l2coef * t.sum(W1*W1) - hid_ivals = [randshape(ninputs, nhid), randshape(nhid)] - - params = [W2, b2] + hid_params - ivals = [randshape(nhid, nclass), randshape(nclass)]\ - + hid_ivals - nll, predictions = crossentropy_softmax_1hot( b2 + t.dot(hid, W2), target) - regularization = l2coef * t.sum(W2*W2) + hid_regularization - output_class = t.argmax(predictions,1) - loss_01 = t.neq(output_class, target) - g_params = t.grad(nll + regularization, params) - new_params = [t.sub_inplace(p, lr * gp) for p,gp in zip(params, g_params)] - self.__dict__.update(locals()); del self.self - self.nhid = nhid - self.nclass = nclass - self.nepochs = nepochs - self.v = Vars(lr) - self.params = None - - def update(self, trainset): - params = self.v.ivals - update_fn = function( - [self.v.input, self.v.target] + self.v.params, - [self.v.nll] + self.v.new_params) - for i in xrange(self.nepochs): - for input, target in trainset.minibatches(['input', 'target'], - minibatch_size=min(32, len(trainset))): - dummy = update_fn(input, target[:,0], *params) - if 0: print dummy[0] #the nll - return self.use - __call__ = update - - def use(self, dset, - output_fieldnames=['output_class'], - test_stats_collector=None, - copy_inputs=False, - put_stats_in_output_dataset=True, - output_attributes=[]): - inputs = [self.v.input, self.v.target] + self.v.params - fn = function(inputs, [getattr(self.v, name) for name in output_fieldnames]) - target = dset.fields()['target'] if ('target' in dset.fields()) else numpy.zeros((1,1),dtype='int64') - return ApplyFunctionDataSet(dset, - lambda input, target: fn(input, target[:,0], *self.v.ivals), - output_fieldnames) - - -class OneHiddenLayerNNetClassifier(OnlineGradientTLearner): - """ - Implement a straightforward classicial feedforward - one-hidden-layer neural net, with L2 regularization. - - The predictor parameters are obtained by minibatch/online gradient descent. - Training can proceed sequentially (with multiple calls to update with - different disjoint subsets of the training sets). 
- - Hyper-parameters: - - L2_regularizer - - learning_rate - - n_hidden - - For each (input_t,output_t) pair in a minibatch,:: - - output_activations_t = b2+W2*tanh(b1+W1*input_t) - output_t = softmax(output_activations_t) - output_class_t = argmax(output_activations_t) - class_error_t = 1_{output_class_t != target_t} - nll_t = -log(output_t[target_t]) - - and the training criterion is:: - - loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t - - The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by - stochastic minibatch gradient descent:: - - parameters[i] -= learning_rate * dloss/dparameters[i] - - The fields and attributes expected and produced by use and update are the following: - - - Input and output fields (example-wise quantities): - - - 'input' (always expected by use and update) - - 'target' (optionally expected by use and always by update) - - 'output' (optionally produced by use) - - 'output_class' (optionally produced by use) - - 'class_error' (optionally produced by use) - - 'nll' (optionally produced by use) - - - optional attributes (optionally expected as input_dataset attributes) - (warning, this may be dangerous, the 'use' method will use those provided in the - input_dataset rather than those learned during 'update'; currently no support - for providing these to update): - - - 'L2_regularizer' - - 'b1' - - 'W1' - - 'b2' - - 'W2' - - 'parameters' = [b1, W1, b2, W2] - - 'regularization_term' - - """ - def __init__(self,n_hidden,n_classes,learning_rate,max_n_epochs,L2_regularizer=0,init_range=1.,n_inputs=None,minibatch_size=None,linker='c|py'): - self._n_inputs = n_inputs - self._n_outputs = n_classes - self._n_hidden = n_hidden - self._init_range = init_range - self._max_n_epochs = max_n_epochs - self._minibatch_size = minibatch_size - self.learning_rate = learning_rate # this is the float - self.L2_regularizer = L2_regularizer - self._learning_rate = t.scalar('learning_rate') # this is the symbol - self._input = t.matrix('input') # n_examples x n_inputs - self._target = t.lmatrix('target') # n_examples x 1 - self._target_vector = self._target[:,0] - self._L2_regularizer = t.scalar('L2_regularizer') - self._W1 = t.matrix('W1') - self._W2 = t.matrix('W2') - self._b1 = t.row('b1') - self._b2 = t.row('b2') - self._regularization_term = self._L2_regularizer * (t.sum(self._W1*self._W1) + t.sum(self._W2*self._W2)) - self._output_activations =self._b2+t.dot(t.tanh(self._b1+t.dot(self._input,self._W1.T)),self._W2.T) - self._nll,self._output = crossentropy_softmax_1hot(self._output_activations,self._target_vector) - self._output_class = t.argmax(self._output,1) - self._class_error = t.neq(self._output_class,self._target_vector) - self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0] - OnlineGradientTLearner.__init__(self, linker = linker) - - def attributeNames(self): - return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"] - - def parameterAttributes(self): - return ["b1","W1", "b2", "W2"] - - def updateMinibatchInputFields(self): - return ["input","target"] - - def updateMinibatchInputAttributes(self): - return OnlineGradientTLearner.updateMinibatchInputAttributes(self)+["L2_regularizer"] - - def updateEndOutputAttributes(self): - return ["regularization_term"] - - def lossAttribute(self): - return "minibatch_criterion" - - def defaultOutputFields(self, input_fields): - output_fields = ["output", "output_class",] - if "target" in input_fields: - output_fields += ["class_error", "nll"] - 
return output_fields - - def updateMinibatch(self,minibatch): - MinibatchUpdatesTLearner.updateMinibatch(self,minibatch) - #print self.nll - - def allocate(self,minibatch): - minibatch_n_inputs = minibatch["input"].shape[1] - if not self._n_inputs: - self._n_inputs = minibatch_n_inputs - self.b1 = numpy.zeros((1,self._n_hidden)) - self.b2 = numpy.zeros((1,self._n_outputs)) - self.forget() - elif self._n_inputs!=minibatch_n_inputs: - # if the input changes dimension on the fly, we resize and forget everything - self.forget() - - def forget(self): - if self._n_inputs: - r = self._init_range/math.sqrt(self._n_inputs) - self.W1 = numpy.random.uniform(low=-r,high=r, - size=(self._n_hidden,self._n_inputs)) - r = self._init_range/math.sqrt(self._n_hidden) - self.W2 = numpy.random.uniform(low=-r,high=r, - size=(self._n_outputs,self._n_hidden)) - self.b1[:]=0 - self.b2[:]=0 - self._n_epochs=0 - - def isLastEpoch(self): - self._n_epochs +=1 - return self._n_epochs>=self._max_n_epochs - - def debug_updateMinibatch(self,minibatch): - # make sure all required fields are allocated and initialized - self.allocate(minibatch) - input_attributes = self.names2attributes(self.updateMinibatchInputAttributes()) - input_fields = minibatch(*self.updateMinibatchInputFields()) - print 'input attributes', input_attributes - print 'input fields', input_fields - results = self.update_minibatch_function(*(input_attributes+input_fields)) - print 'output attributes', self.updateMinibatchOutputAttributes() - print 'results', results - self.setAttributes(self.updateMinibatchOutputAttributes(), - results) - - if 0: - print 'n0', self.names2OpResults(self.updateMinibatchOutputAttributes()+ self.updateMinibatchInputFields()) - print 'n1', self.names2OpResults(self.updateMinibatchOutputAttributes()) - print 'n2', self.names2OpResults(self.updateEndInputAttributes()) - print 'n3', self.names2OpResults(self.updateEndOutputAttributes()) - diff -r 4e6b550fe131 -r 174374d59405 test_dataset.py --- a/test_dataset.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,561 +0,0 @@ -#!/bin/env python -from dataset import * -from math import * -import numpy -from misc import * - -def have_raised(to_eval, **var): - have_thrown = False - try: - eval(to_eval) - except : - have_thrown = True - return have_thrown - -def have_raised2(f, *args, **kwargs): - have_thrown = False - try: - f(*args, **kwargs) - except : - have_thrown = True - return have_thrown - -def test1(): - print "test1" - global a,ds - a = numpy.random.rand(10,4) - print a - ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]}) - print "len(ds)=",len(ds) - assert(len(ds)==10) - print "example 0 = ",ds[0] -# assert - print "x=",ds["x"] - print "x|y" - for x,y in ds("x","y"): - print x,y - minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4) - minibatch = minibatch_iterator.__iter__().next() - print "minibatch=",minibatch - for var in minibatch: - print "var=",var - print "take a slice and look at field y",ds[1:6:2]["y"] - - del a,ds,x,y,minibatch_iterator,minibatch,var - -def test_iterate_over_examples(array,ds): -#not in doc!!! 
- i=0 - for example in range(len(ds)): - assert (ds[example]['x']==array[example][:3]).all() - assert ds[example]['y']==array[example][3] - assert (ds[example]['z']==array[example][[0,2]]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for example in dataset: - i=0 - for example in ds: - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,... in dataset: - i=0 - for x,y,z in ds: - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - -# - for example in dataset(field1, field2,field3, ...): - i=0 - for example in ds('x','y','z'): - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - i=0 - for example in ds('y','x'): - assert len(example)==2 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,val3 in dataset(field1, field2,field3): - i=0 - for x,y,z in ds('x','y','z'): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - i=0 - for y,x in ds('y','x',): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,i - - def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished): - ##full minibatch or the last minibatch - for idx in range(nb_field): - test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished) - del idx - def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished): - assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)2: - ds[:1] - ds[1:1] - ds[1:1:1] - if len(ds)>5: - ds[[1,2,3]] - for x in ds: - pass - -#ds[:n] returns a dataset with the n first examples. - ds2=ds[:3] - assert isinstance(ds2,DataSet) - test_ds(ds,ds2,index=[0,1,2]) - del ds2 - -#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s. - ds2=ds[1:7:2] - assert isinstance(ds2,DataSet) - test_ds(ds,ds2,[1,3,5]) - del ds2 - -#ds[i] - ds2=ds[5] - assert isinstance(ds2,Example) - if 0: - # see ticket #27 - assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined - assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) - del ds2 - -#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. - ds2=ds[[4,7,2,8]] - assert isinstance(ds2,DataSet) - test_ds(ds,ds2,[4,7,2,8]) - del ds2 - -#ds[fieldname]# an iterable over the values of the field fieldname across - #the ds (the iterable is obtained by default by calling valuesVStack - #over the values for individual examples). - if 0: - assert have_raised("ds['h']") # h is not defined... 
- assert have_raised("ds[['x']]") # bad syntax - assert not have_raised("var['ds']['x']",ds=ds) - isinstance(ds['x'],DataSetFields) - ds2=ds['x'] - assert len(ds['x'])==10 - assert len(ds['y'])==10 - assert len(ds['z'])==10 - i=0 - for example in ds['x']: - assert (example==array[i][:3]).all() - i+=1 - assert i==len(ds) - i=0 - for example in ds['y']: - assert (example==array[i][3]).all() - i+=1 - assert i==len(ds) - i=0 - for example in ds['z']: - assert (example==array[i,0:3:2]).all() - i+=1 - assert i==len(ds) - del ds2,i - else: - print 'warning: ds[fieldname] is deprecated... Fred could you fix this test?' - - #ds.# returns the value of a property associated with - #the name . The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) - - #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds - assert have_raised2(hstack,[ds('x'),ds('x')]) - assert have_raised2(hstack,[ds('y','x'),ds('x')]) - assert not have_raised2(hstack,[ds('x'),ds('y')]) - - # i=0 - # for example in hstack([ds('x'),ds('y'),ds('z')]): - # example==ds[i] - # i+=1 - # del i,example - #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? - -def test_fields_fct(ds): - #@todo, fill correctly - assert len(ds.fields())==3 - i=0 - v=0 - for field in ds.fields(): - for field_value in field: # iterate over the values associated to that field for all the ds examples - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - i=0 - v=0 - for field in ds('x','z').fields(): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field in ds.fields('x','y'): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field_examples in ds.fields(): - for example_value in field_examples: - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - assert ds == ds.fields().examples() - assert len(ds('x','y').fields()) == 2 - assert len(ds('x','z').fields()) == 2 - assert len(ds('y').fields()) == 1 - - del field -def test_all(array,ds): - assert len(ds)==10 - - test_iterate_over_examples(array, ds) - test_getitem(array, ds) - test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z')) - test_fields_fct(ds) - -def test_ArrayDataSet(): - #don't test stream - #tested only with float value - #don't always test with y - #don't test missing value - #don't test with tuple - #don't test proterties - print "test_ArrayDataSet" - a2 = numpy.random.rand(10,4) - ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested - ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - #assert ds==a? should this work? 
- - test_all(a2,ds) - - del a2, ds - -def test_CachedDataSet(): - print "test_CacheDataSet" - a = numpy.random.rand(10,4) - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - ds2 = CachedDataSet(ds1) - ds3 = CachedDataSet(ds1,cache_all_upon_construction=True) - - test_all(a,ds2) - test_all(a,ds3) - - del a,ds1,ds2,ds3 - - -def test_DataSetFields(): - print "test_DataSetFields" - raise NotImplementedError() - -def test_ApplyFunctionDataSet(): - print "test_ApplyFunctionDataSet" - a = numpy.random.rand(10,4) - a2 = a+1 - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - - ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False) - - print ds1.fields('x', 'y', 'z') - print ' ' - print ds2.fields('x', 'y', 'z') - print '----------- ' - - - ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1), - ['x','y','z'], - minibatch_mode=True) - - test_all(a2,ds2) - test_all(a2,ds3) - - del a,ds1,ds2,ds3 - -def test_FieldsSubsetDataSet(): - print "test_FieldsSubsetDataSet" - raise NotImplementedError() -def test_MinibatchDataSet(): - print "test_MinibatchDataSet" - raise NotImplementedError() -def test_HStackedDataSet(): - print "test_HStackedDataSet" - raise NotImplementedError() -def test_VStackedDataSet(): - print "test_VStackedDataSet" - raise NotImplementedError() -def test_ArrayFieldsDataSet(): - print "test_ArrayFieldsDataSet" - raise NotImplementedError() - - -def test_speed(array, ds): - print "test_speed", ds.__class__ - - mat = numpy.random.rand(400,100) - - @print_timing - def f_array_full(a): - a+1 - @print_timing - def f_array_index(a): - for id in range(a.shape[0]): -# pass - a[id]+1 -# a[id]*mat - @print_timing - def f_array_iter(a): - for r in a: -# pass - r+1 -# r*mat - @print_timing - def f_ds_index(ds): - for id in range(len(ds)): -# pass - ds[id][0]+1 -# ds[id][0]*mat - @print_timing - def f_ds_iter(ds): - for ex in ds: -# pass - ex[0]+1 -# a[0]*mat - @print_timing - def f_ds_mb1(ds,mb_size): - for exs in ds.minibatches(minibatch_size = mb_size): - for ex in exs: -# pass - ex[0]+1 -# ex[0]*mat - @print_timing - def f_ds_mb2(ds,mb_size): - for exs in ds.minibatches(minibatch_size = mb_size): -# pass - exs[0]+1 -# ex[0]*mat - - f_array_full(array) - f_array_index(array) - f_array_iter(array) - - f_ds_index(ds) - f_ds_iter(ds) - - f_ds_mb1(ds,10) - f_ds_mb1(ds,100) - f_ds_mb1(ds,1000) - f_ds_mb1(ds,10000) - f_ds_mb2(ds,10) - f_ds_mb2(ds,100) - f_ds_mb2(ds,1000) - f_ds_mb2(ds,10000) - - -if __name__=='__main__': - test_ArrayDataSet() - #test_CachedDataSet() - #test_ApplyFunctionDataSet() - diff -r 4e6b550fe131 -r 174374d59405 test_filetensor.py --- a/test_filetensor.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -from filetensor import * -import filetensor - -import unittest -import os - -class T(unittest.TestCase): - fname = '/tmp/some_mat' - - def setUp(self): - #TODO: test that /tmp/some_mat does not exist - try: - os.stat(self.fname) - except OSError: - return #assume file was not found - raise Exception('autotest file "%s" exists!' 
% self.fname) - - def tearDown(self): - os.remove(self.fname) - - def test_file(self): - gen = numpy.random.rand(1) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_filename(self): - gen = numpy.random.rand(1) - write(self.fname, gen) - mat = read(self.fname, None, debug=False) #load from filename - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def testNd(self): - """shape and values are stored correctly for tensors of rank 0 to 5""" - whole_shape = [5, 6, 7, 8, 9] - for i in xrange(5): - gen = numpy.asarray(numpy.random.rand(*whole_shape[:i])) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_dtypes(self): - """shape and values are stored correctly for all dtypes """ - for dtype in filetensor._dtype_magic: - gen = numpy.asarray( - numpy.random.rand(4, 5, 2, 1) * 100, - dtype=dtype) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.dtype == mat.dtype) - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_dtype_invalid(self): - gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype - f = file(self.fname, 'w') - passed = False - try: - write(f, gen) - except TypeError, e: - if e[0].startswith('Invalid ndarray dtype'): - passed = True - f.close() - self.failUnless(passed) - - -if __name__ == '__main__': - unittest.main() - - #a small test script, starts by reading sys.argv[1] - #print 'rval', rval.shape, rval.size - - if 0: - write(f, rval) - print '' - f.close() - f = file('/tmp/some_mat', 'r'); - rval2 = read(f) #load from file handle - print 'rval2', rval2.shape, rval2.size - - assert rval.dtype == rval2.dtype - assert rval.shape == rval2.shape - assert numpy.all(rval == rval2) - print 'ok' - - def _unused(): - f.seek(0,2) #seek to end - f_len = f.tell() - f.seek(f_data_start,0) #seek back to where we were - - if debug: print 'length:', f_len - - - f_data_bytes = (f_len - f_data_start) - - if debug: print 'data bytes according to header: ', dim_size * elsize - if debug: print 'data bytes according to file : ', f_data_bytes - - if debug: print 'reading data...' - sys.stdout.flush() - - def read_ndarray(f, dim, dtype): - return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim) - diff -r 4e6b550fe131 -r 174374d59405 test_lookup_list.py --- a/test_lookup_list.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -from lookup_list import * -def have_raised(to_eval, **var): - have_thrown = False - try: - eval(to_eval) - except : - have_thrown = True - return have_thrown - -def have_raised2(f, *args, **kwargs): - have_thrown = False - try: - f(*args, **kwargs) - except : - have_thrown = True - return have_thrown - - -def test_LookupList(): - #test only the example in the doc??? 
- print "test_LookupList" - example = LookupList(['x','y','z'],[1,2,3]) - example['x'] = [1, 2, 3] # set or change a field - x, y, z = example - x = example[0] - x = example["x"] - assert example.keys()==['x','y','z'] - assert example.values()==[[1,2,3],2,3] - assert example.items()==[('x',[1,2,3]),('y',2),('z',3)] - example.append_keyval('u',0) # adds item with name 'u' and value 0 - assert len(example)==4 # number of items = 4 here - example2 = LookupList(['v','w'], ['a','b']) - example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b']) - assert example+example2==example3 - assert have_raised("var['x']+var['x']",x=example) - - del example, example2, example3, x, y ,z - -if __name__=='__main__': - test_LookupList() diff -r 4e6b550fe131 -r 174374d59405 test_mlp.py --- a/test_mlp.py Thu Jun 05 18:43:16 2008 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ - -from mlp import * -import dataset -import nnet_ops - - -from functools import partial -def separator(debugger, i, node, *ths): - print "===================" - -def what(debugger, i, node, *ths): - print "#%i" % i, node - -def parents(debugger, i, node, *ths): - print [input.step for input in node.inputs] - -def input_shapes(debugger, i, node, *ths): - print "input shapes: ", - for r in node.inputs: - if hasattr(r.value, 'shape'): - print r.value.shape, - else: - print "no_shape", - print - -def input_types(debugger, i, node, *ths): - print "input types: ", - for r in node.inputs: - print r.type, - print - -def output_shapes(debugger, i, node, *ths): - print "output shapes:", - for r in node.outputs: - if hasattr(r.value, 'shape'): - print r.value.shape, - else: - print "no_shape", - print - -def output_types(debugger, i, node, *ths): - print "output types:", - for r in node.outputs: - print r.type, - print - - -def test0(): - linker = 'c|py' - #linker = partial(theano.gof.DebugLinker, linkers = [theano.gof.OpWiseCLinker], - # debug_pre = [separator, what, parents, input_types, input_shapes], - # debug_post = [output_shapes, output_types], - # compare_fn = lambda x, y: numpy.all(x == y)) - - nnet = OneHiddenLayerNNetClassifier(10,2,.001,1000, linker = linker) - training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 1], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - fprop=nnet(training_set) - - output_ds = fprop(training_set) - - for fieldname in output_ds.fieldNames(): - print fieldname+"=",output_ds[fieldname] - -def test1(): - nnet = ManualNNet(2, 10,3,.1,1000) - training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 1], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - fprop=nnet(training_set) - - output_ds = fprop(training_set) - - for fieldname in output_ds.fieldNames(): - print fieldname+"=",output_ds[fieldname] - -def test2(): - training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 1], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - nin, nhid=2, 10 - def sigm_layer(input): - W1 = t.matrix('W1') - b1 = t.vector('b1') - return (nnet_ops.sigmoid(b1 + t.dot(input, W1)), - [W1, b1], - [(numpy.random.rand(nin, nhid) -0.5) * 0.001, numpy.zeros(nhid)]) - nnet = ManualNNet(nin, nhid, 3, .1, 1000, hidden_layer=sigm_layer) - fprop=nnet(training_set) - - output_ds = fprop(training_set) - - for fieldname in output_ds.fieldNames(): - print fieldname+"=",output_ds[fieldname] - -def test_interface_0(): - learner = ManualNNet(2, 10, 3, .1, 1000) - - model = learner(training_set) - - model2 = learner(training_set) # trains 
model a second time - - learner.update(additional_data) # modifies nnet and model by side-effect - - -def test_interface2_1(): - learn_algo = ManualNNet(2, 10, 3, .1, 1000) - - prior = learn_algo() - - model1 = learn_algo(training_set1) - - model2 = learn_algo(training_set2) - - model2.update(additional_data) - - n_match = 0 - for o1, o2 in zip(model1.use(test_data), model2.use(test_data)): - n_match += (o1 == o2) - - print n_match - -test1() -test2() - diff -r 4e6b550fe131 -r 174374d59405 test_speed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_speed.py Fri Jun 06 15:56:18 2008 -0400 @@ -0,0 +1,79 @@ +import numpy +from dataset import * +from misc import * +def test_speed(array, ds): + print "test_speed", ds.__class__ + + mat = numpy.random.rand(400,100) + + @print_timing + def f_array_full(a): + a+1 + @print_timing + def f_array_index(a): + for id in range(a.shape[0]): +# pass + a[id]+1 +# a[id]*mat + @print_timing + def f_array_iter(a): + for r in a: +# pass + r+1 +# r*mat + @print_timing + def f_ds_index(ds): + for id in range(len(ds)): +# pass + ds[id][0]+1 +# ds[id][0]*mat + @print_timing + def f_ds_iter(ds): + for ex in ds: +# pass + ex[0]+1 +# a[0]*mat + @print_timing + def f_ds_mb1(ds,mb_size): + for exs in ds.minibatches(minibatch_size = mb_size): + for ex in exs: +# pass + ex[0]+1 +# ex[0]*mat + @print_timing + def f_ds_mb2(ds,mb_size): + for exs in ds.minibatches(minibatch_size = mb_size): +# pass + exs[0]+1 +# ex[0]*mat + + f_array_full(array) + f_array_index(array) + f_array_iter(array) + + f_ds_index(ds) + f_ds_iter(ds) + + f_ds_mb1(ds,10) + f_ds_mb1(ds,100) + f_ds_mb1(ds,1000) + f_ds_mb1(ds,10000) + f_ds_mb2(ds,10) + f_ds_mb2(ds,100) + f_ds_mb2(ds,1000) + f_ds_mb2(ds,10000) + +if __name__=='__main__': + a2 = numpy.random.rand(100000,400) + ds1 = ArrayDataSet(a2,{'all':slice(0,a2.shape[1],1)}) + test_speed(a2,ds1) + a1 = numpy.random.rand(100000,40) + ds4 = ArrayDataSet(a1,LookupList(["f"+str(x)for x in range(a1.shape[1])], + range(a1.shape[1]))) + test_speed(a2,ds4) + ds2=CachedDataSet(ds1,cache_all_upon_construction=False) + test_speed(a2,ds2) + ds3=CachedDataSet(ds1,cache_all_upon_construction=True) + test_speed(a2,ds3) + del a2,ds1,ds2,ds3 +
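
For reference, the rewritten _test_dataset.py exercises ArrayDataSet through the field specification {'x': slice(3), 'y': 3, 'z': [0, 2]} applied to a (10, 4) array. The short sketch below is an illustration rather than part of the changeset: it assumes only numpy (ArrayDataSet itself lives in the dataset module touched above) and shows how that specification maps array columns to the fields asserted throughout the tests, with 'x' covering columns 0-2, 'y' the scalar column 3, and 'z' columns 0 and 2.

    import numpy

    a = numpy.random.rand(10, 4)
    fields = {'x': slice(3), 'y': 3, 'z': [0, 2]}

    for row in a:
        x, y, z = row[fields['x']], row[fields['y']], row[fields['z']]
        assert (x == row[:3]).all()      # 'x' spans the first three columns
        assert y == row[3]               # 'y' is the scalar fourth column
        assert (z == row[0:3:2]).all()   # 'z' picks out columns 0 and 2

The new autotest.py runner collects these cases automatically: it scans the working directory for modules named _test*.py, imports each one, loads whatever unittest cases it defines, and runs the aggregated suite with TextTestRunner (or with suite.debug() when --debug is passed).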