# HG changeset patch
# User Joseph Turian
# Date 1238460484 14400
# Node ID 8fff4bc26f4c1204dca826872634248c2a21fd66
# Parent  27b1344a57b17ede4e62f5b47ac3346e5ea1f231
# Parent  9e62fd6b6677586a4a75f53f722196072fa891fa
merge

diff -r 27b1344a57b1 -r 8fff4bc26f4c .hgtags
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgtags	Mon Mar 30 20:48:04 2009 -0400
@@ -0,0 +1,1 @@
+5f9ffefa9ca8040e18e1a69fdbbe34b8a19099bc sequencelabelling 20090130-rerun
diff -r 27b1344a57b1 -r 8fff4bc26f4c LICENSE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Mon Mar 30 20:48:04 2009 -0400
@@ -0,0 +1,24 @@
+Copyright (c) 2008, Theano Development Team
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Theano nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff -r 27b1344a57b1 -r 8fff4bc26f4c README.txt diff -r 27b1344a57b1 -r 8fff4bc26f4c __init__.py --- a/__init__.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -import filetensor -import version -import learner - -from lookup_list import LookupList - -def __src_version__(): - #todo - this is vulnerable to the bug in theano ticket #160 - return version.src_version(__name__) - diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_dataset.py --- a/_test_dataset.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,680 +0,0 @@ -#!/bin/env python -from dataset import * -from math import * -import numpy, unittest, sys -#from misc import * -from lookup_list import LookupList - -def have_raised(to_eval, **var): - have_thrown = False - try: - eval(to_eval) - except : - have_thrown = True - return have_thrown - -def have_raised2(f, *args, **kwargs): - have_thrown = False - try: - f(*args, **kwargs) - except : - have_thrown = True - return have_thrown - -def test1(): - print "test1" - global a,ds - a = numpy.random.rand(10,4) - print a - ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]}) - print "len(ds)=",len(ds) - assert(len(ds)==10) - print "example 0 = ",ds[0] -# assert - print "x=",ds["x"] - print "x|y" - for x,y in ds("x","y"): - print x,y - minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4) - minibatch = minibatch_iterator.__iter__().next() - print "minibatch=",minibatch - for var in minibatch: - print "var=",var - print "take a slice and look at field y",ds[1:6:2]["y"] - - del a,ds,x,y,minibatch_iterator,minibatch,var - -def test_iterate_over_examples(array,ds): -#not in doc!!! - i=0 - for example in range(len(ds)): - wanted = array[example][:3] - returned = ds[example]['x'] - if (wanted != returned).all(): - print 'returned:', returned - print 'wanted:', wanted - assert (ds[example]['x']==array[example][:3]).all() - assert ds[example]['y']==array[example][3] - assert (ds[example]['z']==array[example][[0,2]]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for example in dataset: - i=0 - for example in ds: - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,... 
in dataset: - i=0 - for x,y,z in ds: - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - -# - for example in dataset(field1, field2,field3, ...): - i=0 - for example in ds('x','y','z'): - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - i=0 - for example in ds('y','x'): - assert len(example)==2 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,val3 in dataset(field1, field2,field3): - i=0 - for x,y,z in ds('x','y','z'): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - i=0 - for y,x in ds('y','x',): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,i - - def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished): - ##full minibatch or the last minibatch - for idx in range(nb_field): - test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished) - del idx - def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished): - assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)2: - ds[:1] - ds[1:1] - ds[1:1:1] - if len(ds)>5: - ds[[1,2,3]] - for x in ds: - pass - -#ds[:n] returns a LookupList with the n first examples. - ds2=ds[:3] - test_ds(ds,ds2,index=[0,1,2]) - del ds2 - -#ds[i:j] returns a LookupList with examples i,i+1,...,j-1. - ds2=ds[1:3] - test_ds(ds,ds2,index=[1,2]) - del ds2 - -#ds[i1:i2:s] returns a LookupList with the examples i1,i1+s,...i2-s. - ds2=ds[1:7:2] - test_ds(ds,ds2,[1,3,5]) - del ds2 - -#ds[i] returns the (i+1)-th example of the dataset. - ds2=ds[5] - assert isinstance(ds2,Example) - test_ds(ds,ds2,[5]) - assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined - assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) - del ds2 - -#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. - ds2=ds[[4,7,2,8]] -# assert isinstance(ds2,DataSet) - test_ds(ds,ds2,[4,7,2,8]) - del ds2 - - #ds.# returns the value of a property associated with - #the name . The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) - - #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds - assert have_raised2(hstack,[ds('x'),ds('x')]) - assert have_raised2(hstack,[ds('y','x'),ds('x')]) - assert not have_raised2(hstack,[ds('x'),ds('y')]) - - # i=0 - # for example in hstack([ds('x'),ds('y'),ds('z')]): - # example==ds[i] - # i+=1 - # del i,example - #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? 
- -def test_subset(array,ds): - def test_ds(orig,ds,index): - i=0 - assert isinstance(ds2,DataSet) - assert len(ds)==len(index) - for x,z,y in ds('x','z','y'): - assert (orig[index[i]]['x']==array[index[i]][:3]).all() - assert (orig[index[i]]['x']==x).all() - assert orig[index[i]]['y']==array[index[i]][3] - assert orig[index[i]]['y']==y - assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all() - assert (orig[index[i]]['z']==z).all() - i+=1 - del i - ds[0] - if len(ds)>2: - ds[:1] - ds[1:1] - ds[1:1:1] - if len(ds)>5: - ds[[1,2,3]] - for x in ds: - pass - -#ds[:n] returns a dataset with the n first examples. - ds2=ds.subset[:3] - test_ds(ds,ds2,index=[0,1,2]) -# del ds2 - -#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s. - ds2=ds.subset[1:7:2] - test_ds(ds,ds2,[1,3,5]) -# del ds2 - -# #ds[i] -# ds2=ds.subset[5] -# assert isinstance(ds2,Example) -# assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined -# assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) -# del ds2 - -#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. - ds2=ds.subset[[4,7,2,8]] - test_ds(ds,ds2,[4,7,2,8]) -# del ds2 - -#ds.# returns the value of a property associated with - #the name . The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) - -#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds - assert have_raised2(hstack,[ds('x'),ds('x')]) - assert have_raised2(hstack,[ds('y','x'),ds('x')]) - assert not have_raised2(hstack,[ds('x'),ds('y')]) - -# i=0 -# for example in hstack([ds('x'),ds('y'),ds('z')]): -# example==ds[i] -# i+=1 -# del i,example -#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? - -def test_fields_fct(ds): - #@todo, fill correctly - assert len(ds.fields())==3 - i=0 - v=0 - for field in ds.fields(): - for field_value in field: # iterate over the values associated to that field for all the ds examples - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - i=0 - v=0 - for field in ds('x','z').fields(): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field in ds.fields('x','y'): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field_examples in ds.fields(): - for example_value in field_examples: - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - assert ds == ds.fields().examples() - assert len(ds('x','y').fields()) == 2 - assert len(ds('x','z').fields()) == 2 - assert len(ds('y').fields()) == 1 - - del field - -def test_overrides(ds) : - """ Test for examples that an override __getitem__ acts as the one in DataSet """ - def ndarray_list_equal(nda,l) : - """ - Compares if a ndarray is the same as the list. 
Do it by converting the list into - an numpy.ndarray, if possible - """ - try : - l = numpy.asmatrix(l) - except : - return False - return smart_equal(nda,l) - - def smart_equal(a1,a2) : - """ - Handles numpy.ndarray, LookupList, and basic containers - """ - if not isinstance(a1,type(a2)) and not isinstance(a2,type(a1)): - #special case: matrix vs list of arrays - if isinstance(a1,numpy.ndarray) : - return ndarray_list_equal(a1,a2) - elif isinstance(a2,numpy.ndarray) : - return ndarray_list_equal(a2,a1) - return False - # compares 2 numpy.ndarray - if isinstance(a1,numpy.ndarray): - if len(a1.shape) != len(a2.shape): - return False - for k in range(len(a1.shape)) : - if a1.shape[k] != a2.shape[k]: - return False - return (a1==a2).all() - # compares 2 lookuplists - if isinstance(a1,LookupList) : - if len(a1._names) != len(a2._names) : - return False - for k in a1._names : - if k not in a2._names : - return False - if not smart_equal(a1[k],a2[k]) : - return False - return True - # compares 2 basic containers - if hasattr(a1,'__len__'): - if len(a1) != len(a2) : - return False - for k in range(len(a1)) : - if not smart_equal(a1[k],a2[k]): - return False - return True - # try basic equals - return a1 is a2 - - def mask(ds) : - class TestOverride(type(ds)): - def __init__(self,ds) : - self.ds = ds - def __getitem__(self,key) : - res1 = self.ds[key] - res2 = DataSet.__getitem__(ds,key) - assert smart_equal(res1,res2) - return res1 - return TestOverride(ds) - # test getitem - ds2 = mask(ds) - for k in range(10): - res = ds2[k] - res = ds2[1:len(ds):3] - - - - - - -def test_all(array,ds): - assert len(ds)==10 - test_iterate_over_examples(array, ds) - test_overrides(ds) - test_getitem(array, ds) - test_subset(array, ds) - test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z')) - test_fields_fct(ds) - - -class T_DataSet(unittest.TestCase): - def test_ArrayDataSet(self): - #don't test stream - #tested only with float value - #don't always test with y - #don't test missing value - #don't test with tuple - #don't test proterties - a2 = numpy.random.rand(10,4) - ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested - ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - #assert ds==a? should this work? 
- - test_all(a2,ds) - - del a2, ds - - def test_CachedDataSet(self): - a = numpy.random.rand(10,4) - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - ds2 = CachedDataSet(ds1) - ds3 = CachedDataSet(ds1,cache_all_upon_construction=True) - - test_all(a,ds2) - test_all(a,ds3) - - del a,ds1,ds2,ds3 - - - def test_DataSetFields(self): - raise NotImplementedError() - - def test_ApplyFunctionDataSet(self): - a = numpy.random.rand(10,4) - a2 = a+1 - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - - ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False) - ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1), - ['x','y','z'], - minibatch_mode=True) - - test_all(a2,ds2) - test_all(a2,ds3) - - del a,ds1,ds2,ds3 - - def test_FieldsSubsetDataSet(self): - a = numpy.random.rand(10,4) - ds = ArrayDataSet(a,Example(['x','y','z','w'],[slice(3),3,[0,2],0])) - ds = FieldsSubsetDataSet(ds,['x','y','z']) - - test_all(a,ds) - - del a, ds - - def test_RenamedFieldsDataSet(self): - a = numpy.random.rand(10,4) - ds = ArrayDataSet(a,Example(['x1','y1','z1','w1'],[slice(3),3,[0,2],0])) - ds = RenamedFieldsDataSet(ds,['x1','y1','z1'],['x','y','z']) - - test_all(a,ds) - - del a, ds - - def test_MinibatchDataSet(self): - raise NotImplementedError() - def test_HStackedDataSet(self): - raise NotImplementedError() - def test_VStackedDataSet(self): - raise NotImplementedError() - def test_ArrayFieldsDataSet(self): - raise NotImplementedError() - - -class T_Exotic1(unittest.TestCase): - class DataSet(DataSet): - """ Dummy dataset, where one field is a ndarray of variables size. """ - def __len__(self) : - return 100 - def fieldNames(self) : - return 'input','target','name' - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class MultiLengthDataSetIterator(object): - def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): - if fieldnames is None: fieldnames = dataset.fieldNames() - self.minibatch = Example(fieldnames,range(len(fieldnames))) - self.dataset, self.minibatch_size, self.current = dataset, minibatch_size, offset - def __iter__(self): - return self - def next(self): - for k in self.minibatch._names : - self.minibatch[k] = [] - for ex in range(self.minibatch_size) : - if 'input' in self.minibatch._names: - self.minibatch['input'].append( numpy.array( range(self.current + 1) ) ) - if 'target' in self.minibatch._names: - self.minibatch['target'].append( self.current % 2 ) - if 'name' in self.minibatch._names: - self.minibatch['name'].append( str(self.current) ) - self.current += 1 - return self.minibatch - return MultiLengthDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - def test_ApplyFunctionDataSet(self): - ds = T_Exotic1.DataSet() - dsa = ApplyFunctionDataSet(ds,lambda x,y,z: (x[-1],y*10,int(z)),['input','target','name'],minibatch_mode=False) #broken!!!!!! 
- for k in range(len(dsa)): - res = dsa[k] - self.failUnless(ds[k]('input')[0][-1] == res('input')[0] , 'problem in first applied function') - res = dsa[33:96:3] - - def test_CachedDataSet(self): - ds = T_Exotic1.DataSet() - dsc = CachedDataSet(ds) - for k in range(len(dsc)) : - self.failUnless(numpy.all( dsc[k]('input')[0] == ds[k]('input')[0] ) , (dsc[k],ds[k]) ) - res = dsc[:] - -if __name__=='__main__': - tests = [] - debug=False - if len(sys.argv)==1: - unittest.main() - else: - assert sys.argv[1]=="--debug" - for arg in sys.argv[2:]: - tests.append(arg) - if tests: - unittest.TestSuite(map(T_DataSet, tests)).debug() - else: - module = __import__("_test_dataset") - tests = unittest.TestLoader().loadTestsFromModule(module) - tests.debug() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_filetensor.py --- a/_test_filetensor.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -from filetensor import * -import filetensor - -import unittest -import os - -class T(unittest.TestCase): - fname = '/tmp/some_mat' - - def setUp(self): - #TODO: test that /tmp/some_mat does not exist - try: - os.stat(self.fname) - except OSError: - return #assume file was not found - raise Exception('autotest file "%s" exists!' % self.fname) - - def tearDown(self): - os.remove(self.fname) - - def test_file(self): - gen = numpy.random.rand(1) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_filename(self): - gen = numpy.random.rand(1) - f = file(self.fname, 'w') - write(f, gen) - f.close() - f = file(self.fname, 'r') - mat = read(f, None, debug=False) #load from filename - f.close() - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def testNd(self): - """shape and values are stored correctly for tensors of rank 0 to 5""" - whole_shape = [5, 6, 7, 8, 9] - for i in xrange(5): - gen = numpy.asarray(numpy.random.rand(*whole_shape[:i])) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_dtypes(self): - """shape and values are stored correctly for all dtypes """ - for dtype in filetensor._dtype_magic: - gen = numpy.asarray( - numpy.random.rand(4, 5, 2, 1) * 100, - dtype=dtype) - f = file(self.fname, 'w'); - write(f, gen) - f.flush() - f = file(self.fname, 'r'); - mat = read(f, None, debug=False) #load from filename - self.failUnless(gen.dtype == mat.dtype) - self.failUnless(gen.shape == mat.shape) - self.failUnless(numpy.all(gen == mat)) - - def test_dtype_invalid(self): - gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype - f = file(self.fname, 'w') - passed = False - try: - write(f, gen) - except TypeError, e: - if e[0].startswith('Invalid ndarray dtype'): - passed = True - f.close() - self.failUnless(passed) - - -if __name__ == '__main__': - unittest.main() - - #a small test script, starts by reading sys.argv[1] - #print 'rval', rval.shape, rval.size - - if 0: - write(f, rval) - print '' - f.close() - f = file('/tmp/some_mat', 'r'); - rval2 = read(f) #load from file handle - print 'rval2', rval2.shape, rval2.size - - assert rval.dtype == rval2.dtype - assert rval.shape == rval2.shape - assert numpy.all(rval == rval2) - print 'ok' - - def _unused(): - f.seek(0,2) 
#seek to end - f_len = f.tell() - f.seek(f_data_start,0) #seek back to where we were - - if debug: print 'length:', f_len - - - f_data_bytes = (f_len - f_data_start) - - if debug: print 'data bytes according to header: ', dim_size * elsize - if debug: print 'data bytes according to file : ', f_data_bytes - - if debug: print 'reading data...' - sys.stdout.flush() - - def read_ndarray(f, dim, dtype): - return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim) - diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_linear_regression.py --- a/_test_linear_regression.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - -import unittest -from linear_regression import * -from make_test_datasets import * -import numpy - -class test_linear_regression(unittest.TestCase): - - def test1(self): - trainset,testset,theta=make_artificial_datasets_from_function(n_inputs=3, - n_targets=2, - n_examples=100, - f=linear_predictor) - - assert trainset.fields()['input'].shape==(50,3) - assert testset.fields()['target'].shape==(50,2) - regressor = LinearRegression(L2_regularizer=0.1) - predictor = regressor(trainset) - test_data = testset.fields() - mse = predictor.compute_mse(test_data['input'],test_data['target']) - print 'mse = ',mse - -if __name__ == '__main__': - import sys - - if len(sys.argv)==1: - unittest.main() - else: - assert sys.argv[1]=="--debug" - tests = [] - for arg in sys.argv[2:]: - tests.append(arg) - if tests: - unittest.TestSuite(map(T_DataSet, tests)).debug() - else: - module = __import__("_test_linear_regression") - tests = unittest.TestLoader().loadTestsFromModule(module) - tests.debug() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_lookup_list.py --- a/_test_lookup_list.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -from lookup_list import * -import unittest - -class T_LookUpList(unittest.TestCase): - def test_LookupList(self): - #test only the example in the doc??? 
- example = LookupList(['x','y','z'],[1,2,3]) - example['x'] = [1, 2, 3] # set or change a field - x, y, z = example - x = example[0] - x = example["x"] - assert example.keys()==['x','y','z'] - assert example.values()==[[1,2,3],2,3] - assert example.items()==[('x',[1,2,3]),('y',2),('z',3)] - example.append_keyval('u',0) # adds item with name 'u' and value 0 - assert len(example)==4 # number of items = 4 here - example2 = LookupList(['v','w'], ['a','b']) - example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b']) - assert example+example2==example3 - self.assertRaises(AssertionError,example.__add__,example) - del example, example2, example3, x, y ,z - -if __name__=='__main__': - unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_nnet_ops.py --- a/_test_nnet_ops.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ - -import unittest -import theano -import theano._test_tensor as TT -import numpy - -from nnet_ops import * - -class T_sigmoid(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test_elemwise(self): - TT.verify_grad(self, sigmoid, [numpy.random.rand(3,4)]) - -class T_softplus(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test_elemwise(self): - TT.verify_grad(self, softplus, [numpy.random.rand(3,4)]) - -class T_Softmax(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test0(self): - class Dummy(object): - def make_node(self, a): - return [softmax(a)[:,0]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - def test1(self): - class Dummy(object): - def make_node(self, a): - return [softmax(a)[:,1]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - def test2(self): - class Dummy(object): - def make_node(self, a): - return [softmax(a)[:,2]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - def test3(self): - class Dummy(object): - def make_node(self, a): - return [softmax(a)[:,3]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - - -class T_SoftmaxWithBias(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test0(self): - class Dummy(object): - def make_node(self, a, b): - return [softmax_with_bias(a, b)[:,0]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - def test1(self): - class Dummy(object): - def make_node(self, a, b): - return [softmax_with_bias(a, b)[:,1]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - def test2(self): - class Dummy(object): - def make_node(self, a, b): - return [softmax_with_bias(a, b)[:,2]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - def test3(self): - class Dummy(object): - def make_node(self, a, b): - return [softmax_with_bias(a, b)[:,3]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - -class T_CrossentropySoftmax1Hot(unittest.TestCase): - def setUp(self): - numpy.random.seed(9999) - def test0(self): - y_idx = [0,1,3] - class Dummy(object): - def make_node(self, a,b): - return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0:1] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4), - numpy.random.rand(4)]) - - def test1(self): - y_idx = [0,1,3] - class Dummy(object): - def make_node(self, a): - return crossentropy_softmax_1hot(a, y_idx)[0:1] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - -class T_prepend(unittest.TestCase): - def test0(self): - """basic functionality""" - x=tensor.matrix('x') - 
y=Prepend_scalar_constant_to_each_row(4.)(x) - f=theano.function([x],[y]) - m=numpy.random.rand(3,5) - my = f(m) - self.failUnless(my.shape == (3, 6), my.shape) - self.failUnless(numpy.all( my[:,0] == 4.0)) - - -class T_prepend(unittest.TestCase): - def test0(self): - """basic functionality""" - x=tensor.matrix('x') - y=Prepend_scalar_to_each_row()(5.,x) - f=theano.function([x],[y]) - m=numpy.ones((3,5),dtype="float32") - my = f(m) - self.failUnless(str(my.dtype) == 'float64') - self.failUnless(my.shape == (3, 6)) - self.failUnless(numpy.all(my[:,0] == 5.0)) - -class T_solve(unittest.TestCase): - def setUp(self): - self.rng = numpy.random.RandomState(666) - - def test0(self): - A=self.rng.randn(5,5) - b=numpy.array(range(5),dtype=float) - x=numpy.linalg.solve(A,b) - Ax = numpy.dot(A,x) - are = theano.gradient.numeric_grad.abs_rel_err(Ax, b) - self.failUnless(numpy.all(are < 1.0e-5), (are, Ax, b)) - #print A,b - #print numpy.dot(A,x) - - -if __name__ == '__main__': - unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_onehotop.py --- a/_test_onehotop.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -from onehotop import one_hot - -import unittest -from theano import compile -from theano import gradient -from theano import function -from theano.tensor import as_tensor - -import random -import numpy.random - -class T_OneHot(unittest.TestCase): - def test0(self): - x = as_tensor([3, 2, 1]) - y = as_tensor(5) - o = one_hot(x, y) - f = function([],o) - self.failUnless(numpy.all(f() == numpy.asarray([[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]]))) - -if __name__ == '__main__': - unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_random_transformation.py --- a/_test_random_transformation.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -from random_transformation import row_random_transformation - -import unittest -from theano import compile -from theano import gradient - -from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result -from theano.sparse import _mtypes, _mtype_to_str -from theano.sparse import as_sparse - -from theano.tensor import as_tensor -from theano.scalar import as_scalar - -import random -import numpy.random - -class T_RowRandomTransformation(unittest.TestCase): - def setUp(self): - random.seed(44) - numpy.random.seed(44) - - def test_basic(self): - rows = 4 - cols = 20 - fakeseed = 0 - length = 3 - md = numpy.random.rand(rows, cols) - for mtype in _mtypes: - m = as_sparse(mtype(md)) - o = row_random_transformation(m, length, initial_seed=fakeseed) - y = compile.eval_outputs([o]) - expected = "[[ 0.88239119 1.03244463 -1.29297503]\n [ 0.02644961 1.50119695 -0.025081 ]\n [-0.60741013 1.25424625 0.30119422]\n [-1.08659967 -0.35531544 -1.38915467]]" - self.failUnless(str(y) == expected) - - def test_length(self): - """ Test that if length is increased, we obtain the same results - (except longer). 
""" - - for i in range(10): - mtype = random.choice(_mtypes) - rows = random.randint(1, 20) - cols = random.randint(1, 20) - fakeseed = random.randint(0, 100) - length = random.randint(1, 10) - extralength = random.randint(1, 10) - - m = as_sparse(mtype(numpy.random.rand(rows, cols))) - o1 = row_random_transformation(m, length, initial_seed=fakeseed) - o2 = row_random_transformation(m, length + extralength, initial_seed=fakeseed) - - y1 = compile.eval_outputs([o1]) - y2 = compile.eval_outputs([o2]) - - self.failUnless((y1 == y2[:,:length]).all()) - - def test_permute(self): - """ Test that if the order of the rows is permuted, we obtain the same results. """ - for i in range(10): - mtype = random.choice(_mtypes) - rows = random.randint(2, 20) - cols = random.randint(1, 20) - fakeseed = random.randint(0, 100) - length = random.randint(1, 10) - - permute = numpy.random.permutation(rows) - - - m1 = numpy.random.rand(rows, cols) - m2 = m1[permute] - for r in range(rows): - self.failUnless((m2[r] == m1[permute[r]]).all()) - s1 = as_sparse(mtype(m1)) - s2 = as_sparse(mtype(m2)) - o1 = row_random_transformation(s1, length, initial_seed=fakeseed) - o2 = row_random_transformation(s2, length, initial_seed=fakeseed) - y1 = compile.eval_outputs([o1]) - y2 = compile.eval_outputs([o2]) - - self.failUnless(y1.shape == y2.shape) - for r in range(rows): - self.failUnless((y2[r] == y1[permute[r]]).all()) - -if __name__ == '__main__': - unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c _test_xlogx.py --- a/_test_xlogx.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -from xlogx import xlogx - -import unittest - -from theano import compile -from theano import gradient -from theano import function -from theano.tensor import as_tensor -import theano.tensor.basic as TT - -import random -import numpy.random - -class T_XlogX(unittest.TestCase): - def test0(self): - x = as_tensor([1, 0]) - y = xlogx(x) - f = function([],y) - self.failUnless(numpy.all(f() == numpy.asarray([0, 0.]))) - def test1(self): - class Dummy(object): - def make_node(self, a): - return [xlogx(a)[:,2]] - TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)]) - - -if __name__ == '__main__': - unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c activation.py --- a/activation.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -""" -Activation functions. - -@todo: Make an Activation function class, with a particular contract. -That way, we can swap in Activation functions in our algorithms. 
-""" diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/__init__.py --- a/algorithms/__init__.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ - -from regressor import Regressor, BinRegressor -from aa import AutoEncoder, SigmoidXEAutoEncoder -from daa import DenoisingAA, SigmoidXEDenoisingAA -from stacker import Stacker diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/_test_logistic_regression.py --- a/algorithms/_test_logistic_regression.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -from logistic_regression import * -import sys, time - -if __name__ == '__main__': - pprint.assign(nnet.crossentropy_softmax_1hot_with_bias_dx, printing.FunctionPrinter('xsoftmaxdx')) - pprint.assign(nnet.crossentropy_softmax_argmax_1hot_with_bias, printing.FunctionPrinter('nll', 'softmax', 'argmax')) - if 1: - lrc = Module_Nclass() - - print '================' - print lrc.update.pretty() - print '================' - print lrc.update.pretty(mode = theano.Mode('py', 'fast_run')) - print '================' -# print lrc.update.pretty(mode = compile.FAST_RUN.excluding('inplace')) -# print '================' - -# sys.exit(0) - - lr = lrc.make(10, 2, mode=theano.Mode('c|py', 'fast_run')) - #lr = lrc.make(10, 2, mode=compile.FAST_RUN.excluding('fast_run')) - #lr = lrc.make(10, 2, mode=theano.Mode('py', 'merge')) #'FAST_RUN') - - data_x = N.random.randn(5, 10) - data_y = (N.random.randn(5) > 0) - - t = time.time() - for i in xrange(10000): - lr.lr = 0.02 - xe = lr.update(data_x, data_y) - #if i % 100 == 0: - # print i, xe - - print 'training time:', time.time() - t - print 'final error', xe - - #print - #print 'TRAINED MODEL:' - #print lr - - if 0: - lrc = Module() - - lr = lrc.make(10, mode=theano.Mode('c|py', 'merge')) #'FAST_RUN') - - data_x = N.random.randn(5, 10) - data_y = (N.random.randn(5, 1) > 0) - - for i in xrange(10000): - xe = lr.update(data_x, data_y) - if i % 100 == 0: - print i, xe - - print - print 'TRAINED MODEL:' - print lr - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/aa.py --- a/algorithms/aa.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ - -import theano -from theano import tensor as T -from theano.tensor import nnet as NN -import numpy as N - -class AutoEncoder(theano.FancyModule): - - def __init__(self, input = None, regularize = True, tie_weights = True): - super(AutoEncoder, self).__init__() - - # MODEL CONFIGURATION - self.regularize = regularize - self.tie_weights = tie_weights - - # ACQUIRE/MAKE INPUT - if not input: - input = T.matrix('input') - self.input = theano.External(input) - - # HYPER-PARAMETERS - self.lr = theano.Member(T.scalar()) - - # PARAMETERS - self.w1 = theano.Member(T.matrix()) - if not tie_weights: - self.w2 = theano.Member(T.matrix()) - else: - self.w2 = self.w1.T - self.b1 = theano.Member(T.vector()) - self.b2 = theano.Member(T.vector()) - - # HIDDEN LAYER - self.hidden_activation = T.dot(input, self.w1) + self.b1 - self.hidden = self.build_hidden() - - # RECONSTRUCTION LAYER - self.output_activation = T.dot(self.hidden, self.w2) + self.b2 - self.output = self.build_output() - - # RECONSTRUCTION COST - self.reconstruction_cost = self.build_reconstruction_cost() - - # REGULARIZATION COST - self.regularization = self.build_regularization() - - # TOTAL COST - self.cost = self.reconstruction_cost - if self.regularize: - self.cost = self.cost + self.regularization - - # GRADIENTS AND UPDATES - if self.tie_weights: - 
self.params = self.w1, self.b1, self.b2 - else: - self.params = self.w1, self.w2, self.b1, self.b2 - gradients = T.grad(self.cost, self.params) - updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) - - # INTERFACE METHODS - self.update = theano.Method(input, self.cost, updates) - self.reconstruction = theano.Method(input, self.output) - self.representation = theano.Method(input, self.hidden) - - def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): - if (input_size is None) ^ (hidden_size is None): - raise ValueError("Must specify hidden_size and target_size or neither.") - super(AutoEncoder, self)._instance_initialize(obj, **init) - if seed is not None: - R = N.random.RandomState(seed) - else: - R = N.random - if input_size is not None: - sz = (input_size, hidden_size) - range = 1/N.sqrt(input_size) - obj.w1 = R.uniform(size = sz, low = -range, high = range) - if not self.tie_weights: - obj.w2 = R.uniform(size = list(reversed(sz)), low = -range, high = range) - obj.b1 = N.zeros(hidden_size) - obj.b2 = N.zeros(input_size) - - def build_regularization(self): - return T.zero() # no regularization! - - -class SigmoidXEAutoEncoder(AutoEncoder): - - def build_hidden(self): - return NN.sigmoid(self.hidden_activation) - - def build_output(self): - return NN.sigmoid(self.output_activation) - - def build_reconstruction_cost(self): - self.reconstruction_cost_matrix = self.input * T.log(self.output) + (1.0 - self.input) * T.log(1.0 - self.output) - self.reconstruction_costs = -T.sum(self.reconstruction_cost_matrix, axis=1) - return T.sum(self.reconstruction_costs) - - def build_regularization(self): - self.l2_coef = theano.Member(T.scalar()) - if self.tie_weights: - return self.l2_coef * T.sum(self.w1 * self.w1) - else: - return self.l2_coef * T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2) - - def _instance_initialize(self, obj, input_size = None, hidden_size = None, **init): - init.setdefault('l2_coef', 0) - super(SigmoidXEAutoEncoder, self)._instance_initialize(obj, input_size, hidden_size, **init) diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/daa.py --- a/algorithms/daa.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,186 +0,0 @@ - -import theano -from theano import tensor as T -from theano.tensor import nnet as NN -import numpy as N - -from pylearn import cost as cost - -class DenoisingAA(T.RModule): - """De-noising Auto-encoder - - WRITEME - - Abstract base class. Requires subclass with functions: - - - build_corrupted_input() - - Introductory article about this model WRITEME. 
- - - """ - - def __init__(self, input = None, regularize = True, tie_weights = True, - activation_function=NN.sigmoid, reconstruction_cost_function=cost.cross_entropy): - """ - :param input: WRITEME - - :param regularize: WRITEME - - :param tie_weights: WRITEME - - :param activation_function: WRITEME - - :param reconstruction_cost: Should return one cost per example (row) - - :todo: Default noise level for all daa levels - - """ - super(DenoisingAA, self).__init__() - - # MODEL CONFIGURATION - self.regularize = regularize - self.tie_weights = tie_weights - self.activation_function = activation_function - self.reconstruction_cost_function = reconstruction_cost_function - - # ACQUIRE/MAKE INPUT - if not input: - input = T.matrix('input') - self.input = theano.External(input) - - # HYPER-PARAMETERS - self.lr = theano.Member(T.scalar()) - - # PARAMETERS - self.w1 = theano.Member(T.matrix()) - if not tie_weights: - self.w2 = theano.Member(T.matrix()) - else: - self.w2 = self.w1.T - self.b1 = theano.Member(T.vector()) - self.b2 = theano.Member(T.vector()) - - - # REGULARIZATION COST - self.regularization = self.build_regularization() - - - ### NOISELESS ### - - # HIDDEN LAYER - self.hidden_activation = T.dot(self.input, self.w1) + self.b1 - self.hidden = self.hid_activation_function(self.hidden_activation) - - # RECONSTRUCTION LAYER - self.output_activation = T.dot(self.hidden, self.w2) + self.b2 - self.output = self.out_activation_function(self.output_activation) - - # RECONSTRUCTION COST - self.reconstruction_costs = self.build_reconstruction_costs(self.output) - self.reconstruction_cost = T.mean(self.reconstruction_costs) - - # TOTAL COST - self.cost = self.reconstruction_cost - if self.regularize: - self.cost = self.cost + self.regularization - - - ### WITH NOISE ### - self.corrupted_input = self.build_corrupted_input() - - # HIDDEN LAYER - self.nhidden_activation = T.dot(self.corrupted_input, self.w1) + self.b1 - self.nhidden = self.hid_activation_function(self.nhidden_activation) - - # RECONSTRUCTION LAYER - self.noutput_activation = T.dot(self.nhidden, self.w2) + self.b2 - self.noutput = self.out_activation_function(self.noutput_activation) - - # RECONSTRUCTION COST - self.nreconstruction_costs = self.build_reconstruction_costs(self.noutput) - self.nreconstruction_cost = T.mean(self.nreconstruction_costs) - - # TOTAL COST - self.ncost = self.nreconstruction_cost - if self.regularize: - self.ncost = self.ncost + self.regularization - - - # GRADIENTS AND UPDATES - if self.tie_weights: - self.params = self.w1, self.b1, self.b2 - else: - self.params = self.w1, self.w2, self.b1, self.b2 - gradients = T.grad(self.ncost, self.params) - updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) - - # INTERFACE METHODS - self.update = theano.Method(self.input, self.ncost, updates) - self.compute_cost = theano.Method(self.input, self.cost) - self.noisify = theano.Method(self.input, self.corrupted_input) - self.reconstruction = theano.Method(self.input, self.output) - self.representation = theano.Method(self.input, self.hidden) - self.reconstruction_through_noise = theano.Method(self.input, [self.corrupted_input, self.noutput]) - - self.validate = theano.Method(self.input, [self.cost, self.output]) - - def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): - if (input_size is None) ^ (hidden_size is None): - raise ValueError("Must specify input_size and hidden_size or neither.") - super(DenoisingAA, self)._instance_initialize(obj, **init) 
- if seed is not None: - R = N.random.RandomState(seed) - else: - R = N.random - if input_size is not None: - sz = (input_size, hidden_size) - inf = 1/N.sqrt(input_size) - hif = 1/N.sqrt(hidden_size) - obj.w1 = R.uniform(size = sz, low = -inf, high = inf) - if not self.tie_weights: - obj.w2 = R.uniform(size = list(reversed(sz)), low = -hif, high = hif) - obj.b1 = N.zeros(hidden_size) - obj.b2 = N.zeros(input_size) - if seed is not None: - obj.seed(seed) - obj.__hide__ = ['params'] - - def build_regularization(self): - """ - @todo: Why do we need this function? - """ - return T.zero() # no regularization! - - -class SigmoidXEDenoisingAA(DenoisingAA): - """ - @todo: Merge this into the above. - @todo: Default noise level for all daa levels - """ - - def build_corrupted_input(self): - self.noise_level = theano.Member(T.scalar()) - return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input - - def hid_activation_function(self, activation): - return self.activation_function(activation) - - def out_activation_function(self, activation): - return self.activation_function(activation) - - def build_reconstruction_costs(self, output): - return self.reconstruction_cost_function(self.input, output) - - def build_regularization(self): - self.l2_coef = theano.Member(T.scalar()) - if self.tie_weights: - return self.l2_coef * T.sum(self.w1 * self.w1) - else: - return self.l2_coef * (T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2)) - - def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): - init.setdefault('noise_level', 0) - init.setdefault('l2_coef', 0) - super(SigmoidXEDenoisingAA, self)._instance_initialize(obj, input_size, hidden_size, seed, **init) - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/layer.py --- a/algorithms/layer.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -""" -@todo: Make a layer class, with standardized names: - input, cost, lr, and update -(a Method called update, to be more precise, whose first argument is the input) - -input_dimension, output_dimension (aliased as nin and nout) - -Modules like pylearn.algorithms.logistic_regression.Module_Nclass and -pylearn.algorithms.???.Bin_Regressor should inherit from Layer and -Stacker should assume Layer. -""" diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/logistic_regression.py --- a/algorithms/logistic_regression.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -import theano -from theano import tensor as T -from theano.tensor import nnet -from theano.compile import module -from theano import printing, pprint -from theano import compile - -import numpy as N - -class LogRegInstanceType(module.FancyModuleInstance): - def initialize(self, n_in, n_out, lr, seed): - #self.component is the LogisticRegressionTemplate instance that built this guy. - """ - @todo: Remove seed. Used only to keep Stacker happy. 
- """ - - self.w = N.zeros((n_in, n_out)) - self.b = N.zeros(n_out) - self.lr = lr - self.__hide__ = ['params'] - self.input_dimension = n_in - self.output_dimension = n_out - -class Module_Nclass(module.FancyModule): - InstanceType = LogRegInstanceType - - def __init__(self, x=None, targ=None, w=None, b=None, lr=None, regularize=False): - super(Module_Nclass, self).__init__() #boilerplate - - self.x = module.Member(x) if x is not None else T.matrix('input') - self.targ = module.Member(targ) if targ is not None else T.lvector() - - self.w = module.Member(w) if w is not None else module.Member(T.dmatrix()) - self.b = module.Member(b) if b is not None else module.Member(T.dvector()) - self.lr = module.Member(lr) if lr is not None else module.Member(T.dscalar()) - - self.params = [p for p in [self.w, self.b] if p.owner is None] - - linear_output = T.dot(self.x, self.w) + self.b - - (xent, softmax, max_pr, argmax) = nnet.crossentropy_softmax_max_and_argmax_1hot( - linear_output, self.targ) - sum_xent = T.sum(xent) - - self.softmax = softmax - self.argmax = argmax - self.max_pr = max_pr - self.sum_xent = sum_xent - - # Softmax being computed directly. - softmax_unsupervised = nnet.softmax(linear_output) - self.softmax_unsupervised = softmax_unsupervised - - #compatibility with current implementation of stacker/daa or something - #TODO: remove this, make a wrapper - self.cost = self.sum_xent - self.input = self.x - # TODO: I want to make output = linear_output. - self.output = self.softmax_unsupervised - - #define the apply method - self.pred = T.argmax(linear_output, axis=1) - self.apply = module.Method([self.input], self.pred) - - self.validate = module.Method([self.input, self.targ], [self.cost, self.argmax, self.max_pr]) - self.softmax_output = module.Method([self.input], self.softmax_unsupervised) - - if self.params: - gparams = T.grad(sum_xent, self.params) - - self.update = module.Method([self.input, self.targ], sum_xent, - updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gparams))) - -class Module(module.FancyModule): - InstanceType = LogRegInstanceType - - def __init__(self, input=None, targ=None, w=None, b=None, lr=None, regularize=False): - super(Module, self).__init__() #boilerplate - - self.input = module.Member(input) if input is not None else T.matrix('input') - self.targ = module.Member(targ) if targ is not None else T.lcol() - - self.w = module.Member(w) if w is not None else module.Member(T.dmatrix()) - self.b = module.Member(b) if b is not None else module.Member(T.dvector()) - self.lr = module.Member(lr) if lr is not None else module.Member(T.dscalar()) - - self.params = [p for p in [self.w, self.b] if p.owner is None] - - output = nnet.sigmoid(T.dot(self.x, self.w) + self.b) - xent = -self.targ * T.log(output) - (1.0 - self.targ) * T.log(1.0 - output) - sum_xent = T.sum(xent) - - self.output = output - self.xent = xent - self.sum_xent = sum_xent - self.cost = sum_xent - - #define the apply method - self.pred = (T.dot(self.input, self.w) + self.b) > 0.0 - self.apply = module.Method([self.input], self.pred) - - #if this module has any internal parameters, define an update function for them - if self.params: - gparams = T.grad(sum_xent, self.params) - self.update = module.Method([self.input, self.targ], sum_xent, - updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gparams))) - -class Learner(object): - """TODO: Encapsulate the algorithm for finding an optimal regularization coefficient""" - pass - diff -r 27b1344a57b1 -r 8fff4bc26f4c 
algorithms/regressor.py --- a/algorithms/regressor.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ - -import theano -from theano import tensor as T -from theano.tensor import nnet as NN -import numpy as N - -class Regressor(theano.FancyModule): - - def __init__(self, input = None, target = None, regularize = True): - super(Regressor, self).__init__() - - # MODEL CONFIGURATION - self.regularize = regularize - - # ACQUIRE/MAKE INPUT AND TARGET - self.input = theano.External(input) if input else T.matrix('input') - self.target = theano.External(target) if target else T.matrix('target') - - # HYPER-PARAMETERS - self.lr = theano.Member(T.scalar()) - - # PARAMETERS - self.w = theano.Member(T.matrix()) - self.b = theano.Member(T.vector()) - - # OUTPUT - self.output_activation = T.dot(self.input, self.w) + self.b - self.output = self.build_output() - - # REGRESSION COST - self.regression_cost = self.build_regression_cost() - - # REGULARIZATION COST - self.regularization = self.build_regularization() - - # TOTAL COST - self.cost = self.regression_cost - if self.regularize: - self.cost = self.cost + self.regularization - - # GRADIENTS AND UPDATES - self.params = self.w, self.b - gradients = T.grad(self.cost, self.params) - updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) - - # INTERFACE METHODS - self.update = theano.Method([self.input, self.target], self.cost, updates) - self.get_cost = theano.Method([self.input, self.target], self.cost) - self.predict = theano.Method(self.input, self.output) - - self.build_extensions() - - def _instance_initialize(self, obj, input_size = None, output_size = None, seed = None, **init): - if seed is not None: - R = N.random.RandomState(seed) - else: - R = N.random - if (input_size is None) ^ (output_size is None): - raise ValueError("Must specify input_size and output_size or neither.") - super(Regressor, self)._instance_initialize(obj, **init) - if input_size is not None: - sz = (input_size, output_size) - range = 1/N.sqrt(input_size) - obj.w = R.uniform(size = sz, low = -range, high = range) - obj.b = N.zeros(output_size) - obj.__hide__ = ['params'] - - def _instance_flops_approx(self, obj): - return obj.w.size - - def build_extensions(self): - pass - - def build_output(self): - raise NotImplementedError('override in subclass') - - def build_regression_cost(self): - raise NotImplementedError('override in subclass') - - def build_regularization(self): - return T.zero() # no regularization! 
- - -class BinRegressor(Regressor): - - def build_extensions(self): - self.classes = T.iround(self.output) - self.classify = theano.Method(self.input, self.classes) - - def build_output(self): - return NN.sigmoid(self.output_activation) - - def build_regression_cost(self): - self.regression_cost_matrix = self.target * T.log(self.output) + (1.0 - self.target) * T.log(1.0 - self.output) - self.regression_costs = -T.sum(self.regression_cost_matrix, axis=1) - return T.mean(self.regression_costs) - - def build_regularization(self): - self.l2_coef = theano.Member(T.scalar()) - return self.l2_coef * T.sum(self.w * self.w) - - def _instance_initialize(self, obj, input_size = None, output_size = 1, seed = None, **init): - init.setdefault('l2_coef', 0) - super(BinRegressor, self)._instance_initialize(obj, input_size, output_size, seed, **init) diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/sgd.py --- a/algorithms/sgd.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ - -from theano.compile import module -from theano import tensor as T - -class StochasticGradientDescent(module.FancyModule): - def __init__(self, params, gparams, lr=None): - super(StochasticGradientDescent, self).__init__() - - self.lr = lr if lr is not None else module.Member(T.dscalar()) - self.params = params - self.gparams = gparams - - self.updates = dict((p, p - self.lr * g) for p, g in zip(self.params, self.gparams)) - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/stacker.py --- a/algorithms/stacker.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - -# for example in examples: -# repr = example -# for layer in stacked.layers: -# layer.update(repr) -# repr = layer.representation(repr) - -import theano -from theano import tensor as T -import sys -import numpy as N - -class Stacker(T.RModule): - """ - @note: Assumes some names in the layers: input, cost, lr, and update - @todo: Maybe compile functions on demand, rather than immediately. - """ - - def __init__(self, submodules, input = None, regularize = False): - super(Stacker, self).__init__() - - current = input - layers = [] - for i, (submodule, outname) in enumerate(submodules): - layer = submodule(current, regularize = regularize) - layers.append(layer) - current = layer[outname] - self.layers = layers - - self.input = self.layers[0].input - self.output = current - - representation = [] - local_update = [] - global_update = [] - to_update = [] - all_kits = [] - for layer, (submodule, outname) in zip(layers, submodules): - u = layer.update - u.resolve_all() - to_update += u.updates.keys() - all_kits += u.kits - # the input is the whole deep model's input instead of the layer's own - # input (which is previous_layer[outname]) - inputs = [self.input] + u.inputs[1:] - method = theano.Method(inputs, u.outputs, u.updates, u.kits) - local_update.append(method) - global_update.append( - theano.Method(inputs, - u.outputs, - # we update the params of the previous layers too but wrt - # this layer's cost - dict((param, param - layer.lr * T.grad(layer.cost, param)) - for param in to_update), - list(all_kits))) - representation.append(theano.Method(self.input, layer[outname])) - -# @todo: Add diagnostics -# self.diagnose_from_input = Method([self.input], self.layers[0].diagnose.outputs + self.layers[1].diagnose.outputs ... 
- - self.local_update = local_update - self.global_update = global_update - self.representation = representation - self.update = self.global_update[-1] - self.compute = theano.Method(self.input, self.output) - ll = self.layers[-1] - for name, method in ll.components_map(): - if isinstance(method, theano.Method) and not hasattr(self, name): - m = method.dup() - m.resolve_all() - m.inputs = [self.input if x is ll.input else x for x in m.inputs] - setattr(self, name, m) - - def _instance_initialize(self, obj, nunits = None, lr = 0.01, seed = None, **kwargs): - super(Stacker, self)._instance_initialize(obj, **kwargs) - if seed is not None: - R = N.random.RandomState(seed) - else: - R = N.random - for layer in obj.layers: - if layer.lr is None: - layer.lr = lr - if nunits: - obj.input_dimension = nunits[0] - obj.output_dimension = nunits[-1] - if len(nunits) != len(obj.layers) + 1: - raise ValueError('You should give exactly one more unit numbers as there are layers.') - for ni, no, layer in zip(nunits[:-1], nunits[1:], obj.layers): - if seed is not None: - layer.initialize(ni, no, seed = R.random_integers(sys.maxint - 1)) - else: - layer.initialize(ni, no) - if seed is not None: - obj.seed(seed) - - def _instance_flops_approx(self, obj): - rval = 0 - for layer in obj.layers: - rval += layer.flops_approx() - return rval - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/tests/test_aa.py --- a/algorithms/tests/test_aa.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -#from __future__ import absolute_imports - -from pylearn import algorithms as models -import theano -import numpy -import time - - -def test_train(mode = theano.Mode('c|py', 'fast_run')): - - aa = models.SigmoidXEAutoEncoder(regularize = False) -# print aa.update.pretty(mode = theano.Mode('py', 'fast_run').excluding('inplace')) - - model = aa.make(lr = 0.01, - input_size = 100, - hidden_size = 1000, - mode = mode) - - data = [[0, 1, 0, 0, 1, 1, 1, 0, 1, 0]*10]*10 - #data = numpy.random.rand(10, 100) - - t1 = time.time() - for i in xrange(1001): - cost = model.update(data) - if i % 100 == 0: - print i, cost - t2 = time.time() - return t2 - t1 - -if __name__ == '__main__': - numpy.random.seed(10) - print 'optimized:' - t1 = test_train(theano.Mode('c|py', 'fast_run')) - print 'time:',t1 - print - - numpy.random.seed(10) - print 'not optimized:' - t2 = test_train(theano.Mode('c|py', 'fast_compile')) - print 'time:',t2 - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/tests/test_daa.py --- a/algorithms/tests/test_daa.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/python - -from pylearn import algorithms as models -import theano -import numpy -import time - -import pylearn.algorithms.logistic_regression - -def test_train_daa(mode = theano.Mode('c|py', 'fast_run')): - - ndaa = 3 - daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(models.BinRegressor, 'output')], - regularize = False) - - model = daa.make([4, 20, 20, 20, 1], - lr = 0.01, - mode = mode, - seed = 10) - - model.layers[0].noise_level = 0.3 - model.layers[1].noise_level = 0.3 - model.layers[2].noise_level = 0.3 - - # Update the first hidden layer - for l in range(3): - for i in range(10): - model.local_update[l]([[0, 1, 0, 1]]) - model.local_update[l]([[1, 0, 1, 0]]) - - for i in range(10): - model.update([[0, 1, 0, 1]], [[1]]) - model.update([[1, 0, 1, 0]], [[0]]) - print model.classify([[0, 1, 0, 1]]) - print model.classify([[1, 0, 1, 0]]) - - 
-def test_train_daa2(mode = theano.Mode('c|py', 'fast_run')): - - ndaa = 3 - daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')], - regularize = False) - - model = daa.make([4] + [20] * ndaa + [10], - lr = 0.01, - mode = mode, - seed = 10) - - for l in range(ndaa): model.layers[l].noise_level = 0.3 - - instances = [([[0, 1, 0, 1]], [1]), ([[1, 0, 1, 0]], [0])] - - for l in range(ndaa): - for i in range(10): - for (input, output) in instances: - model.local_update[l](input) - - for i in range(10): - for (input, output) in instances: -# model.update(input, output) - print "OLD:", - print model.validate(input, output) - oldloss = model.update(input, output) - print oldloss - print "NEW:" - print model.validate(input, output) - print - - print model.apply([[0, 1, 0, 1]]) - print model.apply([[1, 0, 1, 0]]) - - - - -if __name__ == '__main__': -# print 'optimized:' -# t1 = test_train_daa(theano.Mode('py', 'fast_compile')) -# t1 = test_train_daa(theano.Mode('c|py', 'fast_run')) -# print 'time:',t1 -# print - -# print 'not optimized:' -# t2 = test_train_daa(theano.Mode('c|py', 'fast_compile')) -## print 'time:',t2 - -# test_train_daa(theano.compile.Mode('c&py', 'merge')) -# test_train_daa(theano.compile.Mode('c|py', 'merge')) - test_train_daa(theano.compile.Mode('py', 'merge')) - - test_train_daa2(theano.compile.Mode('c|py', 'merge')) diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/tests/test_regressor.py --- a/algorithms/tests/test_regressor.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - -import models -import theano -import numpy -import time - - -def test_train(mode = theano.Mode('c|py', 'fast_run')): - - reg = models.BinRegressor(regularize = False) - - model = reg.make(lr = 0.01, - input_size = 100, - mode = mode, - seed = 10) - -# data = [[0, 1, 0, 0, 1, 1, 1, 0, 1, 0]*10]*10 -# targets = [[1]]*10 - #data = numpy.random.rand(10, 100) - - R = numpy.random.RandomState(100) - t1 = time.time() - for i in xrange(1001): - data = R.random_integers(0, 1, size = (10, 100)) - targets = data[:, 6].reshape((10, 1)) - cost = model.update(data, targets) - if i % 100 == 0: - print i, '\t', cost, '\t', 1*(targets.T == model.classify(data).T) - t2 = time.time() - return t2 - t1 - -if __name__ == '__main__': - print 'optimized:' - t1 = test_train(theano.Mode('c|py', 'fast_run')) - print 'time:',t1 - print - - print 'not optimized:' - t2 = test_train(theano.Mode('c|py', 'fast_compile')) - print 'time:',t2 - - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c algorithms/tests/test_stacker.py --- a/algorithms/tests/test_stacker.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ - -import models -import theano -import numpy -import time - - -def test_train(mode = theano.Mode('c|py', 'fast_run')): - - reg = models.Stacker([(models.BinRegressor, 'output'), (models.BinRegressor, 'output')], - regularize = False) - #print reg.global_update[1].pretty(mode = mode.excluding('inplace')) - - model = reg.make([100, 200, 1], - lr = 0.01, - mode = mode, - seed = 10) - - R = numpy.random.RandomState(100) - t1 = time.time() - for i in xrange(1001): - data = R.random_integers(0, 1, size = (10, 100)) - targets = data[:, 6].reshape((10, 1)) - cost = model.update(data, targets) - if i % 100 == 0: - print i, '\t', cost, '\t', 1*(targets.T == model.classify(data).T) - t2 = time.time() - return t2 - t1 - -if __name__ == '__main__': - print 'optimized:' - t1 = 
test_train(theano.Mode('c|py', 'fast_run')) - print 'time:',t1 - print - - print 'not optimized:' - t2 = test_train(theano.Mode('c|py', 'fast_compile')) - print 'time:',t2 - - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c amat.py --- a/amat.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -"""load PLearn AMat files""" - -import sys, numpy, array - -class AMat: - """DataSource to access a plearn amat file as a periodic unrandomized stream. - - Attributes: - - input -- all columns of input - target -- all columns of target - weight -- all columns of weight - extra -- all columns of extra - - all -- the entire data contents of the amat file - n_examples -- the number of training examples in the file - - AMat stands for Ascii Matri[x,ces] - - """ - - marker_size = '#size:' - marker_sizes = '#sizes:' - marker_col_names = '#:' - - def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout): - - """Load the amat at into memory. - - path - str: location of amat file - head - int: stop reading after this many data rows - update_interval - int: print '.' to ofile every lines - ofile - file: print status, msgs, etc. to this file - - """ - self.all = None - self.input = None - self.target = None - self.weight = None - self.extra = None - - self.header = False - self.header_size = None - self.header_rows = None - self.header_cols = None - self.header_sizes = None - self.header_col_names = [] - - data_started = False - data = array.array('d') - - f = open(path) - n_data_lines = 0 - len_float_line = None - - for i,line in enumerate(f): - if n_data_lines == head: - #we've read enough data, - # break even if there's more in the file - break - if len(line) == 0 or line == '\n': - continue - if line[0] == '#': - if not data_started: - #the condition means that the file has a header, and we're on - # some header line - self.header = True - if line.startswith(AMat.marker_size): - info = line[len(AMat.marker_size):] - self.header_size = [int(s) for s in info.split()] - self.header_rows, self.header_cols = self.header_size - if line.startswith(AMat.marker_col_names): - info = line[len(AMat.marker_col_names):] - self.header_col_names = info.split() - elif line.startswith(AMat.marker_sizes): - info = line[len(AMat.marker_sizes):] - self.header_sizes = [int(s) for s in info.split()] - else: - #the first non-commented line tells us that the header is done - data_started = True - float_line = [float(s) for s in line.split()] - if len_float_line is None: - len_float_line = len(float_line) - if (self.header_cols is not None) \ - and self.header_cols != len_float_line: - print >> sys.stderr, \ - 'WARNING: header declared %i cols but first line has %i, using %i',\ - self.header_cols, len_float_line, len_float_line - else: - if len_float_line != len(float_line): - raise IOError('wrong line length', i, line) - data.extend(float_line) - n_data_lines += 1 - - if update_interval > 0 and (ofile is not None) \ - and n_data_lines % update_interval == 0: - ofile.write('.') - ofile.flush() - - if update_interval > 0: - ofile.write('\n') - f.close() - - # convert from array.array to numpy.ndarray - nshape = (len(data) / len_float_line, len_float_line) - self.all = numpy.frombuffer(data).reshape(nshape) - self.n_examples = self.all.shape[0] - - # assign - if self.header_sizes is not None: - if len(self.header_sizes) > 4: - print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path - leftmost = 0 - #here we make use of the fact that if header_sizes has len < 4 - # 
the loop will exit before 4 iterations - attrlist = ['input', 'target', 'weight', 'extra'] - for attr, ncols in zip(attrlist, self.header_sizes): - setattr(self, attr, self.all[:, leftmost:leftmost+ncols]) - leftmost += ncols - diff -r 27b1344a57b1 -r 8fff4bc26f4c autotest.py --- a/autotest.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -import unittest, os, sys, traceback - -def test_root_dir(debugmode=False): - suite = None - filenames = os.listdir('.') - for filename in filenames: - if filename[-3:] == '.py' and filename.startswith('_test'): - #print >>sys.stderr, 'Loading', modname - modname = filename[0:-3] - - try: - module = __import__(modname) - except Exception, e: - print >>sys.stderr, "====================================================" - print >>sys.stderr, "Failed to load %s.py" % modname - print >>sys.stderr, "====================================================" - traceback.print_exc() - print >>sys.stderr, "====================================================" - continue - - tests = unittest.TestLoader().loadTestsFromModule(module) - if tests.countTestCases() > 0: - print >>sys.stderr, 'Testing', modname - if suite is None: - suite = tests - else: - suite.addTests(tests) - if suite is None: - print >>sys.stderr, "No suite found" - sys.exit(1) - if debugmode: - suite.debug() - else: - unittest.TextTestRunner(verbosity=1).run(suite) - -if __name__ == '__main__': - - def printUsage(): - print >>sys.stderr, "Bad argument: ",sys.argv - print >>sys.stderr, "only --debug is supported" - sys.exit(1) - debugparam="" - - if len(sys.argv)==2: - if sys.argv[1]=="--debug": - debugparam="--debug" - sys.argv.remove(debugparam) - else: - printUsage() - elif len(sys.argv)>2: - printUsage() - - test_root_dir(debugparam!="") - diff -r 27b1344a57b1 -r 8fff4bc26f4c cost.py --- a/cost.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -""" -Cost functions. - -@note: All of these functions return one cost per example. So it is your -job to perform a tensor.sum over the individual example losses. - -@todo: Make a Cost class, with a particular contract. - -@todo: It would be nice to implement a hinge loss, with a particular margin. -""" - -import theano.tensor as T -from xlogx import xlogx - -def quadratic(target, output, axis=1): - return T.mean(T.sqr(target - output), axis=axis) - -def cross_entropy(target, output, axis=1): - """ - @todo: This is essentially duplicated as nnet_ops.binary_crossentropy - @warning: OUTPUT and TARGET are reversed in nnet_ops.binary_crossentropy - """ - return -T.mean(target * T.log(output) + (1 - target) * T.log(1 - output), axis=axis) - -def KL_divergence(target, output): - """ - @note: We do not compute the mean, because if target and output have - different shapes then the result will be garbled. 
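# Usage sketch for the cost functions above: each returns one cost per example, so the
# caller reduces over examples (e.g. with tensor.sum) before building an update rule.
# The symbolic variables and the import path are assumptions for illustration only.
import theano.tensor as T
from cost import cross_entropy
target = T.matrix('target')
output = T.matrix('output')
per_example_cost = cross_entropy(target, output)  # one value per example (row)
total_cost = T.sum(per_example_cost)              # the reduction is left to the caller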
- """ - return -(target * T.log(output) + (1 - target) * T.log(1 - output)) \ - + (xlogx(target) + xlogx(1 - target)) -# return cross_entropy(target, output, axis) - cross_entropy(target, target, axis) diff -r 27b1344a57b1 -r 8fff4bc26f4c dataset.py --- a/dataset.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1533 +0,0 @@ - -from lookup_list import LookupList as Example -from common.misc import unique_elements_list_intersection -from string import join -from sys import maxint -import numpy, copy - -from exceptions import * - -class AttributesHolder(object): - def __init__(self): pass - - def attributeNames(self): - raise AbstractFunction() - - def setAttributes(self,attribute_names,attribute_values,make_copies=False): - """ - Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1. - """ - if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ): - attribute_values = [attribute_values] - if make_copies: - for name,value in zip(attribute_names,attribute_values): - self.__setattr__(name,copy.deepcopy(value)) - else: - for name,value in zip(attribute_names,attribute_values): - self.__setattr__(name,value) - - def getAttributes(self,attribute_names=None, return_copy=False): - """ - Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes. - """ - if attribute_names is None: - attribute_names = self.attributeNames() - if return_copy: - return [copy.copy(self.__getattribute__(name)) for name in attribute_names] - else: - return [self.__getattribute__(name) for name in attribute_names] - -class DataSet(AttributesHolder): - """A virtual base class for datasets. - - A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction - with learning algorithms (for training and testing them): rows/records are called examples, and - columns/attributes are called fields. The field value for a particular example can be an arbitrary - python object, which depends on the particular dataset. - - We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method - should return sys.maxint). - - A DataSet is a generator of iterators; these iterators can run through the - examples or the fields in a variety of ways. A DataSet need not necessarily have a finite - or known length, so this class can be used to interface to a 'stream' which - feeds on-line learning (however, as noted below, some operations are not - feasible or not recommended on streams). - - To iterate over examples, there are several possibilities: - - for example in dataset: - - for val1,val2,... in dataset: - - for example in dataset(field1, field2,field3, ...): - - for val1,val2,val3 in dataset(field1, field2,field3): - - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): - - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): - Each of these is documented below. All of these iterators are expected - to provide, in addition to the usual 'next()' method, a 'next_index()' method - which returns a non-negative integer pointing to the position of the next - example that will be returned by 'next()' (or of the first example in the - next minibatch returned). 
This is important because these iterators - can wrap around the dataset in order to do multiple passes through it, - in possibly unregular ways if the minibatch size is not a divisor of the - dataset length. - - To iterate over fields, one can do - - for field in dataset.fields(): - for field_value in field: # iterate over the values associated to that field for all the dataset examples - - for field in dataset(field1,field2,...).fields() to select a subset of fields - - for field in dataset.fields(field1,field2,...) to select a subset of fields - and each of these fields is iterable over the examples: - - for field_examples in dataset.fields(): - for example_value in field_examples: - ... - but when the dataset is a stream (unbounded length), it is not recommended to do - such things because the underlying dataset may refuse to access the different fields in - an unsynchronized ways. Hence the fields() method is illegal for streams, by default. - The result of fields() is a L{DataSetFields} object, which iterates over fields, - and whose elements are iterable over examples. A DataSetFields object can - be turned back into a DataSet with its examples() method:: - dataset2 = dataset1.fields().examples() - and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). - - Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. - - Note: The content of a field can be of any type. Field values can also be 'missing' - (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array) - fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value. - What about non-numeric values? None. - - Dataset elements can be indexed and sub-datasets (with a subset - of examples) can be extracted. These operations are not supported - by default in the case of streams. - - - dataset[:n] returns an Example with the n first examples. - - - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s. - - - dataset[i] returns an Example. - - - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in. - - A similar command gives you a DataSet instead of Examples : - - - dataset.subset[:n] returns a DataSet with the n first examples. - - - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s. - - - dataset.subset[i] returns a DataSet. - - - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in. - - - - dataset. returns the value of a property associated with - the name . The following properties should be supported: - - 'description': a textual description or name for the dataset - - 'fieldtypes': a list of types (one per field) - A DataSet may have other attributes that it makes visible to other objects. These are - used to store information that is not example-wise but global to the dataset. - The list of names of these attributes is given by the attribute_names() method. - - Datasets can be concatenated either vertically (increasing the length) or - horizontally (augmenting the set of fields), if they are compatible, using - the following operations (with the same basic semantics as numpy.hstack - and numpy.vstack): - - - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3]) - - creates a new dataset whose list of fields is the concatenation of the list of - fields of the argument datasets. This only works if they all have the same length. 
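# Sketch of the indexing behaviour described above, assuming ds is a finite DataSet such
# as the ArrayDataSet in the previous sketch: plain indexing returns Example/LookupList
# objects, while .subset wraps the same rows as a DataSet again.
one_example = ds[5]           # a single Example
first_four  = ds[0:4]         # a LookupList over examples 0..3
picked      = ds[[1, 3, 7]]   # examples 1, 3 and 7
sub_dataset = ds.subset[0:4]  # same examples, returned as a DataSet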
- - - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3]) - - creates a new dataset that concatenates the examples from the argument datasets - (and whose length is the sum of the length of the argument datasets). This only - works if they all have the same fields. - - According to the same logic, and viewing a DataSetFields object associated to - a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of - a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their - examples. - - A dataset can hold arbitrary key-value pairs that may be used to access meta-data - or other properties of the dataset or associated with the dataset or the result - of a computation stored in a dataset. These can be accessed through the [key] syntax - when key is a string (or more specifically, neither an integer, a slice, nor a list). - - A DataSet sub-class should always redefine the following methods: - - __len__ if it is not a stream - - fieldNames - - minibatches_nowrap (called by DataSet.minibatches()) - For efficiency of implementation, a sub-class might also want to redefine - - valuesHStack - - valuesVStack - - hasFields - - __getitem__ may not be feasible with some streams - - __iter__ - A sub-class should also append attributes to self._attribute_names - (the default value returned by attributeNames()). - By convention, attributes not in attributeNames() should have a name - starting with an underscore. - @todo enforce/test that convention! - """ - - numpy_vstack = lambda fieldname,values: numpy.vstack(values) - numpy_hstack = lambda fieldnames,values: numpy.hstack(values) - - def __init__(self, description=None, fieldnames=None, fieldtypes=None): - """ - @type fieldnames: list of strings - @type fieldtypes: list of python types, same length as fieldnames - @type description: string - @param description: description/name for this dataset - """ - def default_desc(): - return type(self).__name__ \ - + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" - - #self.fieldnames = fieldnames - - self.fieldtypes = fieldtypes if fieldtypes is not None \ - else [None]*1 #len(fieldnames) - - self.description = default_desc() if description is None \ - else description - self._attribute_names = ["description"] - - - attributeNames = property(lambda self: copy.copy(self._attribute_names)) - - def __contains__(self, fieldname): - return (fieldname in self.fieldNames()) \ - or (fieldname in self.attributeNames()) - - def __iter__(self): - """Supports the syntax "for i in dataset: ..." - - Using this syntax, "i" will be an Example instance (or equivalent) with - all the fields of DataSet self. Every field of "i" will give access to - a field of a single example. Fields should be accessible via - i["fielname"] or i[3] (in the order defined by the elements of the - Example returned by this iterator), but the derived class is free - to accept any type of identifier, and add extra functionality to the iterator. - - The default implementation calls the minibatches iterator and extracts the first example of each field. - """ - return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) - - def __len__(self): - """ - len(dataset) returns the number of examples in the dataset. - By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). - Sub-classes which implement finite-length datasets should redefine this method. - Some methods only make sense for finite-length datasets. 
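# Sketch of the concatenation operators from the class docstring: | concatenates fields
# (horizontal stacking) and & concatenates examples (vertical stacking).  ds1, ds2 and
# ds3 are hypothetical datasets with compatible lengths/fields.
wider  = ds1 | ds2                        # same length, union of the fields
longer = ds1 & ds3                        # same fields, examples of ds1 then of ds3
assert len(wider) == len(ds1)
assert len(longer) == len(ds1) + len(ds3)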
- """ - from sys import maxint - return maxint - - - class MinibatchToSingleExampleIterator(object): - """ - Converts the result of minibatch iterator with minibatch_size==1 into - single-example values in the result. Therefore the result of - iterating on the dataset itself gives a sequence of single examples - (whereas the result of iterating over minibatches gives in each - Example field an iterable object over the individual examples in - the minibatch). - """ - def __init__(self, minibatch_iterator): - self.minibatch_iterator = minibatch_iterator - self.minibatch = None - def __iter__(self): #makes for loop work - return self - def next(self): - size1_minibatch = self.minibatch_iterator.next() - if not self.minibatch: - names = size1_minibatch.keys() - # next lines are a hack, but there was problem when we were getting [array(327)] for instance - try: - values = [value[0] for value in size1_minibatch.values()] - except : - values = [value for value in size1_minibatch.values()] - self.minibatch = Example(names,values) - else: - self.minibatch._values = [value[0] for value in size1_minibatch.values()] - return self.minibatch - - def next_index(self): - return self.minibatch_iterator.next_index() - - class MinibatchWrapAroundIterator(object): - """ - An iterator for minibatches that handles the case where we need to wrap around the - dataset because n_batches*minibatch_size > len(dataset). It is constructed from - a dataset that provides a minibatch iterator that does not need to handle that problem. - This class is a utility for dataset subclass writers, so that they do not have to handle - this issue multiple times, nor check that fieldnames are valid, nor handle the - empty fieldnames (meaning 'use all the fields'). - """ - def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): - self.dataset=dataset - self.fieldnames=fieldnames - self.minibatch_size=minibatch_size - self.n_batches=n_batches - self.n_batches_done=0 - self.next_row=offset - self.L=len(dataset) - self.offset=offset % self.L - ds_nbatches = (self.L-self.next_row)/self.minibatch_size - if n_batches is not None: - ds_nbatches = min(n_batches,ds_nbatches) - if fieldnames: - assert dataset.hasFields(*fieldnames) - else: - self.fieldnames=dataset.fieldNames() - self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row) - - def __iter__(self): - return self - - def next_index(self): - return self.next_row - - def next(self): - if self.n_batches and self.n_batches_done==self.n_batches: - raise StopIteration - elif not self.n_batches and self.next_row ==self.L: - raise StopIteration - upper = self.next_row+self.minibatch_size - if upper <=self.L: - minibatch = self.iterator.next() - else: - if not self.n_batches: - upper=min(upper, self.L) - # if their is not a fixed number of batch, we continue to the end of the dataset. 
- # this can create a minibatch that is smaller then the minibatch_size - assert (self.L-self.next_row)<=self.minibatch_size - minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() - else: - # we must concatenate (vstack) the bottom and top parts of our minibatch - # first get the beginning of our minibatch (top of dataset) - first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() - second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next() - minibatch = Example(self.fieldnames, - [self.dataset.valuesVStack(name,[first_part[name],second_part[name]]) - for name in self.fieldnames]) - self.next_row=upper - self.n_batches_done+=1 - if upper >= self.L and self.n_batches: - self.next_row -= self.L - ds_nbatches = (self.L-self.next_row)/self.minibatch_size - if self.n_batches is not None: - ds_nbatches = min(self.n_batches,ds_nbatches) - self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, - ds_nbatches,self.next_row) - return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack, - self.dataset.valuesHStack), - minibatch.keys()) - - - minibatches_fieldnames = None - minibatches_minibatch_size = 1 - minibatches_n_batches = None - def minibatches(self, - fieldnames = minibatches_fieldnames, - minibatch_size = minibatches_minibatch_size, - n_batches = minibatches_n_batches, - offset = 0): - """ - Return an iterator that supports three forms of syntax: - - for i in dataset.minibatches(None,**kwargs): ... - - for i in dataset.minibatches([f1, f2, f3],**kwargs): ... - - for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... - - Using the first two syntaxes, "i" will be an indexable object, such as a list, - tuple, or Example instance. In both cases, i[k] is a list-like container - of a batch of current examples. In the second case, i[0] is - list-like container of the f1 field of a batch current examples, i[1] is - a list-like container of the f2 field, etc. - - Using the first syntax, all the fields will be returned in "i". - Using the third syntax, i1, i2, i3 will be list-like containers of the - f1, f2, and f3 fields of a batch of examples on each loop iteration. - - The minibatches iterator is expected to return upon each call to next() - a DataSetFields object, which is a Example (indexed by the field names) whose - elements are iterable and indexable over the minibatch examples, and which keeps a pointer to - a sub-dataset that can be used to iterate over the individual examples - in the minibatch. Hence a minibatch can be converted back to a regular - dataset or its fields can be looked at individually (and possibly iterated over). - - PARAMETERS - - fieldnames (list of any type, default None): - The loop variables i1, i2, i3 (in the example above) should contain the - f1, f2, and f3 fields of the current batch of examples. If None, the - derived class can choose a default, e.g. all fields. - - - minibatch_size (integer, default 1) - On every iteration, the variables i1, i2, i3 will have - exactly minibatch_size elements. e.g. len(i1) == minibatch_size - - @DEPRECATED n_batches : not used anywhere - - n_batches (integer, default None) - The iterator will loop exactly this many times, and then stop. If None, - the derived class can choose a default. If (-1), then the returned - iterator should support looping indefinitely. 
- - - offset (integer, default 0) - The iterator will start at example 'offset' in the dataset, rather than the default. - - Note: A list-like container is something like a tuple, list, numpy.ndarray or - any other object that supports integer indexing and slicing. - - @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete - batches only, raise StopIteration. - @ATTENTION: minibatches returns a LookupList, we can't iterate over examples on it. - - """ - #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset) - assert offset >= 0 - assert offset < len(self) - assert offset + minibatch_size -1 < len(self) - if fieldnames == None : - fieldnames = self.fieldNames() - return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - """ - This is the minibatches iterator generator that sub-classes must define. - It does not need to worry about wrapping around multiple times across the dataset, - as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called. - The next() method of the returned iterator does not even need to worry about - the termination condition (as StopIteration will be raised by DataSet.minibatches - before an improper call to minibatches_nowrap's next() is made). - That next() method can assert that its next row will always be within [0,len(dataset)). - The iterator returned by minibatches_nowrap does not need to implement - a next_index() method either, as this will be provided by MinibatchWrapAroundIterator. - """ - raise AbstractFunction() - - def is_unbounded(self): - """ - Tests whether a dataset is unbounded (e.g. a stream). - """ - return len(self)==maxint - - def hasFields(self,*fieldnames): - """ - Return true if the given field name (or field names, if multiple arguments are - given) is recognized by the DataSet (i.e. can be used as a field name in one - of the iterators). - - The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames() - method. Many datasets may store their field names in a dictionary, which would allow more efficiency. - """ - return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0 - - def fieldNames(self): - """ - Return the list of field names that are supported by the iterators, - and for which hasFields(fieldname) would return True. - """ - raise AbstractFunction() - - def __call__(self,*fieldnames): - """ - Return a dataset that sees only the fields whose name are specified. - """ - assert self.hasFields(*fieldnames) - #return self.fields(*fieldnames).examples() - fieldnames_list = list(fieldnames) - return FieldsSubsetDataSet(self,fieldnames_list) - - def cached_fields_subset(self,*fieldnames) : - """ - Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached. - @see : dataset.__call__ - """ - assert self.hasFields(*fieldnames) - return self.fields(*fieldnames).examples() - - def fields(self,*fieldnames): - """ - Return a DataSetFields object associated with this dataset. - """ - return DataSetFields(self,fieldnames) - - def getitem_key(self, fieldname): - """A not-so-well thought-out place to put code that used to be in - getitem. - """ - #removing as per discussion June 4. 
--JSB - - i = fieldname - # else check for a fieldname - if self.hasFields(i): - return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0] - # else we are trying to access a property of the dataset - assert i in self.__dict__ # else it means we are trying to access a non-existing property - return self.__dict__[i] - - def __getitem__(self,i): - """ - @rtype: Example - @returns: single or multiple examples - - @type i: integer or slice or of integers - @param i: - dataset[i] returns the (i+1)-th example of the dataset. - dataset[i:j] returns a LookupList with examples i,i+1,...,j-1. - dataset[i:j:s] returns a LookupList with examples i,i+2,i+4...,j-2. - dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in. - - @note: - Some stream datasets may be unable to implement random access, i.e. - arbitrary slicing/indexing because they can only iterate through - examples one or a minibatch at a time and do not actually store or keep - past (or future) examples. - - The default implementation of getitem uses the minibatches iterator - to obtain one example, one slice, or a list of examples. It may not - always be the most efficient way to obtain the result, especially if - the data are actually stored in a memory array. - """ - - if type(i) is int: - assert i >= 0 # TBM: see if someone complains and want negative i - if i >= len(self) : - raise IndexError - i_batch = self.minibatches_nowrap(self.fieldNames(), - minibatch_size=1, n_batches=1, offset=i) - return DataSet.MinibatchToSingleExampleIterator(i_batch).next() - - #if i is a contiguous slice - if type(i) is slice and (i.step in (None, 1)): - offset = 0 if i.start is None else i.start - upper_bound = len(self) if i.stop is None else i.stop - upper_bound = min(len(self) , upper_bound) - #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(), - # minibatch_size=upper_bound - offset, - # n_batches=1, - # offset=offset).next()) - # now returns a LookupList - return self.minibatches_nowrap(self.fieldNames(), - minibatch_size=upper_bound - offset, - n_batches=1, - offset=offset).next() - - # if slice has a step param, convert it to list and handle it with the - # list code - if type(i) is slice: - offset = 0 if i.start is None else i.start - upper_bound = len(self) if i.stop is None else i.stop - upper_bound = min(len(self) , upper_bound) - i = list(range(offset, upper_bound, i.step)) - - # handle tuples, arrays, lists - if hasattr(i, '__getitem__'): - for idx in i: - #dis-allow nested slices - if not isinstance(idx, int): - raise TypeError(idx) - if idx >= len(self) : - raise IndexError - # call back into self.__getitem__ - examples = [self.minibatches_nowrap(self.fieldNames(), - minibatch_size=1, n_batches=1, offset=ii).next() - for ii in i] - # re-index the fields in each example by field instead of by example - field_values = [[] for blah in self.fieldNames()] - for e in examples: - for f,v in zip(field_values, e): - f.append(v) - #build them into a LookupList (a.ka. Example) - zz = zip(self.fieldNames(),field_values) - vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz] - example = Example(self.fieldNames(), vst) - #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack) - # now returns a LookupList - return example - - # what in the world is i? 
- raise TypeError(i, type(i)) - - - """ - Enables the call dataset.subset[a:b:c] that will return a DataSet - around the examples returned by __getitem__(slice(a,b,c)) - - @SEE DataSet.__getsubset(self) - """ - subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet") - - - def __getsubset(self) : - """ - Enables the call data.subset[a:b:c], returns a DataSet. - Default implementation is a simple wrap around __getitem__() using MinibatchDataSet. - - @RETURN DataSet - @SEE DataSet.subset = property(lambda s : s.__getsubset()) - """ - _self = self - class GetSliceReturnsDataSet(object) : - def __getitem__(self,slice) : - return MinibatchDataSet(_self.__getitem__(slice)) - return GetSliceReturnsDataSet() - - - - def valuesHStack(self,fieldnames,fieldvalues): - """ - Return a value that corresponds to concatenating (horizontally) several field values. - This can be useful to merge some fields. The implementation of this operation is likely - to involve a copy of the original values. When the values are numpy arrays, the - result should be numpy.hstack(values). If it makes sense, this operation should - work as well when each value corresponds to multiple examples in a minibatch - e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix, - then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values). - The default is to use numpy.hstack for numpy.ndarray values, and a list - pointing to the original values for other data types. - """ - all_numpy=True - for value in fieldvalues: - if not type(value) is numpy.ndarray: - all_numpy=False - if all_numpy: - return numpy.hstack(fieldvalues) - # the default implementation of horizontal stacking is to put values in a list - return fieldvalues - - def valuesVStack(self,fieldname,values): - """ - @param fieldname: the name of the field from which the values were taken - @type fieldname: any type - - @param values: bits near the beginning or end of the dataset - @type values: list of minibatches (returned by minibatches_nowrap) - - @return: the concatenation (stacking) of the values - @rtype: something suitable as a minibatch field - """ - rval = [] - for v in values: - rval.extend(v) - return rval - - def __or__(self,other): - """ - dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of - fields of the argument datasets. This only works if they all have the same length. - """ - return HStackedDataSet([self,other]) - - def __and__(self,other): - """ - dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets - (and whose length is the sum of the length of the argument datasets). This only - works if they all have the same fields. - """ - return VStackedDataSet([self,other]) - -def hstack(datasets): - """ - hstack(dataset1,dataset2,...) returns dataset1 | datataset2 | ... - which is a dataset whose fields list is the concatenation of the fields - of the individual datasets. - """ - assert len(datasets)>0 - if len(datasets)==1: - return datasets[0] - return HStackedDataSet(datasets) - -def vstack(datasets): - """ - vstack(dataset1,dataset2,...) returns dataset1 & datataset2 & ... - which is a dataset which iterates first over the examples of dataset1, then - over those of dataset2, etc. - """ - assert len(datasets)>0 - if len(datasets)==1: - return datasets[0] - return VStackedDataSet(datasets) - -class FieldsSubsetDataSet(DataSet): - """ - A sub-class of L{DataSet} that selects a subset of the fields. 
- """ - def __init__(self,src,fieldnames): - self.src=src - self.fieldnames=fieldnames - assert src.hasFields(*fieldnames) - self.valuesHStack = src.valuesHStack - self.valuesVStack = src.valuesVStack - - def __len__(self): return len(self.src) - - def fieldNames(self): - return self.fieldnames - - def __iter__(self): - class FieldsSubsetIterator(object): - def __init__(self,ds): - self.ds=ds - self.src_iter=ds.src.__iter__() - self.example=None - def __iter__(self): return self - def next(self): - complete_example = self.src_iter.next() - if self.example: - self.example._values=[complete_example[field] - for field in self.ds.fieldnames] - else: - self.example=Example(self.ds.fieldnames, - [complete_example[field] for field in self.ds.fieldnames]) - return self.example - return FieldsSubsetIterator(self) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - assert self.hasFields(*fieldnames) - return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) - def dontuse__getitem__(self,i): - return FieldsSubsetDataSet(self.src[i],self.fieldnames) - -class RenamedFieldsDataSet(DataSet): - """ - A sub-class of L{DataSet} that selects and renames a subset of the fields. - """ - def __init__(self,src,src_fieldnames,new_fieldnames): - self.src=src - self.src_fieldnames=src_fieldnames - self.new_fieldnames=new_fieldnames - assert src.hasFields(*src_fieldnames) - assert len(src_fieldnames)==len(new_fieldnames) - self.valuesHStack = src.valuesHStack - self.valuesVStack = src.valuesVStack - self.lookup_fields = Example(new_fieldnames,src_fieldnames) - - def __len__(self): return len(self.src) - - def fieldNames(self): - return self.new_fieldnames - - def __iter__(self): - class FieldsSubsetIterator(object): - def __init__(self,ds): - self.ds=ds - self.src_iter=ds.src.__iter__() - self.example=None - def __iter__(self): return self - def next(self): - complete_example = self.src_iter.next() - if self.example: - self.example._values=[complete_example[field] - for field in self.ds.src_fieldnames] - else: - self.example=Example(self.ds.new_fieldnames, - [complete_example[field] - for field in self.ds.src_fieldnames]) - return self.example - return FieldsSubsetIterator(self) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - assert self.hasFields(*fieldnames) - cursor = Example(fieldnames,[0]*len(fieldnames)) - for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset): - cursor._values=batch._values - yield cursor - - def __getitem__(self,i): -# return FieldsSubsetDataSet(self.src[i],self.new_fieldnames) - complete_example = self.src[i] - return Example(self.new_fieldnames, - [complete_example[field] - for field in self.src_fieldnames]) - - - -class DataSetFields(Example): - """ - Although a L{DataSet} iterates over examples (like rows of a matrix), an associated - DataSetFields iterates over fields (like columns of a matrix), and can be understood - as a transpose of the associated dataset. - - To iterate over fields, one can do - * for fields in dataset.fields() - * for fields in dataset(field1,field2,...).fields() to select a subset of fields - * for fields in dataset.fields(field1,field2,...) to select a subset of fields - and each of these fields is iterable over the examples: - * for field_examples in dataset.fields(): - for example_value in field_examples: - ... 
- but when the dataset is a stream (unbounded length), it is not recommended to do - such things because the underlying dataset may refuse to access the different fields in - an unsynchronized ways. Hence the fields() method is illegal for streams, by default. - The result of fields() is a DataSetFields object, which iterates over fields, - and whose elements are iterable over examples. A DataSetFields object can - be turned back into a DataSet with its examples() method: - dataset2 = dataset1.fields().examples() - and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). - - DataSetFields can be concatenated vertically or horizontally. To be consistent with - the syntax used for DataSets, the | concatenates the fields and the & concatenates - the examples. - """ - def __init__(self,dataset,fieldnames): - original_dataset=dataset - if not fieldnames: - fieldnames=dataset.fieldNames() - elif not list(fieldnames)==list(dataset.fieldNames()): - #we must cast to list, othersize('x','y')!=['x','y'] - dataset = FieldsSubsetDataSet(dataset,fieldnames) - assert dataset.hasFields(*fieldnames) - self.dataset=dataset - - if isinstance(dataset,MinibatchDataSet): - Example.__init__(self,fieldnames,list(dataset._fields)) - elif isinstance(original_dataset,MinibatchDataSet): - Example.__init__(self,fieldnames, - [original_dataset._fields[field] - for field in fieldnames]) - else: - minibatch_iterator = dataset.minibatches(fieldnames, - minibatch_size=len(dataset), - n_batches=1) - minibatch=minibatch_iterator.next() - Example.__init__(self,fieldnames,minibatch) - - def examples(self): - return self.dataset - - def __or__(self,other): - """ - fields1 | fields2 is a DataSetFields that whose list of examples is the concatenation - of the list of examples of DataSetFields fields1 and fields2. - """ - return (self.examples() + other.examples()).fields() - - def __and__(self,other): - """ - fields1 + fields2 is a DataSetFields that whose list of fields is the concatenation - of the fields of DataSetFields fields1 and fields2. - """ - return (self.examples() | other.examples()).fields() - - -class MinibatchDataSet(DataSet): - """ - Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset. - Each element of the lookup-list should be an iterable and sliceable, all of the same length. - """ - def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, - values_hstack=DataSet().valuesHStack): - """ - The user can (and generally should) also provide values_vstack(fieldname,fieldvalues) - and a values_hstack(fieldnames,fieldvalues) functions behaving with the same - semantics as the DataSet methods of the same name (but without the self argument). 
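# Minimal sketch of building a MinibatchDataSet directly from same-length field values,
# as described in the class docstring above.  The arrays and field names are hypothetical.
import numpy
x = numpy.random.rand(5, 3)
y = numpy.random.rand(5, 1)
mbds = MinibatchDataSet(Example(['x', 'y'], [x, y]))  # every field must have length 5
assert len(mbds) == 5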
- """ - - self._fields=fields_lookuplist - assert len(fields_lookuplist)>0 - self.length=len(fields_lookuplist[0]) - for field in fields_lookuplist[1:]: - if self.length != len(field) : - print 'self.length = ',self.length - print 'len(field) = ', len(field) - print 'self._fields.keys() = ', self._fields.keys() - print 'field=',field - print 'fields_lookuplist=', fields_lookuplist - assert self.length==len(field) - self.valuesVStack=values_vstack - self.valuesHStack=values_hstack - - def __len__(self): - return self.length - - def dontuse__getitem__(self,i): - if type(i) in (slice,list): - return DataSetFields(MinibatchDataSet( - Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) - if type(i) is int: - return Example(self._fields.keys(),[field[i] for field in self._fields]) - if self.hasFields(i): - return self._fields[i] - assert i in self.__dict__ # else it means we are trying to access a non-existing property - return self.__dict__[i] - - def fieldNames(self): - return self._fields.keys() - - def hasFields(self,*fieldnames): - for fieldname in fieldnames: - if fieldname not in self._fields.keys(): - return False - return True - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - #@TODO bug somewhere here, fieldnames doesnt seem to be well handled - class Iterator(object): - def __init__(self,ds,fieldnames): - # tbm: added two next lines to handle fieldnames - if fieldnames is None: fieldnames = ds._fields.keys() - self.fieldnames = fieldnames - - self.ds=ds - self.next_example=offset - assert minibatch_size >= 0 - if offset+minibatch_size > ds.length: - raise NotImplementedError() - def __iter__(self): - return self - def next(self): - upper = self.next_example+minibatch_size - if upper > len(self.ds) : - raise StopIteration() - assert upper<=len(self.ds) # instead of self.ds.length - #minibatch = Example(self.ds._fields.keys(), - # [field[self.next_example:upper] - # for field in self.ds._fields]) - # tbm: modif to use fieldnames - values = [] - for f in self.fieldnames : - #print 'we have field',f,'in fieldnames' - values.append( self.ds._fields[f][self.next_example:upper] ) - minibatch = Example(self.fieldnames,values) - #print minibatch - self.next_example+=minibatch_size - return minibatch - - # tbm: added fieldnames to handle subset of fieldnames - return Iterator(self,fieldnames) - -class HStackedDataSet(DataSet): - """ - A L{DataSet} that wraps several datasets and shows a view that includes all their fields, - i.e. whose list of fields is the concatenation of their lists of fields. - - If a field name is found in more than one of the datasets, then either an error is - raised or the fields are renamed (either by prefixing the __name__ attribute - of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). - - @todo: automatically detect a chain of stacked datasets due to A | B | C | D ... - """ - def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): - DataSet.__init__(self,description,field_types) - self.datasets=datasets - self.accept_nonunique_names=accept_nonunique_names - self.fieldname2dataset={} - - def rename_field(fieldname,dataset,i): - if hasattr(dataset,"__name__"): - return dataset.__name__ + "." 
+ fieldname - return fieldname+"."+str(i) - - # make sure all datasets have the same length and unique field names - self.length=None - names_to_change=[] - for i in xrange(len(datasets)): - dataset = datasets[i] - length=len(dataset) - if self.length: - assert self.length==length - else: - self.length=length - for fieldname in dataset.fieldNames(): - if fieldname in self.fieldname2dataset: # name conflict! - if accept_nonunique_names: - fieldname=rename_field(fieldname,dataset,i) - names2change.append((fieldname,i)) - else: - raise ValueError("Incompatible datasets: non-unique field name = "+fieldname) - self.fieldname2dataset[fieldname]=i - for fieldname,i in names_to_change: - del self.fieldname2dataset[fieldname] - self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i - - def __len__(self): - return len(self.datasets[0]) - - def hasFields(self,*fieldnames): - for fieldname in fieldnames: - if not fieldname in self.fieldname2dataset: - return False - return True - - def fieldNames(self): - return self.fieldname2dataset.keys() - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - - class HStackedIterator(object): - def __init__(self,hsds,iterators): - self.hsds=hsds - self.iterators=iterators - def __iter__(self): - return self - def next(self): - # concatenate all the fields of the minibatches - l=Example() - for iter in self.iterators: - l.append_lookuplist(iter.next()) - return l - - assert self.hasFields(*fieldnames) - # find out which underlying datasets are necessary to service the required fields - # and construct corresponding minibatch iterators - if fieldnames and fieldnames!=self.fieldNames(): - datasets=set([]) - fields_in_dataset=dict([(dataset,[]) for dataset in datasets]) - for fieldname in fieldnames: - dataset=self.datasets[self.fieldname2dataset[fieldname]] - datasets.add(dataset) - fields_in_dataset[dataset].append(fieldname) - datasets=list(datasets) - iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset) - for dataset in datasets] - else: - datasets=self.datasets - iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets] - return HStackedIterator(self,iterators) - - - def untested_valuesVStack(self,fieldname,fieldvalues): - return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) - - def untested_valuesHStack(self,fieldnames,fieldvalues): - """ - We will use the sub-dataset associated with the first fieldname in the fieldnames list - to do the work, hoping that it can cope with the other values (i.e. won't care - about the incompatible fieldnames). Hence this heuristic will always work if - all the fieldnames are of the same sub-dataset. - """ - return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) - -class VStackedDataSet(DataSet): - """ - A L{DataSet} that wraps several datasets and shows a view that includes all their examples, - in the order provided. This clearly assumes that they all have the same field names - and all (except possibly the last one) are of finite length. - - @todo: automatically detect a chain of stacked datasets due to A + B + C + D ... 
- """ - def __init__(self,datasets): - self.datasets=datasets - self.length=0 - self.index2dataset={} - assert len(datasets)>0 - fieldnames = datasets[-1].fieldNames() - self.datasets_start_row=[] - # We use this map from row index to dataset index for constant-time random access of examples, - # to avoid having to search for the appropriate dataset each time and slice is asked for. - for dataset,k in enumerate(datasets[0:-1]): - assert dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length). - L=len(dataset) - for i in xrange(L): - self.index2dataset[self.length+i]=k - self.datasets_start_row.append(self.length) - self.length+=L - assert dataset.fieldNames()==fieldnames - self.datasets_start_row.append(self.length) - self.length+=len(datasets[-1]) - # If length is very large, we should use a more memory-efficient mechanism - # that does not store all indices - if self.length>1000000: - # 1 million entries would require about 60 meg for the index2dataset map - # TODO - print "A more efficient mechanism for index2dataset should be implemented" - - def __len__(self): - return self.length - - def fieldNames(self): - return self.datasets[0].fieldNames() - - def hasFields(self,*fieldnames): - return self.datasets[0].hasFields(*fieldnames) - - def locate_row(self,row): - """Return (dataset_index, row_within_dataset) for global row number""" - dataset_index = self.index2dataset[row] - row_within_dataset = self.datasets_start_row[dataset_index] - return dataset_index, row_within_dataset - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - - class VStackedIterator(object): - def __init__(self,vsds): - self.vsds=vsds - self.next_row=offset - self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset) - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[0],offset,n_batches) - - def next_iterator(self,dataset,starting_offset,batches_left): - L=len(dataset) - ds_nbatches = (L-starting_offset)/minibatch_size - if batches_left is not None: - ds_nbatches = max(batches_left,ds_nbatches) - if minibatch_size>L: - ds_minibatch_size=L - n_left_in_mb=minibatch_size-L - ds_nbatches=1 - else: - n_left_in_mb=0 - return dataset.minibatches(fieldnames,minibatch_size,ds_nbatches,starting_offset), \ - L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb - - def move_to_next_dataset(self): - if self.n_left_at_the_end_of_ds>0: - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[self.next_dataset_index], - self.n_left_at_the_end_of_ds,1) - else: - self.next_dataset_index +=1 - if self.next_dataset_index==len(self.vsds.datasets): - self.next_dataset_index = 0 - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[self.next_dataset_index],starting_offset,n_batches) - - def __iter__(self): - return self - - def next(self): - dataset=self.vsds.datasets[self.next_dataset_index] - mb = self.next_iterator.next() - if self.n_left_in_mb: - extra_mb = [] - while self.n_left_in_mb>0: - self.move_to_next_dataset() - extra_mb.append(self.next_iterator.next()) - mb = Example(fieldnames, - [dataset.valuesVStack(name, - [mb[name]]+[b[name] for b in extra_mb]) - for name in fieldnames]) - - self.next_row+=minibatch_size - self.next_dataset_row+=minibatch_size - if self.next_row+minibatch_size>len(dataset): - self.move_to_next_dataset() - return examples - return 
VStackedIterator(self) - -class ArrayFieldsDataSet(DataSet): - """ - Virtual super-class of datasets whose field values are numpy array, - thus defining valuesHStack and valuesVStack for sub-classes. - """ - def __init__(self,description=None,field_types=None): - DataSet.__init__(self,description,field_types) - def untested_valuesHStack(self,fieldnames,fieldvalues): - """Concatenate field values horizontally, e.g. two vectors - become a longer vector, two matrices become a wider matrix, etc.""" - return numpy.hstack(fieldvalues) - def untested_valuesVStack(self,fieldname,values): - """Concatenate field values vertically, e.g. two vectors - become a two-row matrix, two matrices become a longer matrix, etc.""" - return numpy.vstack(values) - - - -class NArraysDataSet(ArrayFieldsDataSet) : - """ - An NArraysDataSet stores fields that are numpy tensor, whose first axis - iterates over examples. It's a generalization of ArrayDataSet. - """ - #@TODO not completely implemented yet - def __init__(self, data_arrays, fieldnames, **kwargs) : - """ - Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list - of fieldnames. The number of arrays must be the same as the number of - fieldnames. Each set of numpy tensor must have the same first dimension (first - axis) corresponding to the number of examples. - - Every tensor is treated as a numpy array (using numpy.asarray) - """ - ArrayFieldsDataSet.__init__(self,**kwargs) - assert len(data_arrays) == len(fieldnames) - assert len(fieldnames) > 0 - ndarrays = [numpy.asarray(a) for a in data_arrays] - lens = [a.shape[0] for a in ndarrays] - num_examples = lens[0] #they must all be equal anyway - self._fieldnames = fieldnames - for k in ndarrays : - assert k.shape[0] == num_examples - self._datas = ndarrays - # create dict - self.map_field_idx = dict() - for k in range(len(fieldnames)): - self.map_field_idx[fieldnames[k]] = k - - - def __len__(self) : - """ - Length of the dataset is based on the first array = data_arrays[0], using its shape - """ - return self._datas[0].shape[0] - - def fieldNames(self) : - """ - Returns the fieldnames as set in self.__init__ - """ - return self._fieldnames - - def field_pos(self,fieldname) : - """ - Returns the index of a given fieldname. Fieldname must exists! see fieldNames(). - """ - return self.map_field_idx[fieldname] - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - cursor = Example(fieldnames,[0]*len(fieldnames)) - fieldnames = self.fieldNames() if fieldnames is None else fieldnames - for n in xrange(n_batches): - if offset == len(self): - break - for f in range(len(cursor._names)) : - idx = self.field_pos(cursor._names[f]) - sub_data = self._datas[idx][offset : offset+minibatch_size] - cursor._values[f] = sub_data - offset += len(sub_data) #can be less than minibatch_size at end - yield cursor - - #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - - - -class ArrayDataSet(ArrayFieldsDataSet): - """ - An ArrayDataSet stores the fields as groups of columns in a numpy tensor, - whose first axis iterates over examples, second axis determines fields. - If the underlying array is N-dimensional (has N axes), then the field - values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). - """ - - def __init__(self, data_array, fields_columns, **kwargs): - """ - Construct an ArrayDataSet from the underlying numpy array (data) and - a map (fields_columns) from fieldnames to field columns. 
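# Sketch of the NArraysDataSet constructor described above: one numpy array per field,
# all sharing the same first (example) dimension.  The arrays and field names are
# hypothetical.
import numpy
images = numpy.random.rand(100, 784)
labels = numpy.random.randint(0, 10, size=(100,))
nds = NArraysDataSet([images, labels], ['image', 'label'])
assert len(nds) == 100
assert nds.fieldNames() == ['image', 'label']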
The columns of a field are specified - using the standard arguments for indexing/slicing: integer for a column index, - slice for an interval of columns (with possible stride), or iterable of column indices. - """ - ArrayFieldsDataSet.__init__(self, **kwargs) - self.data=data_array - self.fields_columns=fields_columns - - # check consistency and complete slices definitions - for fieldname, fieldcolumns in self.fields_columns.items(): - if type(fieldcolumns) is int: - assert fieldcolumns>=0 and fieldcolumns=0 and i=self.l: - raise StopIteration - sub_data = self.dataset.data[self.current] - self.minibatch._values = [sub_data[c] for c in self.columns] - - self.current+=1 - return self.minibatch - - return ArrayDataSetIteratorIter(self,self.fieldNames()) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - cursor = Example(fieldnames,[0]*len(fieldnames)) - fieldnames = self.fieldNames() if fieldnames is None else fieldnames - if n_batches == None: - n_batches = (len(self) - offset) / minibatch_size - for n in xrange(n_batches): - if offset == len(self): - break - sub_data = self.data[offset : offset+minibatch_size] - offset += len(sub_data) #can be less than minibatch_size at end - cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names] - yield cursor - - #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - -class CachedDataSet(DataSet): - """ - Wrap a L{DataSet} whose values are computationally expensive to obtain - (e.g. because they involve some computation, or disk access), - so that repeated accesses to the same example are done cheaply, - by caching every example value that has been accessed at least once. - - Optionally, for finite-length dataset, all the values can be computed - (and cached) upon construction of the CachedDataSet, rather at the - first access. - - @todo: when cache_all_upon_construction create mini-batches that are as - large as possible but not so large as to fill up memory. - - @todo: add disk-buffering capability, so that when the cache becomes too - big for memory, we cache things on disk, trying to keep in memory only - the record most likely to be accessed next. - """ - def __init__(self,source_dataset,cache_all_upon_construction=False): - self.source_dataset=source_dataset - self.cache_all_upon_construction=cache_all_upon_construction - self.cached_examples = [] - if cache_all_upon_construction: - # this potentially brings all the source examples - # into memory at once, which may be too much - # the work could possibly be done by minibatches - # that are as large as possible but no more than what memory allows. 
- # - # field_values is supposed to be an DataSetFields, that inherits from LookupList - #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next() - fields_values = DataSetFields(source_dataset,None) - assert all([len(self)==len(field_values) for field_values in fields_values]) - for example in fields_values.examples(): - self.cached_examples.append(copy.copy(example)) - - self.fieldNames = source_dataset.fieldNames - self.hasFields = source_dataset.hasFields - self.valuesHStack = source_dataset.valuesHStack - self.valuesVStack = source_dataset.valuesVStack - - def __len__(self): - return len(self.source_dataset) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class CacheIterator(object): - def __init__(self,dataset): - self.dataset=dataset - self.current=offset - self.all_fields = self.dataset.fieldNames()==fieldnames - self.n_batches = n_batches - self.batch_counter = 0 - def __iter__(self): return self - def next(self): - self.batch_counter += 1 - if self.n_batches and self.batch_counter > self.n_batches : - raise StopIteration() - upper = self.current+minibatch_size - if upper > len(self.dataset.source_dataset): - raise StopIteration() - cache_len = len(self.dataset.cached_examples) - if upper>cache_len: # whole minibatch is not already in cache - # cache everything from current length to upper - #for example in self.dataset.source_dataset[cache_len:upper]: - for example in self.dataset.source_dataset.subset[cache_len:upper]: - self.dataset.cached_examples.append(example) - all_fields_minibatch = Example(self.dataset.fieldNames(), - zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size])) - - self.current+=minibatch_size - if self.all_fields: - return all_fields_minibatch - return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames]) - return CacheIterator(self) - - def dontuse__getitem__(self,i): - if type(i)==int and len(self.cached_examples)>i: - return self.cached_examples[i] - else: - return self.source_dataset[i] - - def __iter__(self): - class CacheIteratorIter(object): - def __init__(self,dataset): - self.dataset=dataset - self.l = len(dataset) - self.current = 0 - self.fieldnames = self.dataset.fieldNames() - self.example = Example(self.fieldnames,[0]*len(self.fieldnames)) - def __iter__(self): return self - def next(self): - if self.current>=self.l: - raise StopIteration - cache_len = len(self.dataset.cached_examples) - if self.current>=cache_len: # whole minibatch is not already in cache - # cache everything from current length to upper - self.dataset.cached_examples.append( - self.dataset.source_dataset[self.current]) - self.example._values = self.dataset.cached_examples[self.current] - self.current+=1 - return self.example - - return CacheIteratorIter(self) - -class ApplyFunctionDataSet(DataSet): - """ - A L{DataSet} that contains as fields the results of applying a - given function example-wise or minibatch-wise to all the fields of - an input dataset. The output of the function should be an iterable - (e.g. a list or a LookupList) over the resulting values. - - The function take as input the fields of the dataset, not the examples. - - In minibatch mode, the function is expected to work on minibatches - (takes a minibatch in input and returns a minibatch in output). More - precisely, it means that each element of the input or output list - should be iterable and indexable over the individual example values - (typically these elements will be numpy arrays). 
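# Sketch combining the two wrappers above: ApplyFunctionDataSet applies a function to the
# fields (here in minibatch mode, so it receives and returns arrays), and CachedDataSet
# memoizes the results so that repeated access is cheap.  base_ds is a hypothetical
# dataset whose two fields are passed to the function in order.
def normalize(x, y):                        # receives one minibatch per input field
    return (x - x.mean(axis=0), y)          # returns one iterable per output field
processed = ApplyFunctionDataSet(base_ds, normalize, ['x_norm', 'y'], minibatch_mode=True)
cached = CachedDataSet(processed)           # values are computed once, then cached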
All of the elements - in the input and output lists should have the same length, which is - the length of the minibatch. - - The function is applied each time an example or a minibatch is accessed. - To avoid re-doing computation, wrap this dataset inside a CachedDataSet. - - If the values_{h,v}stack functions are not provided, then - the input_dataset.values{H,V}Stack functions are used by default. - - """ - - def __init__(self,input_dataset,function,output_names,minibatch_mode=True, - values_hstack=None,values_vstack=None, - description=None,fieldtypes=None): - """ - Constructor takes an input dataset that has as many fields as the function - expects as inputs. The resulting dataset has as many fields as the function - produces as outputs, and that should correspond to the number of output names - (provided in a list). - - Note that the expected semantics of the function differs in minibatch mode - (it takes minibatches of inputs and produces minibatches of outputs, as - documented in the class comment). - - TBM: are fieldtypes the old field types (from input_dataset) or the new ones - (for the new dataset created)? - """ - self.input_dataset=input_dataset - self.function=function - self.output_names=output_names - #print 'self.output_names in afds:', self.output_names - #print 'length in afds:', len(self.output_names) - self.minibatch_mode=minibatch_mode - DataSet.__init__(self,description,fieldtypes) - self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack - self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - - def __len__(self): - return len(self.input_dataset) - - def fieldNames(self): - return self.output_names - - def minibatches_nowrap(self, fieldnames, *args, **kwargs): - all_input_fieldNames = self.input_dataset.fieldNames() - mbnw = self.input_dataset.minibatches_nowrap - - for input_fields in mbnw(all_input_fieldNames, *args, **kwargs): - if self.minibatch_mode: - all_output_fields = self.function(*input_fields) - else: - input_examples = zip(*input_fields) #makes so that [i] means example i - output_examples = [self.function(*input_example) - for input_example in input_examples] - all_output_fields = zip(*output_examples) - - #print 'output_names=', self.output_names - #print 'all_output_fields', all_output_fields - #print 'len(all_output_fields)=', len(all_output_fields) - all_outputs = Example(self.output_names, all_output_fields) - if fieldnames==self.output_names: - rval = all_outputs - else: - rval = Example(fieldnames,[all_outputs[name] for name in fieldnames]) - #print 'rval', rval - #print '--------' - yield rval - - def untested__iter__(self): # only implemented for increased efficiency - class ApplyFunctionSingleExampleIterator(object): - def __init__(self,output_dataset): - self.current=0 - self.output_dataset=output_dataset - self.input_iterator=output_dataset.input_dataset.__iter__() - def __iter__(self): return self - def next(self): - if self.output_dataset.minibatch_mode: - function_inputs = [[input] for input in self.input_iterator.next()] - outputs = self.output_dataset.function(*function_inputs) - assert all([hasattr(output,'__iter__') for output in outputs]) - function_outputs = [output[0] for output in outputs] - else: - function_inputs = self.input_iterator.next() - function_outputs = self.output_dataset.function(*function_inputs) - return Example(self.output_dataset.output_names,function_outputs) - return ApplyFunctionSingleExampleIterator(self) - -def 
supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): - """ - Wraps an arbitrary L{DataSet} into one for supervised learning tasks - by forcing the user to define a set of fields as the 'input' field - and a set of fields as the 'target' field. Optionally, a single - weight_field can also be defined. - """ - args = ((input_fields,'input'),(output_fields,'target')) - if weight_field: args+=(([weight_field],'weight')) - return src_dataset.merge_fields(*args) - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/MNIST.py --- a/datasets/MNIST.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -""" -Various routines to load/access MNIST data. -""" -from __future__ import absolute_import - -import os -import numpy - -from ..amat import AMat -from .config import data_root - -def head(n=10, path=None): - """Load the first MNIST examples. - - Returns two matrices: x, y. x has N rows of 784 columns. Each row of x represents the - 28x28 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] - is the label of the i'th row of x. - - """ - path = os.path.join(data_root(), 'mnist','mnist_with_header.amat') if path is None else path - - dat = AMat(path=path, head=n) - - try: - assert dat.input.shape[0] == n - assert dat.target.shape[0] == n - except Exception , e: - raise Exception("failed to read MNIST data", (dat, e)) - - return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0]) - -def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None): - all_x, all_targ = head(ntrain+nvalid+ntest, path=path) - - train = all_x[0:ntrain], all_targ[0:ntrain] - valid = all_x[ntrain:ntrain+nvalid], all_targ[ntrain:ntrain+nvalid] - test = all_x[ntrain+nvalid:ntrain+nvalid+ntest], all_targ[ntrain+nvalid:ntrain+nvalid+ntest] - - return train, valid, test - -def all(path=None): - return head(n=None, path=path) - - diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/__init__.py --- a/datasets/__init__.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -from dataset import dataset, Dataset diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/config.py --- a/datasets/config.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -"""Configuration options for datasets - - -Especially, the locations of data files. -""" - -import os -def env_get(key, default): - return default if os.getenv(key) is None else os.getenv(key) - -def data_root(): - return env_get('PYLEARN_DATA_ROOT', '/u/bergstrj/pub/data/') - diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/dataset.py --- a/datasets/dataset.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -"""The dataset-from-descriptor mechanism.""" - -_factory = {} - -def add_dataset_factory(tok0, fn): - """Add `fn` as the handler for descriptors whose first token is `tok0`. - - :returns: None - - """ - if tok0 in _factory: - raise Exception('Identifier already in use:', tok0) - else: - _factory[tok0] = fn - -def dataset_factory(tok0): - """Register a function as the handler for a given kind of dataset, identified by `tok0`. - - When someone calls dataset_from_descr('kind_of_dataset option1 option2, etc.', approx=1), - then the handler registered for 'kind_of_dataset' will be called with the same arguments as - dataset_from_descr. - - .. code-block:: python - - @dataset_factory('MNIST') - def mnist_related_dataset(descr, **kwargs): - ... 
- - :returns: `dectorator` - """ - def decorator(fn): - add_dataset_factory(tok0, fn) - return fn - return decorator - -def dataset(descr, **kwargs): - """Return the dataset described by `descr`. - - :param descr: a dataset identifier - :type descr: str - :returns: `Dataset` - - """ - tok0 = descr.split()[0] - fn = _factory[tok0] - return fn(descr, **kwargs) - - -class Dataset(object): - """Dataset is a generic container for pylearn datasets. - - It is not intended to put any restriction whatsoever on its contents. - - It is intended to encourage certain conventions, described below. Conventions should arise - naturally among datasets in PyLearn. When a few datasets adhere to a new convention, then - describe it here and make it more official. - - If no particular convention applies. Create your own object to store the dataset, and - assign it to the `data` attribute. - """ - data = None - - """ - SIMPLE REGRESSION / CLASSIFICATION - ---------------------------------- - - In this setting, you are aiming to do vector classification or vector regression - where your train, valid and test sets fit in memory. - The convention is to put your data into numpy ndarray instances. Put training data in the - `train` attribute, validation data in the `valid` attribute and test data in the `test - attribute`. - Each of those attributes should be an instance that defines at least two attributes: `x` for the - input matrix and `y` for the target matrix. The `x` ndarray should be one example per - leading index (row for matrices). - The `y` ndarray should be one target per leading index (entry for vectors, row for matrices). - If `y` is a classification target, than it should be a vector with numpy dtype 'int32'. - - If there are weights associated with different examples, then create a 'weights' attribute whose - value is a vector with one floating-point value (typically double-precision) per example. - - If the task is classification, then the classes should be mapped to the integers - 0,1,...,N-1. - The number of classes (here, N) should be stored in the `n_classes` attribute. - - """ - train = None #instance with .x, .y - - valid = None #instance with .x, .y - - test = None #instance with .x, .y - - n_classes = None #int - - """ - WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES - ------------------------------------------- - - In this setting we typically encode images as vectors, by enumerating the pixel values in - left-to-right, top-to-bottom order. Pixel values should be in floating-point, and - normalized between 0 and 1. - - The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows, - cols). - - """ - - img_shape = None # (rows, cols) - - - """ - TIMESERIES - ---------- - - When dealing with examples which are themselves timeseries, put each example timeseries in a - tensor and make a list of them. 
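A self-contained sketch of the descriptor/factory registry pattern implemented by add_dataset_factory, dataset_factory and dataset above; the token and handler below are invented for illustration:

    _registry = {}

    def register(tok0):
        # decorator form of the factory registration: the first token of a
        # descriptor string selects the handler that builds the dataset
        def decorator(fn):
            if tok0 in _registry:
                raise KeyError('identifier already in use: %s' % tok0)
            _registry[tok0] = fn
            return fn
        return decorator

    def build(descr, **kwargs):
        return _registry[descr.split()[0]](descr, **kwargs)

    @register('TOY')
    def toy_dataset(descr, **kwargs):
        return ('built', descr, kwargs)

    print(build('TOY 1k', approx=1))    # ('built', 'TOY 1k', {'approx': 1})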
Generally use tensors, and resort to lists or arrays - wherever different - """ - diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/shapeset1.py --- a/datasets/shapeset1.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -""" -Routines to load/access Shapeset1 -""" - -from __future__ import absolute_import - -import os -import numpy - -from ..amat import AMat -from .config import data_root - -def _head(path, n): - dat = AMat(path=path, head=n) - - try: - assert dat.input.shape[0] == n - assert dat.target.shape[0] == n - except Exception , e: - raise Exception("failed to read %i lines from file %s" % (n, path)) - - return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0]) - - -def head_train(n=10000): - """Load the first Shapeset1 training examples. - - Returns two matrices: x, y. x has N rows of 1024 columns. Each row of x represents the - 32x32 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] - is the label of the i'th row of x. - - """ - path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.10000.train.shape.amat') - return _head(path, n) - -def head_valid(n=5000): - """Load the first Shapeset1 validation examples. - - Returns two matrices: x, y. x has N rows of 1024 columns. Each row of x represents the - 32x32 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] - is the label of the i'th row of x. - - """ - path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.5000.valid.shape.amat') - return _head(path, n) - -def head_test(n=5000): - """Load the first Shapeset1 testing examples. - - Returns two matrices: x, y. x has N rows of 1024 columns. Each row of x represents the - 32x32 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] - is the label of the i'th row of x. - - """ - path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.5000.test.shape.amat') - return _head(path, n) - -def train_valid_test(ntrain=10000, nvalid=5000, ntest=5000): - return head_train(n=ntrain), head_valid(n=nvalid), head_test(n=ntest) - - diff -r 27b1344a57b1 -r 8fff4bc26f4c datasets/smallNorb.py --- a/datasets/smallNorb.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -import os -import numpy -from ..filetensor import read -from .config import data_root - -#Path = '/u/bergstrj/pub/data/smallnorb' -#Path = '/home/fringant2/lisa/louradoj/data/smallnorb' -#Path = '/home/louradou/data/norb' - -class Paths(object): - """File-related operations on smallNorb - """ - def __init__(self): - smallnorb = [data_root(), 'smallnorb'] - self.train_dat = os.path.join(*\ - smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat']) - self.test_dat = os.path.join(*\ - smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat']) - self.train_cat = os.path.join(*\ - smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat']) - self.test_cat = os.path.join(*\ - smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat']) - self.train_info = os.path.join(*\ - smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-info.mat']) - self.test_info = os.path.join(*\ - smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-info.mat']) - - def load_append_train_test(self, normalize_pixels=True, downsample_amt=1, dtype='uint8'): - """ Load the smallNorb data into numpy matrices. 
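A minimal numpy illustration of the fixed-size greyscale image convention from the Dataset docstring above (pixels flattened in raster order, values in [0, 1], shape recorded separately as (rows, cols)); the sizes here are arbitrary:

    import numpy

    rows, cols = 28, 28                      # the img_shape convention: (rows, cols)
    x = numpy.random.rand(5, rows * cols)    # five images, one flattened row each
    first_image = x[0].reshape(rows, cols)   # back to a 2-D picture for display
    print(first_image.shape)                 # (28, 28)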
- - normalize_pixels True will divide the values by 255, which makes sense in conjunction - with dtype=float32 or dtype=float64. - - """ - def downsample(dataset): - return dataset[:, 0, ::downsample_amt, ::downsample_amt] - - samples = downsample(read(open(self.train_dat))) - samples = numpy.vstack((samples, downsample(read(open(self.test_dat))))) - samples = numpy.asarray(samples, dtype=dtype) - if normalize_pixels: - samples *= (1.0 / 255.0) - - labels = read(open(self.train_cat)) - labels = numpy.hstack((labels, read(open(self.test_cat)))) - - infos = read(open(self.train_info)) - infos = numpy.vstack((infos, read(open(self.test_info)))) - - return samples, labels, infos - -def smallnorb_iid(ntrain=29160, nvalid=9720, ntest=9720, dtype='float64', normalize_pixels=True): - """Variation of the smallNorb task in which we randomly shuffle all the object instances - together before dividing into train/valid/test. - - The default train/valid/test sizes correspond to 60/20/20 split of the entire dataset. - - :returns: 5, (train_x, train_labels), (valid_x, valid_labels), (test_x, test_labels) - - """ - # cut from /u/louradoj/theano/hpu/expcode1.py - rng = numpy.random.RandomState(1) - samples, labels, infos = Paths().load_append_train_test(downsample_amt=3, dtype=dtype, normalize_pixels=normalize_pixels) - - nsamples = samples.shape[0] - if ntrain + nvalid + ntest > nsamples: - raise Exception("ntrain+nvalid+ntest exceeds number of samples (%i)" % nsamples, - (ntrain, nvalid, ntest)) - i0 = 0 - i1 = ntrain - i2 = ntrain + nvalid - i3 = ntrain + nvalid + ntest - - indices = rng.permutation(nsamples) - train_rows = indices[i0:i1] - valid_rows = indices[i1:i2] - test_rows = indices[i2:i3] - - n_labels = 5 - - def _pick_rows(rows): - a = numpy.array([samples[i].flatten() for i in rows]) - b = numpy.array([labels[i] for i in rows]) - return a, b - - return [_pick_rows(r) for r in (train_rows, valid_rows, test_rows)] - -def smallnorb_azSplit(): - # cut from /u/louradoj/theano/hpu/expcode1.py - # WARNING NOT NECESSARILY WORKING CODE - - samples, labels, infos = _load_append_train_test() - train_rows, valid_rows, test_rows = [], [], [] - train_rows_azimuth = [] - for instance in range(10): - az_min = 4*instance - az_max = 4*instance + 18 - train_rows_azimuth.append( [a % 36 for a in range(az_min,az_max,2)] ) - #print "train_rows_azimuth", train_rows_azimuth - for i, info in enumerate(infos): - if info[2] in train_rows_azimuth[info[0]]: - train_rows.append(i) - elif info[2] / 2 % 2 == 0: - test_rows.append(i) - else: - valid_rows.append(i) - - return [_pick_rows(samples, labels, r) for r in (train_rows, valid_rows, test_rows)] diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/README.txt --- a/embeddings/README.txt Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -Messy scripts for working with Jason + Ronan's embeddings. - -Parameters are given in parameters.py diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/__init__.py --- a/embeddings/__init__.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -from process import * diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/convert.py --- a/embeddings/convert.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -#!/usr/bin/python -""" -Convert stdin sentences to word embeddings, and output YAML. 
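Illustrative only (random pixels instead of the smallNorb .mat files): the same downsample-by-striding and divide-by-255 normalization performed in load_append_train_test above, minus the stereo-pair axis:

    import numpy

    def downsample(images, amt):
        # keep every amt-th pixel along both image axes
        return images[:, ::amt, ::amt]

    raw = numpy.random.randint(0, 256, size=(4, 96, 96)).astype('uint8')
    samples = numpy.asarray(downsample(raw, 3), dtype='float64')
    samples *= 1.0 / 255.0                         # pixel values now in [0, 1]
    print(samples.shape, samples.max() <= 1.0)     # (4, 32, 32) True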
-""" - -import sys, string -import read -import yaml - -output = [] -for l in sys.stdin: - l = string.strip(l) - output.append((l, read.convert_string(l))) - -print yaml.dump(output) diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/one-per-line.py --- a/embeddings/one-per-line.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#!/usr/bin/python - -import string -#import psyco - -weightsfile = "lm-weights.txt" -vocabfile = "words.asc" -size = 30000 -dimensions = 50 - -import numpy, math -import sys -from common.str import percent - -word_to_vector = {} - -f = open(weightsfile) -f.readline() -vals = [float(v) for v in string.split(f.readline())] -assert len(vals) == size * dimensions -vals.reverse() -#for i in range(size): -r = range(size) -r.reverse() -for i in r: - l = vals[dimensions*i:dimensions*(i+1)] - print string.join([`s` for s in l], "\t") diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/parameters.py --- a/embeddings/parameters.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -""" -Locations of the embedding data files. -""" -#WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt" -#VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc" -WEIGHTSFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt" -VOCABFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc" -NUMBER_OF_WORDS = 30000 -DIMENSIONS = 50 -UNKNOWN = "UNKNOWN" diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/process.py --- a/embeddings/process.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -""" -Read in the weights file -""" - -import string -import sys - -from parameters import * - -__words = None -__word_to_embedding = None -__read = False - -def length(): - """ - @return: The length of embeddings - """ - return len(__word_to_embedding[__words[0]]) - -def word_to_embedding(w): - read_embeddings() - return __word_to_embedding[w] - -def read_embeddings(): - global __words - global __word_to_embedding - global __read - if __read: return - - __words = [string.strip(w) for w in open(VOCABFILE).readlines()] - assert len(__words) == NUMBER_OF_WORDS - - import numpy, math - from common.str import percent - - __word_to_embedding = {} - - sys.stderr.write("Reading %s...\n" % WEIGHTSFILE) - f = open(WEIGHTSFILE) - f.readline() - vals = [float(v) for v in string.split(f.readline())] - assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS - for i in range(NUMBER_OF_WORDS): - l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)] - w = __words[i] - __word_to_embedding[w] = l - __read = True - for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w]) - sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) - -import re -numberre = re.compile("[0-9]") -slashre = re.compile("\\\/") - -def preprocess_word(origw): - """ - Convert a word so that it can be embedded directly. - Returned the preprocessed sequence. - @note: Preprocessing is appropriate for Penn Treebank style documents. - #@note: Perhaps run L{common.penntreebank.preprocess} on the word first. 
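A loose, self-contained sketch of the token normalization performed by preprocess_word above (bracket tokens, lowercasing, digit replacement, UNKNOWN fallback); the tiny vocabulary is invented and the slash handling is omitted:

    import re

    number_re = re.compile("[0-9]")
    BRACKETS = {"-LRB-": "(", "-RRB-": ")", "-LCB-": "{",
                "-RCB-": "}", "-LSB-": "[", "-RSB-": "]"}

    def normalize(word, vocabulary, unknown="UNKNOWN"):
        w = BRACKETS.get(word, word)
        if w not in vocabulary:
            # lowercase and map each digit to the literal token NUMBER
            w = number_re.sub("NUMBER", w.lower())
        if w not in vocabulary:
            w = unknown
        return w

    vocab = set(["the", "(", "NUMBER", "UNKNOWN"])
    print([normalize(w, vocab) for w in ["The", "-LRB-", "7", "zyzzyva"]])
    # ['the', '(', 'NUMBER', 'UNKNOWN']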
- """ - read_embeddings() - if origw == "-LRB-": w = "(" - elif origw == "-RRB-": w = ")" - elif origw == "-LCB-": w = "{" - elif origw == "-RCB-": w = "}" - elif origw == "-LSB-": w = "[" - elif origw == "-RSB-": w = "]" - else: - w = origw - if w not in __word_to_embedding: - w = string.lower(w) - w = slashre.sub("/", w) - w = numberre.sub("NUMBER", w) -# if w not in __word_to_embedding: -# w = string.lower(w) -# w = numberre.sub("NUMBER", w) - if w not in __word_to_embedding: -# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) - w = UNKNOWN - assert w in __word_to_embedding - return w - -def preprocess_seq(l): - """ - Convert a sequence so that it can be embedded directly. - Returned the preprocessed sequence. - @note: Preprocessing is appropriate for Penn Treebank style documents. - """ - read_embeddings() - lnew = [] - for origw in l: - w = preprocess_word(origw) - lnew.append(w) - return lnew - -#def convert_string(s, strict=False): -# """ -# Convert a string to a sequence of embeddings. -# @param strict: If strict, then words *must* be in the vocabulary. -# @todo: DEPRECATED Remove this function. -# """ -# read_embeddings() -# e = [] -# for origw in string.split(string.lower(s)): -# w = numberre.sub("NUMBER", origw) -# if w in __word_to_embedding: -# e.append(__word_to_embedding[w]) -# else: -# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) -# assert not strict -# e.append(__word_to_embedding[UNKNOWN]) -# return e - -#def test(): -# """ -# Debugging code. -# """ -# read_embeddings() -# for w in __word_to_embedding: -# assert len(__word_to_embedding[w]) == 50 -# import numpy -# for w1 in __words: -# e1 = numpy.asarray(__word_to_embedding[w1]) -# lst = [] -# print w1, numpy.dot(e1, e1) -# for w2 in __word_to_embedding: -# if w1 >= w2: continue -# e2 = numpy.asarray(__word_to_embedding[w2]) -# d = (e1 - e2) -# l2 = numpy.dot(d, d) -# lst.append((l2, w1, w2)) -# lst.sort() -# print lst[:10] -# -#test() diff -r 27b1344a57b1 -r 8fff4bc26f4c embeddings/read-original.py --- a/embeddings/read-original.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -#!/usr/bin/python - -import string -#import psyco - -weightsfile = "lm-weights.txt" -vocabfile = "words.asc" -size = 30000 -dimensions = 50 - -words = [string.strip(w) for w in open(vocabfile).readlines()] -assert len(words) == 30000 - -import numpy, math -import sys -from common.str import percent - -word_to_vector = {} - -f = open(weightsfile) -f.readline() -vals = [float(v) for v in string.split(f.readline())] -assert len(vals) == size * dimensions -vals.reverse() -for i in range(size): - l = vals[dimensions*i:dimensions*(i+1)] - w = words[i] - word_to_vector[w] = l - -# l2 = numpy.asarray(l) -# print math.fabs(50 - numpy.sum(l2*l2)), w - -cnt = 0 -for i1 in range(len(words)): - for i2 in range(len(words)): - w1 = words[i1] - w2 = words[i2] - cnt += 1 - if i1 <= i2: continue - l1 = numpy.asarray(word_to_vector[w1]) - l2 = numpy.asarray(word_to_vector[w2]) - d = l2 - l1 - dist = numpy.sum(d * d) - if dist < 50: - print numpy.sum(d * d), w1, w2, i1, i2 - if cnt % 1000 == 0: - sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector))) diff -r 27b1344a57b1 -r 8fff4bc26f4c examples/linear_classifier.py --- a/examples/linear_classifier.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,224 +0,0 @@ -#! /usr/bin/env python -""" -T. 
Bertin-Mahieux (2008) University of Montreal -bertinmt@iro.umontreal.ca - -linear_classifier.py -Simple script that creates a linear_classifier, and -learns the parameters using backpropagation. - -This is to illustrate how to use theano/pylearn. -Anyone who knows how to make this script simpler/clearer is welcome to -make the modifications. -""" - - -import os -import sys -import time -import copy -import pickle -import numpy -import numpy as N -import numpy.random as NR -from pylearn import cost -import theano -from theano import tensor as T - - -def cost_function(*args,**kwargs) : - """ default cost function, quadratic """ - return cost.quadratic(*args,**kwargs) - - -class modelgraph() : - """ class that contains the graph of the model """ - lr = T.scalar() # learning rate - inputs = T.matrix() # inputs (one example per line) - true_outputs = T.matrix() # outputs (one example per line) - W = T.matrix() # weights input * W + b= output - b = T.vector() # bias - outputs = T.dot(inputs,W) + b # output, one per line - costs = cost_function(true_outputs,outputs) # costs - g_W = T.grad(costs,W) # gradient of W - g_b = T.grad(costs,b) # gradient of b - new_W = T.sub_inplace(W, lr * g_W) # update inplace of W - new_b = T.sub_inplace(b, lr * g_b) # update inplace of b - - -class model() : - """ - The model! - Contains needed matrices, needed functions, and a link to the model graph. - """ - - def __init__(self,input_size,output_size) : - """ init matrix and bias, creates the graph, create a dict of compiled functions """ - # graph - self.graph = modelgraph() - # weights and bias, saved in self.params - seed = 666 - r = NR.RandomState(seed) - W = r.uniform(size = [input_size, output_size], low = -1/N.sqrt(input_size), high = 1/N.sqrt(input_size)) - b = numpy.zeros((output_size, )) - self.params = [W,b] - # dictionary of compiled functions - self.func_dict = dict() - # keep some init_infos (may not be necessary) - self.init_params = [input_size,output_size] - - - def update(self,lr,true_inputs,true_outputs) : - """ does an update of the model, one gradient descent """ - # do we already have the proper theano function? - if self.func_dict.has_key('update_func') : - self.func_dict['update_func'](lr,true_inputs,true_outputs,self.params[0],self.params[1]) - return - else : - # create the theano function, tell him what are the inputs and outputs) - func = theano.function([self.graph.lr,self.graph.inputs,self.graph.true_outputs, - self.graph.W, self.graph.b], - [self.graph.new_W,self.graph.new_b]) - # add function to dictionary, so we don't compile it again - self.func_dict['update_func'] = func - # use this function - func(lr,true_inputs,true_outputs,self.params[0],self.params[1]) - return - - def costs(self,true_inputs,true_outputs) : - """ get the costs for given examples, don't update """ - # do we already have the proper theano function? 
- if self.func_dict.has_key('costs_func') : - return self.func_dict['costs_func'](true_inputs,true_outputs,self.params[0],self.params[1]) - else : - # create the theano function, tell him what are the inputs and outputs) - func = theano.function([self.graph.inputs,self.graph.true_outputs,self.graph.W,self.graph.b], - [self.graph.costs]) - # add function to dictionary, se we don't compile it again - self.func_dict['costs_func'] = func - # use this function - return func(true_inputs,true_outputs,self.params[0],self.params[1]) - - def outputs(self,true_inputs) : - """ get the output for a set of examples (could be called 'predict') """ - # do we already have the proper theano function? - if self.func_dict.has_key('outputs_func') : - return self.func_dict['outputs_func'](true_inputs,self.params[0],self.params[1]) - else : - # create the theano function, tell him what are the inputs and outputs) - func = theano.function([self.graph.inputs, self.graph.W, self.graph.b], - [self.graph.outputs]) - # add function to dictionary, se we don't compile it again - self.func_dict['outputs_func'] = func - # use this function - return func(true_inputs,self.params[0],self.params[1]) - - def __getitem__(self,inputs) : - """ for simplicity, we can use the model this way: predictions = model[inputs] """ - return self.outputs(inputs) - - def __getstate__(self) : - """ - To save/copy the model, used by pickle.dump() and by copy.deepcopy(). - @return a dictionnary with the params (matrix + bias) - """ - d = dict() - d['params'] = self.params - d['init_params'] = self.init_params - return d - - def __setstate__(self,d) : - """ - Get the dictionary created by __getstate__(), use it to recreate the model. - """ - self.params = d['params'] - self.init_params = d['init_params'] - self.graph = modelgraph() # we did not save the model graph - - def __str__(self) : - """ returns a string representing the model """ - res = "Linear regressor, input size =",str(self.init_params[0]) - res += ", output size =", str(self.init_params[1]) - return res - - def __equal__(self,other) : - """ - Compares the model based on the params. 
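A plain-numpy sketch of the gradient-descent update that the script above expresses as a Theano graph and compiles once; a 0.5 * sum-of-squares cost is assumed here, which may differ in detail from cost.quadratic:

    import numpy

    def update(W, b, inputs, targets, lr):
        outputs = inputs.dot(W) + b                 # one example per row
        err = outputs - targets
        g_W = inputs.T.dot(err)                     # gradient of 0.5*sum(err**2)
        g_b = err.sum(axis=0)
        return W - lr * g_W, b - lr * g_b

    rng = numpy.random.RandomState(666)
    inputs = numpy.array([[.1, .2], [.2, .8], [.9, .3], [.6, .5]])
    targets = numpy.array([[0.], [0.], [1.], [1.]])
    W = rng.uniform(-0.7, 0.7, size=(2, 1))
    b = numpy.zeros((1,))
    for _ in range(50):
        W, b = update(W, b, inputs, targets, lr=0.1)
    print(inputs.dot(W) + b)                        # predictions drift toward the targets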
- @return True if the params are the same, False otherwise - """ - # class - if not isinstance(other,model) : - return False - # input size - if self.params[0].shape[0] != other.params[0].shape[0] : - return False - # output size - if self.params[0].shape[1] != other.params[0].shape[1] : - return False - # actual values - if not (self.params[0] == other.params[0]).all(): - return False - if not (self.params[1] == other.params[1]).all(): - return False - # all good - return True - - -def die_with_usage() : - """ help menu """ - print 'simple script to illustrate how to use theano/pylearn' - print 'to launch:' - print ' python linear_classifier.py -launch' - sys.exit(0) - - - -#************************************************************ -# main - -if __name__ == '__main__' : - - if len(sys.argv) < 2 : - die_with_usage() - - # print create data - inputs = numpy.array([[.1,.2], - [.2,.8], - [.9,.3], - [.6,.5]]) - outputs = numpy.array([[0], - [0], - [1], - [1]]) - assert inputs.shape[0] == outputs.shape[0] - - # create model - m = model(2,1) - - # predict - print 'prediction before training:' - print m[inputs] - - # update it for 100 iterations - for k in range(50) : - m.update(.1,inputs,outputs) - - # predict - print 'prediction after training:' - print m[inputs] - - # show points - import pylab as P - colors = outputs.flatten().tolist() - x = inputs[:,0] - y = inputs[:,1] - P.plot(x[numpy.where(outputs==0)[0]],y[numpy.where(outputs==0)[0]],'r+') - P.plot(x[numpy.where(outputs==1)[0]],y[numpy.where(outputs==1)[0]],'b+') - # decision line - p1 = (.5 - m.params[1] * 1.) / m.params[0][1,0] # abs = 0 - p2 = (.5 - m.params[1] * 1.) / m.params[0][0,0] # ord = 0 - P.plot((0,p2[0],2*p2[0]),(p1[0],0,-p1[0]),'g-') - # show - P.axis([-1,2,-1,2]) - P.show() - diff -r 27b1344a57b1 -r 8fff4bc26f4c examples/theano_update.py --- a/examples/theano_update.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -import theano -from theano import tensor - -import numpy - -# Two scalar symbolic variables -a = tensor.scalar() -b = tensor.scalar() - -# Definition of output symbolic variable -c = a * b -# Definition of the function computing it -fprop = theano.function([a,b], [c]) - -# Initialize numerical variables -a_val = numpy.array(12.) -b_val = numpy.array(2.) -print 'a_val =', a_val -print 'b_val =', b_val - -# Numerical value of output is returned by the call to "fprop" -c_val = fprop(a_val, b_val) -print 'c_val =', c_val - - -# Definition of simple update (increment by one) -new_b = b + 1 -update = theano.function([b], [new_b]) - -# New numerical value of b is returned by the call to "update" -b_val = update(b_val) -print 'new b_val =', b_val -# We can use the new value in "fprop" -c_val = fprop(a_val, b_val) -print 'c_val =', c_val - - -# Definition of in-place update (increment by one) -re_new_b = tensor.add_inplace(b, 1.) -re_update = theano.function([b], [re_new_b]) - -# "re_update" can be used the same way as "update" -b_val = re_update(b_val) -print 'new b_val =', b_val -# We can use the new value in "fprop" -c_val = fprop(a_val, b_val) -print 'c_val =', c_val - -# It is not necessary to keep the return value when the update is done in place -re_update(b_val) -print 'new b_val =', b_val -c_val = fprop(a_val, b_val) -print 'c_val =', c_val - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c exceptions.py --- a/exceptions.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -""" -Common exceptions. 
-@todo: This file should be part of a common/ python package. -""" - -class AbstractFunction (Exception): """Derived class must override this function""" -class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" diff -r 27b1344a57b1 -r 8fff4bc26f4c external/wrap_libsvm.py --- a/external/wrap_libsvm.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -"""Run an experiment using libsvm. -""" -import numpy -from ..datasets import dataset_from_descr - -# libsvm currently has no python installation instructions/convention. -# -# This module uses a specific convention for libsvm's installation. -# I base this on installing libsvm-2.88. -# To install libsvm's python module, do three things: -# 1. Build libsvm (run make in both the root dir and the python subdir). -# 2. touch a '__init__.py' file in the python subdir -# 3. add a symbolic link to a PYTHONPATH location that looks like this: -# libsvm -> /libsvm-2.88/python/ -# -# That is the sort of thing that this module expects from 'import libsvm' - -import libsvm - -def score_01(x, y, model): - assert len(x) == len(y) - size = len(x) - errors = 0 - for i in range(size): - prediction = model.predict(x[i]) - #probability = model.predict_probability - if (y[i] != prediction): - errors = errors + 1 - return float(errors)/size - -#this is the dbdict experiment interface... if you happen to use dbdict -class State(object): - #TODO: parametrize to get all the kernel types, not hardcode for RBF - dataset = 'MNIST_1k' - C = 10.0 - kernel = 'RBF' - # rel_gamma is related to the procedure Jerome used. He mentioned why in - # quadratic_neurons/neuropaper/draft3.pdf. - rel_gamma = 1.0 - - def __init__(self, **kwargs): - for k, v in kwargs: - setattr(self, k, type(getattr(self, k))(v)) - - -def dbdict_run_svm_experiment(state, channel=lambda *args, **kwargs:None): - """Parameters are described in state, and returned in state. - - :param state: object instance to store parameters and return values - :param channel: not used - - :returns: None - - This is the kind of function that dbdict-run can use. - - """ - ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)) = dataset_from_descr(state.dataset) - - #libsvm needs stuff in int32 on a 32bit machine - #TODO: test this on a 64bit machine - train_y = numpy.asarray(train_y, dtype='int32') - valid_y = numpy.asarray(valid_y, dtype='int32') - test_y = numpy.asarray(test_y, dtype='int32') - problem = svm.svm_problem(train_y, train_x); - - gamma0 = 0.5 / numpy.sum(numpy.var(train_x, axis=0)) - - param = svm.svm_parameter(C=state.C, - kernel_type=getattr(svm, state.kernel), - gamma=state.rel_gamma * gamma0) - - model = svm.svm_model(problem, param) #this is the expensive part - - state.train_01 = score_01(train_x, train_y, model) - state.valid_01 = score_01(valid_x, valid_y, model) - state.test_01 = score_01(test_x, test_y, model) - - state.n_train = len(train_y) - state.n_valid = len(valid_y) - state.n_test = len(test_y) - -def run_svm_experiment(**kwargs): - """Python-friendly interface to dbdict_run_svm_experiment - - Parameters are used to construct a `State` instance, which is returned after running - `dbdict_run_svm_experiment` on it. - - .. code-block:: python - results = run_svm_experiment(dataset='MNIST_1k', C=100.0, rel_gamma=0.01) - print results.n_train - # 1000 - print results.valid_01, results.test_01 - # 0.14, 0.10 #.. or something... 
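A standalone sketch of the 0-1 scoring idea in score_01 above, with a trivial stand-in for model.predict:

    import numpy

    def score_01(x, y, predict):
        # fraction of examples whose predicted class differs from the label
        errors = sum(1 for xi, yi in zip(x, y) if predict(xi) != yi)
        return float(errors) / len(x)

    def predict(xi):                      # stand-in for model.predict
        return int(xi[0] > 0.5)

    x = numpy.array([[0.0], [1.0], [2.0], [3.0]])
    y = numpy.array([0, 0, 1, 1], dtype='int32')
    print(score_01(x, y, predict))        # 0.25 (one of four examples misclassified)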
- - """ - state = State(**kwargs) - state_run_svm_experiment(state) - return state - diff -r 27b1344a57b1 -r 8fff4bc26f4c filetensor.py --- a/filetensor.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,141 +0,0 @@ -""" -Read and write the matrix file format described at -U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} - -The format is for dense tensors: - - - magic number indicating type and endianness - 4bytes - - rank of tensor - int32 - - dimensions - int32, int32, int32, ... - - - -The number of dimensions and rank is slightly tricky: - - for scalar: rank=0, dimensions = [1, 1, 1] - - for vector: rank=1, dimensions = [?, 1, 1] - - for matrix: rank=2, dimensions = [?, ?, 1] - -For rank >= 3, the number of dimensions matches the rank exactly. - - -@todo: add complex type support - -""" -import sys -import numpy - -def _prod(lst): - p = 1 - for l in lst: - p *= l - return p - -_magic_dtype = { - 0x1E3D4C51 : ('float32', 4), - #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? - 0x1E3D4C53 : ('float64', 8), - 0x1E3D4C54 : ('int32', 4), - 0x1E3D4C55 : ('uint8', 1), - 0x1E3D4C56 : ('int16', 2), - } -_dtype_magic = { - 'float32': 0x1E3D4C51, - #'packed matrix': 0x1E3D4C52, - 'float64': 0x1E3D4C53, - 'int32': 0x1E3D4C54, - 'uint8': 0x1E3D4C55, - 'int16': 0x1E3D4C56 - } - -# -# TODO: implement item selection: -# e.g. load('some mat', subtensor=(:6, 2:5)) -# -# This function should be memory efficient by: -# - allocating an output matrix at the beginning -# - seeking through the file, reading subtensors from multiple places -def read(f, subtensor=None, debug=False): - """Load all or part of file 'f' into a numpy ndarray - - @param f: file from which to read - @type f: file-like object - - If subtensor is not None, it should be like the argument to - numpy.ndarray.__getitem__. The following two expressions should return - equivalent ndarray objects, but the one on the left may be faster and more - memory efficient if the underlying file f is big. - - read(f, subtensor) <===> read(f)[*subtensor] - - Support for subtensors is currently spotty, so check the code to see if your - particular type of subtensor is supported. - - """ - def _read_int32(f): - s = f.read(4) - s_array = numpy.fromstring(s, dtype='int32') - return s_array.item() - - #what is the data type of this matrix? - #magic_s = f.read(4) - #magic = numpy.fromstring(magic_s, dtype='int32') - magic = _read_int32(f) - magic_t, elsize = _magic_dtype[magic] - if debug: - print 'header magic', magic, magic_t, elsize - if magic_t == 'packed matrix': - raise NotImplementedError('packed matrix not supported') - - #what is the rank of the tensor? - ndim = _read_int32(f) - if debug: print 'header ndim', ndim - - #what are the dimensions of the tensor? 
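A stripped-down, self-contained sketch of the filetensor header layout described at the top of this file (magic number, rank, at least three int32 dimensions, then the raw elements); it assumes native byte order and float64 data, and is only an illustration, not a drop-in replacement for read/write:

    import numpy
    from io import BytesIO

    MAGIC_FLOAT64 = 0x1E3D4C53              # magic for float64 from the table above

    def write_ft(f, mat):
        mat = numpy.asarray(mat, dtype='float64')
        header = [MAGIC_FLOAT64, mat.ndim] + list(mat.shape) + [1] * max(0, 3 - mat.ndim)
        f.write(numpy.asarray(header, dtype='int32').tobytes())
        f.write(mat.tobytes())

    def read_ft(f):
        magic, ndim = numpy.frombuffer(f.read(8), dtype='int32')
        assert magic == MAGIC_FLOAT64
        dims = numpy.frombuffer(f.read(4 * max(ndim, 3)), dtype='int32')[:ndim]
        return numpy.frombuffer(f.read(), dtype='float64').reshape(dims)

    buf = BytesIO()
    mat = numpy.arange(6, dtype='float64').reshape(2, 3)
    write_ft(buf, mat)
    buf.seek(0)
    print((read_ft(buf) == mat).all())      # True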
- dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] - dim_size = _prod(dim) - if debug: print 'header dim', dim, dim_size - - rval = None - if subtensor is None: - rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) - elif isinstance(subtensor, slice): - if subtensor.step not in (None, 1): - raise NotImplementedError('slice with step', subtensor.step) - if subtensor.start not in (None, 0): - bytes_per_row = _prod(dim[1:]) * elsize - raise NotImplementedError('slice with start', subtensor.start) - dim[0] = min(dim[0], subtensor.stop) - rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) - else: - raise NotImplementedError('subtensor access not written yet:', subtensor) - - return rval - -def write(f, mat): - """Write a numpy.ndarray to file. - - @param f: file into which to write - @type f: file-like object - - @param mat: array to write to file - @type mat: numpy ndarray or compatible - - """ - def _write_int32(f, i): - i_array = numpy.asarray(i, dtype='int32') - if 0: print 'writing int32', i, i_array - i_array.tofile(f) - - try: - _write_int32(f, _dtype_magic[str(mat.dtype)]) - except KeyError: - raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) - - _write_int32(f, len(mat.shape)) - shape = mat.shape - if len(shape) < 3: - shape = list(shape) + [1] * (3 - len(shape)) - if 0: print 'writing shape =', shape - for sh in shape: - _write_int32(f, sh) - mat.tofile(f) - diff -r 27b1344a57b1 -r 8fff4bc26f4c image_tools.py --- a/image_tools.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - -import numpy - - -def make_weights_image(mat, xres, yres, i, j, nrow, ncol): - """ - Displays the filters implemented by a weight matrix. - - Each filter corresponds to a row of mat and will be represented - by a xres*yres image. - - Units from i to j will be included in the picture. - - The picture will have nrow rows of filters and ncol columns - of filters. Unused spots for filters will be filled with zeros. - - The return value is a matrix suitable for display with - matplotlib's imshow. - """ - - assert j > i - n = j - i - result = numpy.zeros((ncol * xres, nrow * yres)) - submat = mat[i:j] - for k, row in enumerate(submat): - x = (k % ncol)*xres - y = (k / ncol)*yres - entry = row.reshape((xres, yres)) - lmin, lmax = numpy.min(entry), numpy.max(entry) - ldiff = lmax - lmin - #entry = (entry - lmin) / ldiff - result[x:x + xres, y:y + yres] = entry - return result.T - - - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c kernel_regression.py --- a/kernel_regression.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,231 +0,0 @@ -""" -Implementation of kernel regression: -""" - -from pylearn.learner import OfflineLearningAlgorithm -from theano import tensor as T -from theano.tensor.nnet import prepend_1_to_each_row -from theano.scalar import as_scalar -from common.autoname import AutoName -import theano -import numpy - -# map a N-vector to a 1xN matrix -row_vector = theano.tensor.DimShuffle((False,),['x',0]) -# map a N-vector to a Nx1 matrix -col_vector = theano.tensor.DimShuffle((False,),[0,'x']) - -class KernelRegression(OfflineLearningAlgorithm): - """ -Implementation of kernel regression: -* the data are n (x_t,y_t) pairs and we want to estimate E[y|x] -* the predictor computes - f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t) - with free parameters b and alpha, training inputs x_t, - and kernel function K (gaussian by default). 
- Clearly, each prediction involves O(n) computations. -* the learner chooses b and alpha to minimize - lambda alpha' G' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2 - where G is the matrix with entries G_ij = K(x_i,x_j). - The first (L2 regularization) term is the squared L2 - norm of the primal weights w = \sum_t \alpha_t phi(x_t) - where phi is the function s.t. K(u,v)=phi(u).phi(v). -* this involves solving a linear system with (n+1,n+1) - matrix, which is an O(n^3) computation. In addition, - that linear system matrix requires O(n^2) memory. - So this learning algorithm should be used only for - small datasets. -* the linear system is - (M + lambda I_n) theta = (1, y)' - where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity - except with a 0 at (0,0), M is the matrix with G in the sub-matrix starting - at (1,1), 1's in column 0, except for a value of n at (0,0), and sum_i G_{i,j} - in the rest of row 0. - -Note that this is gives an estimate of E[y|x,training_set] that is the -same as obtained with a Gaussian process regression. The GP -regression would also provide a Bayesian Var[y|x,training_set]. -It corresponds to an assumption that f is a random variable -with Gaussian (process) prior distribution with covariance -function K. Because we assume Gaussian noise we obtain a Gaussian -posterior for f (whose mean is computed here). - - - Usage: - - kernel_regressor=KernelRegression(L2_regularizer=0.1,gamma=0.5) (kernel=GaussianKernel(gamma=0.5)) - kernel_predictor=kernel_regressor(training_set) - all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" field - outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays - outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets) - errors = kernel_predictor.compute_errors(inputs,targets) - mse = kernel_predictor.compute_mse(inputs,targets) - - - - The training_set must have fields "input" and "target". - The test_set must have field "input", and needs "target" if - we want to compute the squared errors. - - The predictor parameters are obtained analytically from the training set. - Training is only done on a whole training set rather than on minibatches - (no online implementation). 
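For intuition, a small numpy sketch of kernel ridge regression in the same spirit as the description above; it drops the bias term and uses the standard lambda * alpha' G alpha regularizer, so it is not the exact linear system given here:

    import numpy

    def gaussian_kernel(A, B, gamma):
        # K[i, j] = exp(-gamma * ||A[i] - B[j]||^2)
        d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return numpy.exp(-gamma * d2)

    def fit(train_x, train_y, gamma, lam):
        # alpha = (G + lam*I)^-1 y   (no bias term)
        G = gaussian_kernel(train_x, train_x, gamma)
        return numpy.linalg.solve(G + lam * numpy.eye(len(train_x)), train_y)

    def predict(x, train_x, alpha, gamma):
        # f(x) = sum_t alpha_t K(x, x_t)
        return gaussian_kernel(x, train_x, gamma).dot(alpha)

    rng = numpy.random.RandomState(0)
    train_x = rng.rand(20, 1)
    train_y = numpy.sin(3.0 * train_x).ravel()
    alpha = fit(train_x, train_y, gamma=10.0, lam=1e-3)
    print(abs(predict(train_x, train_x, alpha, gamma=10.0) - train_y).max())  # small training error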
- - The dataset fields expected and produced by the learning algorithm and the trained model - are the following: - - - Input and output dataset fields (example-wise quantities): - - - 'input' (always expected as an input_dataset field) - - 'target' (always expected by the learning algorithm, optional for learned model) - - 'output' (always produced by learned model) - - 'squared_error' (optionally produced by learned model if 'target' is provided) - = example-wise squared error - """ - def __init__(self, kernel=None, L2_regularizer=0, gamma=1, use_bias=False): - # THE VERSION WITH BIAS DOES NOT SEEM RIGHT - self.kernel = kernel - self.L2_regularizer=L2_regularizer - self.use_bias=use_bias - self.gamma = gamma # until we fix things, the kernel type is fixed, Gaussian - self.equations = KernelRegressionEquations() - - def __call__(self,trainset): - n_examples = len(trainset) - first_example = trainset[0] - n_inputs = first_example['input'].size - n_outputs = first_example['target'].size - b1=1 if self.use_bias else 0 - M = numpy.zeros((n_examples+b1,n_examples+b1)) - Y = numpy.zeros((n_examples+b1,n_outputs)) - for i in xrange(n_examples): - M[i+b1,i+b1]=self.L2_regularizer - data = trainset.fields() - train_inputs = numpy.array(data['input']) - if self.use_bias: - Y[0]=1 - Y[b1:,:] = numpy.array(data['target']) - train_inputs_square,sumG,G=self.equations.compute_system_matrix(train_inputs,self.gamma) - M[b1:,b1:] += G - if self.use_bias: - M[0,1:] = sumG - M[1:,0] = 1 - M[0,0] = M.shape[0] - self.M=M - self.Y=Y - theta=numpy.linalg.solve(M,Y) - return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square) - -class KernelPredictorEquations(AutoName): - train_inputs = T.matrix() # n_examples x n_inputs - train_inputs_square = T.vector() # n_examples - inputs = T.matrix() # minibatchsize x n_inputs - targets = T.matrix() # minibatchsize x n_outputs - theta = T.matrix() # (n_examples+1) x n_outputs - b1 = T.shape(train_inputs_square)[0]>> example = LookupList(['x','y','z'],[1,2,3]) - >>> example['x'] = [1, 2, 3] # set or change a field - >>> print example('z','y') # prints [3,2] - >>> x, y, z = example - >>> x = example[0] - >>> x = example["x"] - >>> print example.keys() # prints ['x','y','z'] - >>> print example.values() # prints [[1,2,3],2,3] - >>> print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)] - >>> example.append_keyval('u',0) # adds item with name 'u' and value 0 - >>> print len(example) # number of items = 4 here - >>> example2 = LookupList(['v', 'w'], ['a','b']) - >>> print example+example2 # addition is like for lists, a concatenation of the items. - >>> example + example # throw an error as we can't have duplicate name. - - @note: The element names should be unique. - - @todo: Convert this documentation into doctest - format, and actually perform doctest'ing: - U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks} - """ - def __init__(self,names=[],values=[]): - #print 'values=', values - #print 'length=', len(values) - #print 'names=', names - #print 'length=',len(names) - assert len(values)==len(names) - self.__dict__['_values']=values - self.__dict__['_name2index']={} - self.__dict__['_names']=names - for i in xrange(len(values)): - assert names[i] not in self._name2index - self._name2index[names[i]]=i - - def keys(self): - return self._names - - def values(self): - return self._values - - def items(self): - """ - Return a list of (name,value) pairs of all the items in the look-up list. 
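For comparison, most of the dict-like behaviour documented for LookupList above is what collections.OrderedDict already provides; what it does not give is positional indexing or the '+' concatenation:

    from collections import OrderedDict

    example = OrderedDict([('x', [1, 2, 3]), ('y', 2), ('z', 3)])
    print(list(example.keys()))       # ['x', 'y', 'z']
    print(list(example.values()))     # [[1, 2, 3], 2, 3]
    print(list(example.items()))      # [('x', [1, 2, 3]), ('y', 2), ('z', 3)]
    example['u'] = 0                  # plays the role of append_keyval('u', 0)
    print(len(example))               # 4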
- """ - return zip(self._names,self._values) - - def __getitem__(self,key): - """ - The key in example[key] can either be an integer to index the fields - or the name of the field. - """ - if isinstance(key,int) or isinstance(key,slice) or (isinstance(key,list) and all([isinstance(i,int) for i in key])): - return self._values[key] - else: # if not an int, key must be a name - # expecting key to be a valid field name - assert isinstance(key,str) - return self._values[self._name2index[key]] - - def __setitem__(self,key,value): - if isinstance(key,int): - self._values[key]=value - else: # if not an int, key must be a name - if key in self._name2index: - self._values[self._name2index[key]]=value - else: - self.append_keyval(key,value) - - def append_keyval(self, key, value): - assert key not in self._name2index - self._name2index[key]=len(self) - self._values.append(value) - self._names.append(key) - - def append_lookuplist(self, *list): - for l in list: - for key in l.keys(): - self.append_keyval(key,l[key]) - del l - - def __len__(self): - return len(self._values) - - def __repr__(self): - return "{%s}" % ", ".join([str(k) + "=" + repr(v) for k,v in self.items()]) - - def __add__(self,rhs): - new_example = deepcopy(self) - for item in rhs.items(): - new_example.append_keyval(item[0],item[1]) - return new_example - - def __radd__(self,lhs): - new_example = deepcopy(lhs) - for item in self.items(): - new_example.append_keyval(item[0],item[1]) - return new_example - - def __eq__(self, other): - return self._values==other._values and self._name2index==other._name2index and self._names==other._names - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - raise NotImplementedError() - - def __call__(self,*names): - """ - Return a list of values associated with the given names (which must all be keys of the lookup list). - """ - if names == self._names: - return self._values - return [self[name] for name in names] - - -if __name__ == '__main__': - - a=LookupList(['a'],[1]) - print a - b=LookupList(['b'],[2]) - print b - a.append_lookuplist(b) - print a - a.append_lookuplist(b) - print a diff -r 27b1344a57b1 -r 8fff4bc26f4c make_test_datasets.py --- a/make_test_datasets.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -from dataset import ArrayDataSet -from shapeset.dset import Polygons -from linear_regression import linear_predictor -from kernel_regression import kernel_predictor -from numpy import * - -""" -General-purpose code to generate artificial datasets that can be used -to test different learning algorithms. -""" - - -def make_triangles_rectangles_online_dataset(image_size=(10,10)): - """ - Make a binary classification dataset to discriminate triangle images from rectangle images. - """ - def convert_dataset(dset): - # convert the n_vert==3 into target==0 and n_vert==4 into target==1 - def mapf(images,n_vertices): - n=len(n_vertices) - targets = ndarray((n,1),dtype='float64') - for i in xrange(n): - targets[i,0] = array([0. 
if n_vertices[i]==3 else 1.],dtype='float64') - return images.reshape(len(images),images[0].size).astype('float64'),targets - return dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]) - - p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) - trainset=convert_dataset(p) - return trainset - - -def make_triangles_rectangles_dataset(n_examples=600,image_size=(10,10), cache = True): - """ - Make a binary classification dataset to discriminate triangle images from rectangle images. - """ - def convert_dataset(dset): - # convert the n_vert==3 into target==0 and n_vert==4 into target==1 - def mapf(images,n_vertices): - n=len(n_vertices) - targets = ndarray((n,1),dtype='float64') - for i in xrange(n): - targets[i,0] = array([0. if n_vertices[i]==3 else 1.],dtype='float64') - return images.reshape(len(images),images[0].size).astype('float64'),targets - return dataset.CachedDataSet(dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]),cache) - - p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) - data = p.subset[0:n_examples] - trainset=convert_dataset(data.subset[0:n_examples]) - return trainset - - -def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10), cache = True): - """ - Make two binary classification datasets to discriminate triangle images from rectangle images. - The first one is the training set, the second is the test set. - """ - data = make_triangles_rectangles_dataset(n_examples=n_examples,image_size=image_size, cache = cache) - n_train = int(n_examples*train_frac) - trainset=convert_dataset(data.subset[0:n_train]) - testset=convert_dataset(data.subset[n_train:n_examples]) - return trainset,testset - - -def make_artificial_datasets_from_function(n_inputs=1, - n_targets=1, - n_examples=20, - train_frac=0.5, - noise_level=0.1, # add Gaussian noise, noise_level=sigma - params_shape=None, - f=None, # function computing E[Y|X] - otherargs=None, # extra args to f - b=None): # force theta[0] with this value - """ - Make regression data of the form - Y | X ~ Normal(f(X,theta,otherargs),noise_level^2) - If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval. - Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently). - The parameters theta is a matrix of shape params_shape that is sampled from Normal(0,1). - Optionally theta[0] is set to the argument 'b', if b is provided. - - Return a training set and a test set, by splitting the generated n_examples - according to the 'train_frac'tion. 
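A condensed numpy sketch of the linear special case described above: sample inputs and parameters from a normal distribution, add Gaussian noise to the targets, and split by train_frac (all names here are illustrative):

    import numpy

    def make_linear_regression_data(n_examples=20, n_inputs=3, n_targets=1,
                                    noise_level=0.1, train_frac=0.5, seed=0):
        rng = numpy.random.RandomState(seed)
        inputs = rng.normal(size=(n_examples, n_inputs))
        theta = rng.normal(size=(n_inputs + 1, n_targets))   # row 0 is the bias
        outputs = theta[0] + inputs.dot(theta[1:])           # E[Y|X]
        targets = outputs + rng.normal(scale=noise_level, size=outputs.shape)
        data = numpy.hstack((inputs, targets))               # input columns, then targets
        n_train = int(train_frac * n_examples)
        return data[:n_train], data[n_train:], theta

    trainset, testset, theta = make_linear_regression_data()
    print(trainset.shape, testset.shape)                     # (10, 4) (10, 4)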
- """ - n_train=int(train_frac*n_examples) - n_test=n_examples-n_train - if n_inputs==1: - delta1=2./n_train - delta2=2./n_test - inputs = vstack((array(zip(range(n_train)))*delta1-1, - 0.5*delta2+array(zip(range(n_test)))*delta2-1)) - else: - inputs = random.normal(size=(n_examples,n_inputs)) - if not f: - f = linear_predictor - if f==kernel_predictor and not otherargs[1]: - otherargs=(otherargs[0],inputs[0:n_train]) - if not params_shape: - if f==linear_predictor: - params_shape = (n_inputs+1,n_targets) - elif f==kernel_predictor: - params_shape = (otherargs[1].shape[0]+1,n_targets) - theta = random.normal(size=params_shape) if params_shape else None - if b: - theta[0]=b - outputs = f(inputs,theta,otherargs) - targets = outputs + random.normal(scale=noise_level,size=(n_examples,n_targets)) - # the | stacking creates a strange bug in LookupList constructor: - # trainset = ArrayDataSet(inputs[0:n_examples/2],{'input':slice(0,n_inputs)}) | \ - # ArrayDataSet(targets[0:n_examples/2],{'target':slice(0,n_targets)}) - # testset = ArrayDataSet(inputs[n_examples/2:],{'input':slice(0,n_inputs)}) | \ - # ArrayDataSet(targets[n_examples/2:],{'target':slice(0,n_targets)}) - data = hstack((inputs,targets)) - - trainset = ArrayDataSet(data[0:n_train], - {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) - testset = ArrayDataSet(data[n_train:], - {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) - return trainset,testset,theta diff -r 27b1344a57b1 -r 8fff4bc26f4c misc_theano.py --- a/misc_theano.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ - -import theano - -class Print(theano.Op): - def __init__(self,message=""): - self.message=message - self.view_map={0:[0]} - - def make_node(self,xin): - xout = xin.type.make_result() - return theano.Apply(op = self, inputs = [xin], outputs=[xout]) - - def perform(self,node,inputs,output_storage): - xin, = inputs - xout, = output_storage - xout[0] = xin - print self.message,xin - - def grad(self,input,output_gradients): - return output_gradients diff -r 27b1344a57b1 -r 8fff4bc26f4c mlp_factory_approach.py --- a/mlp_factory_approach.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,384 +0,0 @@ -import copy, sys, os -import numpy - -import theano -from theano import tensor as T - -import dataset, nnet_ops, stopper, filetensor -from pylearn.lookup_list import LookupList - - -class AbstractFunction (Exception): pass - -class AutoName(object): - """ - By inheriting from this class, class variables which have a name attribute - will have that name attribute set to the class variable name. - """ - class __metaclass__(type): - def __init__(cls, name, bases, dct): - type.__init__(name, bases, dct) - for key, val in dct.items(): - assert type(key) is str - if hasattr(val, 'name'): - val.name = key - -class GraphLearner(object): - class Model(object): - def __init__(self, algo, params): - self.algo = algo - self.params = params - graph = self.algo.graph - self.update_fn = algo._fn([graph.input, graph.target] + graph.params, - [graph.nll] + graph.new_params) - self._fn_cache = {} - - def __copy__(self): - raise Exception('why not called?') - return GraphLearner.Model(self.algo, [copy.copy(p) for p in params]) - - def __eq__(self,other,tolerance=0.) : - """ Only compares weights of matrices and bias vector. 
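A standalone sketch of the tolerance-based parameter comparison that Model.__eq__ implements just below, using plain numpy arrays:

    import numpy

    def params_equal(params_a, params_b, tolerance=0.0):
        # same number of arrays, same shapes, element-wise difference within tolerance
        if len(params_a) != len(params_b):
            return False
        for a, b in zip(params_a, params_b):
            a, b = numpy.asarray(a), numpy.asarray(b)
            if a.shape != b.shape or not numpy.all(numpy.abs(a - b) <= tolerance):
                return False
        return True

    p = [numpy.zeros((2, 3)), numpy.zeros(3)]
    q = [numpy.zeros((2, 3)) + 1e-9, numpy.zeros(3)]
    print(params_equal(p, q), params_equal(p, q, tolerance=1e-6))   # False True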
""" - if not isinstance(other,GraphLearner.Model) : - return False - for p in range(4) : - if self.params[p].shape != other.params[p].shape : - return False - if not numpy.all( numpy.abs(self.params[p] - other.params[p]) <= tolerance ) : - return False - return True - - def _cache(self, key, valfn): - d = self._fn_cache - if key not in d: - d[key] = valfn() - return d[key] - - def update_minibatch(self, minibatch): - if not isinstance(minibatch, LookupList): - print type(minibatch) - assert isinstance(minibatch, LookupList) - self.update_fn(minibatch['input'], minibatch['target'], *self.params) - - def update(self, dataset, - default_minibatch_size=32): - """ - Update this model from more training data.Uses all the data once, cut - into minibatches. No early stopper here. - """ - params = self.params - minibatch_size = min(default_minibatch_size, len(dataset)) - for mb in dataset.minibatches(['input', 'target'], minibatch_size=minibatch_size): - self.update_minibatch(mb) - - def save(self, f): - self.algo.graph.save(f, self) - - def __call__(self, testset, fieldnames=['output_class']): - """Apply this model (as a function) to new data. - - @param testset: DataSet, whose fields feed Result terms in self.algo.g - @type testset: DataSet - - @param fieldnames: names of results in self.algo.g to compute. - @type fieldnames: list of strings - - @return: DataSet with fields from fieldnames, computed from testset by - this model. - @rtype: ApplyFunctionDataSet instance - - """ - graph = self.algo.graph - def getresult(name): - r = getattr(graph, name) - if not isinstance(r, theano.Result): - raise TypeError('string does not name a theano.Result', (name, r)) - return r - - provided = [getresult(name) for name in testset.fieldNames()] - wanted = [getresult(name) for name in fieldnames] - inputs = provided + graph.params - - theano_fn = self._cache((tuple(inputs), tuple(wanted)), - lambda: self.algo._fn(inputs, wanted)) - lambda_fn = lambda *args: theano_fn(*(list(args) + self.params)) - return dataset.ApplyFunctionDataSet(testset, lambda_fn, fieldnames) - - class Graph(object): - class Opt(object): - merge = theano.gof.MergeOptimizer() - gemm_opt_1 = theano.gof.TopoOptimizer(theano.tensor_opt.gemm_pattern_1) - sqr_opt_0 = theano.gof.TopoOptimizer(theano.gof.PatternSub( - (T.mul,'x', 'x'), - (T.sqr, 'x'))) - - def __init__(self, do_sqr=True): - self.do_sqr = do_sqr - - def __call__(self, env): - self.merge(env) - self.gemm_opt_1(env) - if self.do_sqr: - self.sqr_opt_0(env) - self.merge(env) - - def linker(self): - return theano.gof.PerformLinker() - - def early_stopper(self): - stopper.NStages(300,1) - - def train_iter(self, trainset): - raise AbstractFunction - optimizer = Opt() - - def load(self,f) : - raise AbstractFunction - - def save(self,f,model) : - raise AbstractFunction - - - def __init__(self, graph): - self.graph = graph - - def _fn(self, inputs, outputs): - # Caching here would hamper multi-threaded apps - # prefer caching in Model.__call__ - return theano.function(inputs, outputs, - unpack_single=False, - optimizer=self.graph.optimizer, - linker=self.graph.linker() if hasattr(self.graph, 'linker') - else 'c|py') - - def __call__(self, - trainset=None, - validset=None, - iparams=None, - stp=None): - """Allocate and optionally train a model - - @param trainset: Data for minimizing the cost function - @type trainset: None or Dataset - - @param validset: Data for early stopping - @type validset: None or Dataset - - @param input: name of field to use as input - @type input: string - - @param 
target: name of field to use as target - @type target: string - - @param stp: early stopper, if None use default in graphMLP.G - @type stp: None or early stopper - - @return: model - @rtype: GraphLearner.Model instance - - """ - - iparams = self.graph.iparams() if iparams is None else iparams - - # if we load, type(trainset) == 'str' - if isinstance(trainset,str) or isinstance(trainset,file): - #loadmodel = GraphLearner.Model(self, iparams) - loadmodel = self.graph.load(self,trainset) - return loadmodel - - curmodel = GraphLearner.Model(self, iparams) - best = curmodel - - if trainset is not None: - #do some training by calling Model.update_minibatch() - if stp == None : - stp = self.graph.early_stopper() - try : - countiter = 0 - for mb in self.graph.train_iter(trainset): - curmodel.update_minibatch(mb) - if stp.set_score: - if validset: - stp.score = curmodel(validset, ['validset_score']) - if (stp.score < stp.best_score): - best = copy.copy(curmodel) - else: - stp.score = 0.0 - countiter +=1 - stp.next() - except StopIteration : - print 'Iterations stopped after ', countiter,' iterations' - if validset: - curmodel = best - return curmodel - - -def graphMLP(ninputs, nhid, nclass, lr_val, l2coef_val=0.0): - - - def wrapper(i, node, thunk): - if 0: - print i, node - print thunk.inputs - print thunk.outputs - if node.op == nnet_ops.crossentropy_softmax_1hot_with_bias: - print 'here is the nll op' - thunk() #actually compute this piece of the graph - - class G(GraphLearner.Graph, AutoName): - - lr = T.constant(lr_val) - assert l2coef_val == 0.0 - l2coef = T.constant(l2coef_val) - input = T.matrix() # n_examples x n_inputs - target = T.ivector() # len: n_examples - #target = T.matrix() - W2, b2 = T.matrix(), T.vector() - - W1, b1 = T.matrix(), T.vector() - hid = T.tanh(b1 + T.dot(input, W1)) - hid_regularization = l2coef * T.sum(W1*W1) - - params = [W1, b1, W2, b2] - activations = b2 + T.dot(hid, W2) - nll, predictions = nnet_ops.crossentropy_softmax_1hot(activations, target ) - regularization = l2coef * T.sum(W2*W2) + hid_regularization - output_class = T.argmax(activations,1) - loss_01 = T.neq(output_class, target) - #g_params = T.grad(nll + regularization, params) - g_params = T.grad(nll, params) - new_params = [T.sub_inplace(p, lr * gp) for p,gp in zip(params, g_params)] - - - def __eq__(self,other) : - print 'G.__eq__ from graphMLP(), not implemented yet' - return NotImplemented - - - def load(self, algo, f): - """ Load from file the 2 matrices and bias vectors """ - cloase_at_end = False - if isinstance(f,str) : - f = open(f,'r') - close_at_end = True - params = [] - for i in xrange(4): - params.append(filetensor.read(f)) - if close_at_end : - f.close() - return GraphLearner.Model(algo, params) - - def save(self, f, model): - """ Save params to file, so 2 matrices and 2 bias vectors. Same order as iparams. """ - cloase_at_end = False - if isinstance(f,str) : - f = open(f,'w') - close_at_end = True - for p in model.params: - filetensor.write(f,p) - if close_at_end : - f.close() - - - def iparams(self): - """ init params. 
""" - def randsmall(*shape): - return (numpy.random.rand(*shape) -0.5) * 0.001 - return [randsmall(ninputs, nhid) - , randsmall(nhid) - , randsmall(nhid, nclass) - , randsmall(nclass)] - - def train_iter(self, trainset): - return trainset.minibatches(['input', 'target'], - minibatch_size=min(len(trainset), 32), n_batches=2000) - def early_stopper(self): - """ overwrites GraphLearner.graph function """ - return stopper.NStages(300,1) - - return G() - - -import unittest - -class TestMLP(unittest.TestCase): - def blah(self, g): - training_set1 = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 1], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - training_set2 = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 0], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - test_data = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 0], - [1, 1, 1]]), - {'input':slice(2)}) - - learn_algo = GraphLearner(g) - - model1 = learn_algo(training_set1) - - model2 = learn_algo(training_set2) - - omatch = [o1 == o2 for o1, o2 in zip(model1(test_data), - model2(test_data))] - - n_match = sum(omatch) - - self.failUnless(n_match == (numpy.sum(training_set1.fields()['target'] == - training_set2.fields()['target'])), omatch) - - model1.save('/tmp/model1') - - #denoising_aa = GraphLearner(denoising_g) - #model1 = denoising_aa(trainset) - #hidset = model(trainset, fieldnames=['hidden']) - #model2 = denoising_aa(hidset) - - #f = open('blah', 'w') - #for m in model: - # m.save(f) - #filetensor.write(f, initial_classification_weights) - #f.flush() - - #deep_sigmoid_net = GraphLearner(deepnetwork_g) - #deep_model = deep_sigmoid_net.load('blah') - #deep_model.update(trainset) #do some fine tuning - - model1_dup = learn_algo('/tmp/model1') - - - def equiv(self, g0, g1): - training_set1 = dataset.ArrayDataSet(numpy.array([[0, 0, 0], - [0, 1, 1], - [1, 0, 1], - [1, 1, 1]]), - {'input':slice(2),'target':2}) - learn_algo_0 = GraphLearner(g0) - learn_algo_1 = GraphLearner(g1) - - model_0 = learn_algo_0(training_set1) - model_1 = learn_algo_1(training_set1) - - print '----' - for p in zip(model_0.params, model_1.params): - abs_rel_err = theano.gradient.numeric_grad.abs_rel_err(p[0], p[1]) - max_abs_rel_err = numpy.max(abs_rel_err) - if max_abs_rel_err > 1.0e-7: - print 'p0', p[0] - print 'p1', p[1] - #self.failUnless(max_abs_rel_err < 1.0e-7, max_abs_rel_err) - - - def test0(self): self.blah(graphMLP(2, 10, 2, .1)) - def test1(self): self.blah(graphMLP(2, 3, 2, .1)) - -if __name__ == '__main__': - unittest.main() - - diff -r 27b1344a57b1 -r 8fff4bc26f4c nnet_ops.py --- a/nnet_ops.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,721 +0,0 @@ - -import sys -sys.stderr.write("Use theano.tensor.nnet instead of pylearn.nnet_ops.\n") -if 0: - ## This file contain ops that are not currently integrated in the core of threano. - ## Not all of those ops have been thoroughly tested. - - import theano - from theano import tensor, scalar - import numpy - - ############ - # - # SCALAR OPS - # - - class ScalarSigmoid(scalar.UnaryScalarOp): - @staticmethod - def st_impl(x): - if x < -30.0: - return 0.0 - if x > 30.0: - return 1.0 - return 1.0 / (1.0 + numpy.exp(-x)) - def impl(self, x): - return ScalarSigmoid.st_impl(x) - def grad(self, (x,), (gz,)): - y = scalar_sigmoid(x) - return [gz * y * (1.0 - y)] - def c_code(self, node, name, (x,), (z,), sub): - if node.inputs[0].type in [scalar.float32, scalar.float64]: - return """%(z)s = - %(x)s < -30.0 - ? 
0.0 - : %(x)s > 30.0 - ? 1.0 - : 1.0 /(1.0+exp(-%(x)s));""" % locals() - raise NotImplementedError('only floatingpoint is implemented') - scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid') - sigmoid = tensor.Elemwise(scalar_sigmoid, name='sigmoid') - - class ScalarSoftplus(scalar.UnaryScalarOp): - @staticmethod - def static_impl(x): - if x < -30.0: - return 0.0 - if x > 30.0: - return x - return numpy.log1p(numpy.exp(x)) - def impl(self, x): - return ScalarSoftplus.static_impl(x) - def grad(self, (x,), (gz,)): - return [gz * scalar_sigmoid(x)] - def c_code(self, node, name, (x,), (z,), sub): - if node.inputs[0].type in [scalar.float32, scalar.float64]: - return """%(z)s = - %(x)s < -30.0 - ? 0.0 - : %(x)s > 30.0 - ? %(x)s - : log1p(exp(%(x)s));""" % locals() - raise NotImplementedError('only floating point x is implemented') - scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus') - softplus = tensor.Elemwise(scalar_softplus, name='softplus') - - - ############ - # - # TENSOR OPS - # - - - class SoftmaxWithBias(theano.Op): - """ - An L{Op} for the output of neural-net multiclass classifiers. - - @type x: is a matrix of floats (32 or 64) - @type b: is a [row] vector of floats (32 or 64), length is number of cols in x - - This L{Op}'s output is softmax(x+b). - softmax(x[i]) is the i'th distribution over len(x[i]) options. - """ - - nin = 2 - nout = 1 - def __init__(self, **kwargs): - theano.Op.__init__(self, **kwargs) - - def make_node(self, x, b): - x = tensor.as_tensor(x) - b = tensor.as_tensor(b) - if x.type.ndim != 2 \ - or x.type.dtype not in ['float32', 'float64']: - raise ValueError('x must be 2-d tensor of floats') - if b.type.ndim != 1 \ - or x.type.dtype not in ['float32', 'float64']: - raise ValueError('b must be 1-d tensor of floats') - - sm = x.type.make_result() - return theano.Apply(self, [x, b], [sm]) - - def perform(self, node, input_storage, output_storage): - x, b = input_storage - if b.shape[0] != x.shape[1]: - raise ValueError('b must have same number of columns as x') - - sm = numpy.zeros_like(x) - for i in xrange(sm.shape[0]): - row = x[i] + b - sm[i] = numpy.exp(row - numpy.max(row)) - sm[i] *= 1.0 / numpy.sum(sm[i]) - output_storage[0][0] = sm - - def grad(self, (x, b), (g_sm,)): - sm = softmax_with_bias(x, b) - dx = SoftmaxWithBiasDx()(g_sm, sm) - db = tensor.sum(dx, axis = 0) - return dx, db - - def c_headers(self): - return [''] - - @staticmethod - def c_code_template(): - # this implementation was lifted from - # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx - - #TODO: put this into a templated function, in the support code - #TODO: declare the max of each row as an Op output - - #TODO: set error messages for failures in this code - - #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] - init_decl = """ - npy_intp* Nx = %(x)s->dimensions; - - if (%(x)s->nd != 2) - { - PyErr_SetString(PyExc_ValueError, "a not 2d tensor"); - %(fail)s; - } - if (%(b)s->nd != 1) - { - PyErr_SetString(PyExc_ValueError, "b not 1d tensor"); - %(fail)s; - } - if (%(x)s->descr->type_num != PyArray_DOUBLE) - { - PyErr_SetString(PyExc_TypeError, "a not float64"); - %(fail)s; - } - if (%(b)s->descr->type_num != PyArray_DOUBLE) - { - PyErr_SetString(PyExc_TypeError, "b not float64"); - %(fail)s; - } - if ((%(x)s->dimensions[1] != %(b)s->dimensions[0])) - { - PyErr_SetString(PyExc_ValueError, "dimension mismatch in arguments"); - %(fail)s; - } - - if ((NULL == %(sm)s) - || (%(sm)s->dimensions[0] != 
%(x)s->dimensions[0]) - || (%(sm)s->dimensions[1] != %(x)s->dimensions[1])) - { - if (NULL != %(sm)s) Py_XDECREF(%(sm)s); - %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s), type_num_%(x)s); - if(!%(sm)s) { - PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); - %(fail)s - } - } - """ - - begin_row_loop = """ - for (size_t i = 0; i < Nx[0]; ++i) - { - size_t j; - double sum = 0.0; - bool discount_max = false; - - const double* __restrict__ x_i = (double*)(%(x)s->data + %(x)s->strides[0] * i); - const double* __restrict__ b_i = (double*)(%(b)s->data); - double* __restrict__ sm_i = (double*)(%(sm)s->data + %(sm)s->strides[0] * i); - """ - - inside_row_loop = """ - npy_intp Sx = %(x)s->strides[1]/sizeof(double); - npy_intp Sb = %(b)s->strides[0]/sizeof(double); - npy_intp Ssm = %(sm)s->strides[1]/sizeof(double); - - size_t row_max_j=0; - double row_max = x_i[0] + b_i[0]; - // Get the maximum value of the row - for (j = 0; j < Nx[1]; ++j) - { - double row_ij = x_i[j * Sx] + b_i[j * Sb]; - row_max_j = (row_ij > row_max) ? j : row_max_j; - row_max = (row_ij > row_max) ? row_ij : row_max; - } - - for (j = 0; j < Nx[1]; ++j) - { - double row_ij = x_i[j * Sx] + b_i[j * Sb]; - double sm_ij = exp(row_ij - row_max); - sum += sm_ij; - sm_i[j * Ssm] = sm_ij; - } - if ( (0.0 == sum) || (isinf(sum))) - { - //that was our best... - %(fail)s; - } - - //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n); - double sum_inv = 1.0 / sum; - for (j = 0; j < Nx[1]; ++j) - { - sm_i[j * Ssm] *= sum_inv; - } - - """ - - end_row_loop = """ - } - """ - - return (init_decl, begin_row_loop, inside_row_loop, end_row_loop) - - - def c_code(self, node, name, (x, b), (sm,), sub): - code_template = ''.join(self.c_code_template()) - return code_template % dict(locals(), **sub) - - softmax_with_bias = SoftmaxWithBias() - - - class SoftmaxWithBiasDx(theano.Op): - nin = 2 - nout = 1 - """Gradient wrt x of the SoftmaxWithBias Op""" - - def __init__(self, **kwargs): - theano.Op.__init__(self, **kwargs) - - def make_node(self, dy, sm, **kwargs): - dy = tensor.as_tensor(dy) - sm = tensor.as_tensor(sm) - return theano.Apply(self, [dy, sm], [sm.type.make_result()]) - - def perform(self, node, input_storage, output_storage): - dy, sm = input_storage - dx = numpy.zeros_like(sm) - #dx[i,j] = - (\sum_k dy[i,k] sm[i,k]) sm[i,j] + dy[i,j] sm[i,j] - for i in xrange(sm.shape[0]): - dy_times_sm_i = dy[i] * sm[i] - dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i] - output_storage[0][0] = dx - - def grad(self, *args): - raise NotImplementedError() - - def c_code(self, node, name, (dy, sm), (dx,), sub): - return ''' - if ((%(dy)s->descr->type_num != PyArray_DOUBLE) - || (%(sm)s->descr->type_num != PyArray_DOUBLE)) - { - PyErr_SetString(PyExc_TypeError, "types should be float64, float64"); - %(fail)s; - } - if ((%(dy)s->nd != 2) - || (%(sm)s->nd != 2)) - { - PyErr_SetString(PyExc_ValueError, "rank error"); - %(fail)s; - } - if (%(dy)s->dimensions[0] != %(sm)s->dimensions[0]) - { - PyErr_SetString(PyExc_ValueError, "dimension mismatch"); - %(fail)s; - } - if ((NULL == %(dx)s) - || (%(dx)s->dimensions[0] != %(sm)s->dimensions[0]) - || (%(dx)s->dimensions[1] != %(sm)s->dimensions[1])) - { - Py_XDECREF(%(dx)s); - %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2, PyArray_DIMS(%(sm)s), - type_num_%(sm)s); - if (!%(dx)s) - { - PyErr_SetString(PyExc_MemoryError, "failed to alloc dx output"); - %(fail)s; - } - } - - for (size_t i = 0; i < %(dx)s->dimensions[0]; ++i) - { - const double* __restrict__ dy_i = (double*) (%(dy)s->data + 
%(dy)s->strides[0] * i); - npy_intp Sdy = %(dy)s->strides[1]/sizeof(double); - const double* __restrict__ sm_i = (double*) (%(sm)s->data + %(sm)s->strides[0] * i); - npy_intp Ssm = %(sm)s->strides[1]/sizeof(double); - double* __restrict__ dx_i = (double*) (%(dx)s->data + %(dx)s->strides[0] * i); - npy_intp Sdx = %(dx)s->strides[1]/sizeof(double); - - double sum_dy_times_sm = 0.; - for (size_t j = 0; j < %(dx)s->dimensions[1]; ++j) - { - dx_i[j * Sdx] = dy_i[j * Sdy] * sm_i[j * Ssm]; - sum_dy_times_sm += dx_i[j * Sdx]; - } - for (size_t j = 0; j < %(dx)s->dimensions[1]; ++j) - { - dx_i[j * Sdx] -= sum_dy_times_sm * sm_i[j * Ssm]; - } - } - ''' % dict(locals(), **sub) - - def softmax(x, **kwargs): - b = tensor.zeros_like(x[0,:]) - return softmax_with_bias(x, b, **kwargs) - - - class CrossentropySoftmaxArgmax1HotWithBias(theano.Op): - """A special compound L{Op} for the output of neural-net classifiers. - - @type x: is a matrix of floats (32 or 64) - @type b: is a [row] vector of floats (32 or 64), length is number of cols in x - @type y_idx: a [column] vector of int (32 or 64), length is number of rows in x - - @precondition: every entry in y_idx is a valid (non-negative) column index into x - - This L{Op} has three outputs: - - KL(softmax(x+b), y) - - softmax(x+b) - - argmax(x+b) - - softmax(x[i]) is the i'th distribution over len(x[i]) options - argmax(x) is the index of x's greatest element - y_idx[i] is an integer index, encoding a 1-hot distribution. - - In practice, when we're trying to do classification, we have one row in x - and y_idx per example, and y[i] is the index of the (correct) class of the - i'th example. - - """ - nin=3 - nout=3 - def __init__(self, **kwargs): - theano.Op.__init__(self, **kwargs) - - def make_node(self, x, b, y_idx): - x = tensor.as_tensor(x) - b = tensor.as_tensor(b) - y_idx = tensor.as_tensor(y_idx) - if x.type.ndim != 2 \ - or x.type.dtype not in ['float32', 'float64']: - raise ValueError('x must be 2-d tensor of floats') - if b.type.ndim != 1 \ - or x.type.dtype not in ['float32', 'float64']: - raise ValueError('b must be 1-d tensor of floats') - if y_idx.type.ndim != 1 \ - or y_idx.type.dtype not in ['int8', 'int16', 'int32', 'int64']: - raise ValueError('y_idx must be 1-d tensor of ints') - - # TODO: Is this correct? It used to be y, not y_idx - nll = tensor.Tensor(x.type.dtype, - y_idx.type.broadcastable).make_result() - # nll = Tensor(x.dtype, y.broadcastable) - sm = x.type.make_result() - am = y_idx.type.make_result() - return theano.Apply(self, [x, b, y_idx], [nll, sm, am]) - def perform(self, node, input_storage, output_storage): - """ - The math, where x is an input vector, and t is a target index: - - softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j])) - nll(x,t) = -log(softmax(x)[t]) - - We compute this by subtracting off the max of x. This avoids numerical instability. 
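(Illustrative aside, not part of the original Op's docstring: the derivation spelled out below, written in plain NumPy for a single row x and target index t.)

    import numpy
    x = numpy.array([1.0, 2.0, 3.0])
    t = 2
    m = x.max()
    e = numpy.exp(x - m)
    sm = e / e.sum()                          # stabilized softmax(x)
    nll = -x[t] + m + numpy.log(e.sum())      # equals -log(sm[t])
    assert numpy.allclose(nll, -numpy.log(sm[t]))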
- - m = max_j x[j] - softmax(x)[i] = exp(x[i] -m) / sum_j(exp(x[j] - m)) - - nll = -log(exp(x[t] -m) / sum_j(exp(x[j] - m))) - = -x[t] + m + log( sum_j(exp(x[j] - m))) - - """ - x, b, y_idx = input_storage - if b.shape[0] != x.shape[1]: - raise ValueError('b must have same number of columns as x') - if y_idx.shape[0] != x.shape[0]: - raise ValueError('y_idx must have same number of rows as x') - - sm = numpy.zeros_like(x) # softmax - nll = numpy.zeros(x.shape[0]) #nll(y | softmax(x)) - am = numpy.zeros_like(y_idx) - for i in xrange(sm.shape[0]): - #add the bias vector to the i'th row of x - row = x[i] + b - - #get the maximum value of i'th row for numerically safe softmax / nll - am[i] = numpy.argmax(row) - m = row[am[i]] - - #compute the unnormalized softmax, and normalization constant - sm[i] = numpy.exp(row - m) - sum_j = numpy.sum(sm[i]) # sum_j(exp(x[j] - m)) - - #normalized our softmax - sm[i] *= 1.0 / sum_j - - # store the nll - nll[i] = -row[y_idx[i]] + m + numpy.log(sum_j) - - output_storage[0][0] = nll - output_storage[1][0] = sm - output_storage[2][0] = am - def grad(self, (x, b, y_idx), (g_nll, g_sm, g_am)): - if g_sm is not None or g_am is not None: - raise NotImplementedError() - nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx) - dx = CrossentropySoftmax1HotWithBiasDx()(g_nll, sm, y_idx) - db = tensor.sum(dx, axis = [0]) - return dx, db, None - - def c_headers(self): return [''] - - @staticmethod - def c_code_template(): - # this implementation was lifted from - # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx - - #TODO: put this into a templated function, in the support code - #TODO: declare the max of each row as an Op output - - #TODO: set error messages for failures in this code - - #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] - (init_decl, begin_row_loop, inside_row_loop, end_row_loop) = \ - SoftmaxWithBias.c_code_template() - return (init_decl, - """ - if (%(y_idx)s->nd != 1) - { - PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor"); - %(fail)s; - } - if ((%(y_idx)s->descr->type_num != PyArray_INT64) - && (%(y_idx)s->descr->type_num != PyArray_INT32) - && (%(y_idx)s->descr->type_num != PyArray_INT16) - && (%(y_idx)s->descr->type_num != PyArray_INT8)) - { - PyErr_SetString(PyExc_TypeError, "y_idx not int8, int16, int32, or int64"); - %(fail)s; - } - if (%(x)s->dimensions[0] != %(y_idx)s->dimensions[0]) - { - PyErr_SetString(PyExc_ValueError, "dimension mismatch in arguments"); - %(fail)s; - } - - if ((NULL == %(nll)s) //initial condition - || (%(nll)s->dimensions[0] != %(y_idx)s->dimensions[0])) - { - if (NULL != %(nll)s) Py_XDECREF(%(nll)s); - %(nll)s = (PyArrayObject*)PyArray_SimpleNew(1, PyArray_DIMS(%(y_idx)s), type_num_%(x)s); - if(!%(nll)s) - { - PyErr_SetString(PyExc_MemoryError, "failed to alloc nll output"); - %(fail)s; - } - } - if ((NULL == %(am)s) - || (%(am)s->dimensions[0] != %(y_idx)s->dimensions[0])) - { - Py_XDECREF(%(am)s); - %(am)s = (PyArrayObject*) PyArray_SimpleNew(1, PyArray_DIMS(%(y_idx)s), type_num_%(y_idx)s); - if(!%(am)s) - { - PyErr_SetString(PyExc_MemoryError, "failed to alloc am output"); - %(fail)s; - } - } - """, - begin_row_loop, - """ - const %(y_idx_type)s y_i = ((%(y_idx_type)s*)(%(y_idx)s->data + %(y_idx)s->strides[0] * i))[0]; - double* __restrict__ nll_i = (double*)(%(nll)s->data + %(nll)s->strides[0] * i); - %(am_type)s* __restrict__ am_i = (%(am_type)s*) (%(am)s->data + %(am)s->strides[0] * i); - """, - inside_row_loop, - """ - nll_i[0] = - x_i[y_i*Sx] - - b_i[y_i*Sb] - + row_max - + 
log(sum); - am_i[0] = row_max_j; - """, - end_row_loop) - - - def c_code(self, node, name, (x, b, y_idx), (nll, sm, am), sub): - y_idx_type = node.inputs[2].type.dtype_specs()[1] - am_type = y_idx_type - code_template = ''.join(self.c_code_template()) - return code_template % dict(locals(), **sub) - - class CrossentropySoftmax1HotWithBiasDx (theano.Op): - nin=3 - nout=1 - """Gradient wrt x of the CrossentropySoftmax1Hot Op""" - def __init__(self, **kwargs): - theano.Op.__init__(self,**kwargs) - def make_node(self, dy, sm, y_idx,**kwargs): - dy = tensor.as_tensor(dy) - sm = tensor.as_tensor(sm) - y_idx = tensor.as_tensor(y_idx) - return theano.Apply(self, [dy, sm, y_idx],[sm.type.make_result()]) - def perform(self, node, input_storage, output_storage): - dy,sm,y_idx = input_storage - dx = numpy.zeros_like(sm) - for i in xrange(sm.shape[0]): - dx[i] = dy[i] * sm[i] #vector scale - dx[i, y_idx[i]] -= dy[i] #scalar decrement - output_storage[0][0] = dx - def grad(self, *args): - raise NotImplementedError() - def c_code(self, node, name, (dnll, sm, y_idx), (dx,), sub): - y_idx_type = node.inputs[2].type.dtype_specs()[1] - return """ - - if ((%(dnll)s->descr->type_num != PyArray_DOUBLE) - || (%(sm)s->descr->type_num != PyArray_DOUBLE) - ) - { - PyErr_SetString(PyExc_TypeError, "types should be float64, float64, int64"); - %(fail)s; - } - if ((%(y_idx)s->descr->type_num != PyArray_INT64) - && (%(y_idx)s->descr->type_num != PyArray_INT32) - && (%(y_idx)s->descr->type_num != PyArray_INT16) - && (%(y_idx)s->descr->type_num != PyArray_INT8)) - { - PyErr_SetString(PyExc_TypeError, "y_idx not int8, int16, int32, or int64"); - %(fail)s; - } - if ((%(dnll)s->nd != 1) - || (%(sm)s->nd != 2) - || (%(y_idx)s->nd != 1)) - { - PyErr_SetString(PyExc_ValueError, "rank error"); - %(fail)s; - } - if ((%(dnll)s->dimensions[0] != %(sm)s->dimensions[0]) - || (%(dnll)s->dimensions[0] != %(y_idx)s->dimensions[0])) - { - PyErr_SetString(PyExc_ValueError, "dimension mismatch"); - %(fail)s; - } - if ((NULL == %(dx)s) - || (%(dx)s->dimensions[0] != %(sm)s->dimensions[0]) - || (%(dx)s->dimensions[1] != %(sm)s->dimensions[1])) - { - if (NULL != %(dx)s) Py_XDECREF(%(dx)s); - %(dx)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(sm)s), type_num_%(sm)s); - if(!%(dx)s) { - PyErr_SetString(PyExc_MemoryError, "failed to alloc dx output"); - %(fail)s - } - } - - for (size_t i = 0; i < %(dx)s->dimensions[0]; ++i) - { - const double dnll_i = ((double*)(%(dnll)s->data + %(dnll)s->strides[0] * i))[0]; - - const %(y_idx_type)s y_i = ((%(y_idx_type)s*)(%(y_idx)s->data + %(y_idx)s->strides[0] * i))[0]; - - const double* __restrict__ sm_i = (double*)(%(sm)s->data + %(sm)s->strides[0] * i); - npy_intp Ssm = %(sm)s->strides[1]/sizeof(double); - - double* __restrict__ dx_i = (double*)(%(dx)s->data + %(dx)s->strides[0] * i); - npy_intp Sdx = %(dx)s->strides[1]/sizeof(double); - - for (size_t j = 0; j < %(dx)s->dimensions[1]; ++j) - { - dx_i[j * Sdx] = dnll_i * sm_i[j * Ssm]; - } - if (y_i >= %(dx)s->dimensions[1]) - { - %(fail)s; - } - dx_i[y_i * Sdx] -= dnll_i; - } - """ % dict(locals(), **sub) - - crossentropy_softmax_argmax_1hot_with_bias = \ - CrossentropySoftmaxArgmax1HotWithBias() - - def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs): - return crossentropy_softmax_argmax_1hot_with_bias(x, b, y_idx, **kwargs)[0:2] - - def crossentropy_softmax_1hot(x, y_idx, **kwargs): - b = tensor.zeros_like(x[0,:]) - return crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs) - - - class MultinomialCrossentropy1Hot(theano.Op): 
- pass - - - def binary_crossentropy(output, target): - """ - Compute the crossentropy of binary output wrt binary target. - @note: We do not sum, crossentropy is computed by component. - @todo: Rewrite as a scalar, and then broadcast to tensor. - @todo: This is essentially duplicated as cost.cross_entropy - @warning: OUTPUT and TARGET are reversed in cost.cross_entropy - """ - return -(target * tensor.log(output) + (1 - target) * tensor.log(1 - output)) - - - - class Prepend_scalar_constant_to_each_row(theano.Op): - def __init__(self, val = 0): - if isinstance(val, float): - val = scalar.constant(val) - self.val = val - - def make_node(self, mat): - #check type of input - if not isinstance(mat,theano.Result) or not mat.type==tensor.matrix().type: - raise TypeError("Expected a matrix as input") - x = tensor.as_tensor(mat) - y = tensor.as_tensor(self.val) - if x.type.dtype != y.type.dtype: - TypeError("the value to prepend don't have the same type as the matrix") - - node = theano.Apply(op=self, inputs=[mat], outputs=[tensor.matrix()]) - return node - - def perform(self, node, (mat, ), (output, )): - new_shape=(mat.shape[0],mat.shape[1]+1) - if output[0] == None: - output[0]=numpy.empty(new_shape,dtype=mat.dtype) - out=output[0] - else: - if output[0].shape!=new_shape: - try: - output[0].resize(new_shape) - except: - output[0]=numpy.empty(new_shape, dtype=mat.dtype) - out=output[0] - - out[:,0].fill(self.val.data) - out[:,1:]=mat - - def grad(self, (mat,), (goutput,)): - return goutput[:,1:] - - class Prepend_scalar_to_each_row(theano.Op): - def make_node(self, val, mat): - #check type of input - if isinstance(val, float): - val = scalar.constant(val) - if not isinstance(mat,theano.Result) or not mat.type==tensor.matrix().type: - raise TypeError("Expected a matrix as input") - x = tensor.as_tensor(mat) - y = tensor.as_tensor(val) - if x.type.dtype != y.type.dtype: - TypeError("the value to prepend don't have the same type as the matrix") - - node = theano.Apply(op=self, inputs=[val,mat], outputs=[tensor.matrix()]) - return node - - def perform(self, node, (val,mat), (output, )): - new_shape=(mat.shape[0],mat.shape[1]+1) - if output[0] == None: - output[0]=numpy.empty(new_shape,dtype=mat.dtype) - out=output[0] - else: - if output[0].shape!=new_shape: - try: - output[0].resize(new_shape) - except: - output[0]=numpy.empty(new_shape, dtype=mat.dtype) - out=output[0] - out[:,0].fill(val) - out[:,1:]=mat - - def grad(self, (val, mat), (goutput,)): - return goutput[:,0], goutput[:,1:] - - prepend_scalar_to_each_row = Prepend_scalar_to_each_row() - prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.) - prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.) - - class solve(theano.Op): - """ - Find the solution to the linear equation Ax=b, - where A is a 2d matrix and b is a 1d or 2d matrix. - It use numpy.solve to find the solution. 
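(Aside, not part of the original op: the standard NumPy routine for this is numpy.linalg.solve; a minimal sketch of the computation perform() is meant to carry out.)

    import numpy
    A = numpy.array([[3.0, 1.0],
                     [1.0, 2.0]])
    b = numpy.array([9.0, 8.0])
    x = numpy.linalg.solve(A, b)              # solves A x = b
    assert numpy.allclose(numpy.dot(A, x), b)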
- """ - - def make_node(self, A, b): - if not isinstance(A, theano.Result) or not A.type==tensor.matrix().type: - raise TypeError("We expected that A had a matrix type") - if not isinstance(B, theano.Result) or not B.type==tensor.matrix().type: - raise TypeError("We expected that B had a matrix type") - - node = theano.Apply(op=self, inputs=[A, B], outputs=[tensor.matrix()]) - return node - - def perform(self, node, (A, B), (output, )): - ret=numpy.solve(A,B) - output[0]=ret - - def grad(self, (theta, A, B), (gtheta,)): - raise NotImplementedError() - - diff -r 27b1344a57b1 -r 8fff4bc26f4c noise.py --- a/noise.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -def binomial(input, rstate, p = 0.75): - """ - Op to corrupt an input with binomial noise. - Generate a noise vector of 1's and 0's (1 with probability p). - We multiply this by the input. - - @note: See U{ssh://projects@lgcm.iro.umontreal.ca/repos/denoising_aa} - to see how rstate is used. - """ - noise = rstate.gen_like(('binomial',{'p': p, 'n': 1}), input) - noise.name = 'noise' - return noise * input - diff -r 27b1344a57b1 -r 8fff4bc26f4c onehotop.py --- a/onehotop.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -""" -One hot Op -""" - -#from theano import tensor -from theano.tensor import as_tensor, Tensor -from theano.gof import op -from theano.gof.graph import Apply - -import numpy - -class OneHot(op.Op): - """ - Construct a one-hot vector, x out of y. - - @todo: Document inputs and outputs - @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64! - @todo: Use 'bool' as output dtype, not 'int64' ? - @todo: Allow this to operate on column vectors (Tensor) - @todo: Describe better. - """ - - def make_node(self, x, y): - """ - @type x: Vector L{Tensor} of integers - @param x: The entries of the one-hot vector to be one. - @type y: Integer scalar L{Tensor} - @param y: The length (#columns) of the one-hot vectors. - @return: A L{Tensor} of one-hot vectors - - @precondition: x < y for all entries of x - @todo: Check that x and y are int types - """ - x = as_tensor(x) - y = as_tensor(y) - #assert x.dtype[0:3] == "int" - #assert y.dtype[0:3] == "int" - inputs = [x, y] - ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])] - #outputs = [tensor.Tensor("float64", broadcastable=[False, False])] - #outputs = [Tensor("int64", broadcastable=[False, False])] - outputs = [Tensor("float64", broadcastable=[False, False]).make_result()] - node = Apply(op = self, inputs = inputs, outputs = outputs) - return node - - def perform(self, node, (x, y), (out, )): - assert x.dtype == "int64" or x.dtype == "int32" - assert x.ndim == 1 - assert y.dtype == "int64" or x.dtype == "int32" - assert y.ndim == 0 - out[0] = numpy.zeros((x.shape[0], y), dtype="float64") - for c in range(x.shape[0]): - assert x[c] < y - out[0][c, x[c]] = 1 - - def grad(self, (x, y), (out_gradient, )): - return None, None -one_hot = OneHot() diff -r 27b1344a57b1 -r 8fff4bc26f4c onehotop.py.scalar --- a/onehotop.py.scalar Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -""" -One hot Op -""" - -#from theano import tensor -from theano.tensor import as_tensor, Tensor -#from theano import scalar -from theano.scalar import as_scalar -from theano.gof import op -from theano.gof.graph import Apply - -import numpy - -class OneHot(op.Op): - """ - Construct a one-hot vector, x out of y. 
- - @todo: Document inputs and outputs - @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64! - @todo: Use 'bool' as output dtype, not 'int64' ? - @todo: Allow this to operate on column vectors (Tensor) - @todo: Describe better. - @todo: What type is y? - @todo: What about operating on L{Scalar}s? - """ - - def make_node(self, x, y): - """ - @type x: Vector L{Tensor} of integers - @param x: The entries of the one-hot vector to be one. - @type y: Integer L{Scalar} - @param y: The length (#columns) of the one-hot vectors. - @return: A L{Tensor} of one-hot vectors - - @precondition: x < y for all entries of x - @todo: Check that x and y are int types - """ - #x = tensor.as_tensor(x) - #y = scalar.as_scalar(y) - x = as_tensor(x) - y = as_scalar(y) - #assert x.dtype[0:3] == "int" - #assert y.dtype[0:3] == "int" - inputs = [x, y] - ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])] - #outputs = [tensor.Tensor("float64", broadcastable=[False, False])] - #outputs = [Tensor("int64", broadcastable=[False, False])] - outputs = [Tensor("float64", broadcastable=[False, False]).make_result()] - node = Apply(op = self, inputs = inputs, outputs = outputs) - return node - - def perform(self, node, (x, y), (out, )): - assert x.dtype == "int64" - assert type(y) == numpy.int64 - assert x.ndim == 1 - #out = numpy.zeros((x.shape[0], y), dtype="int64") - out[0] = numpy.zeros((x.shape[0], y), dtype="float64") - for c in range(x.shape[0]): - assert x[c] < y - out[0][c, x[c]] = 1 - - def grad(self, (x, y), (out_gradient, )): - return None, None -one_hot = OneHot() diff -r 27b1344a57b1 -r 8fff4bc26f4c pmat.py --- a/pmat.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,526 +0,0 @@ -## Automatically adapted for numpy.numarray Jun 13, 2007 by python_numarray_to_numpy (-xsm) - -# PMat.py -# Copyright (C) 2005 Pascal Vincent -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. The name of the authors may not be used to endorse or promote -# products derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR -# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN -# NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# This file is part of the PLearn library. 
For more information on the PLearn -# library, go to the PLearn Web site at www.plearn.org - - -# Author: Pascal Vincent - -#import numarray, sys, os, os.path -import numpy.numarray, sys, os, os.path -import fpconst - -def array_columns( a, cols ): - indices = None - if isinstance( cols, int ): - indices = [ cols ] - elif isinstance( cols, slice ): - #print cols - indices = range( *cols.indices(cols.stop) ) - else: - indices = list( cols ) - - return numpy.numarray.take(a, indices, axis=1) - -def load_pmat_as_array(fname): - s = file(fname,'rb').read() - formatstr = s[0:64] - datastr = s[64:] - structuretype, l, w, data_type, endianness = formatstr.split() - - if data_type=='DOUBLE': - elemtype = 'd' - elif data_type=='FLOAT': - elemtype = 'f' - else: - raise ValueError('Invalid data type in file header: '+data_type) - - if endianness=='LITTLE_ENDIAN': - byteorder = 'little' - elif endianness=='BIG_ENDIAN': - byteorder = 'big' - else: - raise ValueError('Invalid endianness in file header: '+endianness) - - l = int(l) - w = int(w) - X = numpy.numarray.fromstring(datastr,elemtype, shape=(l,w) ) - if byteorder!=sys.byteorder: - X.byteswap(True) - return X - -def load_pmat_as_array_dataset(fname): - import dataset,lookup_list - - #load the pmat as array - a=load_pmat_as_array(fname) - - #load the fieldnames - fieldnames = [] - fieldnamefile = os.path.join(fname+'.metadata','fieldnames') - if os.path.isfile(fieldnamefile): - f = open(fieldnamefile) - for row in f: - row = row.split() - if len(row)>0: - fieldnames.append(row[0]) - f.close() - else: - self.fieldnames = [ "field_"+str(i) for i in range(a.shape[1]) ] - - return dataset.ArrayDataSet(a,lookup_list.LookupList(fieldnames,[x for x in range(a.shape[1])])) - -def load_amat_as_array_dataset(fname): - import dataset,lookup_list - - #load the amat as array - (a,fieldnames)=readAMat(fname) - - #load the fieldnames - if len(fieldnames)==0: - self.fieldnames = [ "field_"+str(i) for i in range(a.shape[1]) ] - - return dataset.ArrayDataSet(a,lookup_list.LookupList(fieldnames,[x for x in range(a.shape[1])])) - -def save_array_dataset_as_pmat(fname,ds): - ar=ds.data - save_array_as_pmat(fname,ar,ds.fieldNames()) - -def save_array_as_pmat( fname, ar, fieldnames=[] ): - s = file(fname,'wb') - - length, width = ar.shape - if fieldnames: - assert len(fieldnames) == width - metadatadir = fname+'.metadata' - if not os.path.isdir(metadatadir): - os.mkdir(metadatadir) - fieldnamefile = os.path.join(metadatadir,'fieldnames') - f = open(fieldnamefile,'wb') - for name in fieldnames: - f.write(name+'\t0\n') - f.close() - - header = 'MATRIX ' + str(length) + ' ' + str(width) + ' ' - if ar.dtype.char=='d': - header += 'DOUBLE ' - elemsize = 8 - - elif ar.dtype.char=='f': - header += 'FLOAT ' - elemsize = 4 - - else: - raise TypeError('Unsupported typecode: %s' % ar.dtype.char) - - rowsize = elemsize*width - - if sys.byteorder=='little': - header += 'LITTLE_ENDIAN ' - elif sys.byteorder=='big': - header += 'BIG_ENDIAN ' - else: - raise TypeError('Unsupported sys.byteorder: '+repr(sys.byteorder)) - - header += ' '*(63-len(header))+'\n' - s.write( header ) - s.write( ar.tostring() ) - s.close() - - -####### Iterators ########################################################### - -class VMatIt: - def __init__(self, vmat): - self.vmat = vmat - self.cur_row = 0 - - def __iter__(self): - return self - - def next(self): - if self.cur_row==self.vmat.length: - raise StopIteration - row = self.vmat.getRow(self.cur_row) - self.cur_row += 1 - return row - -class ColumnIt: - 
def __init__(self, vmat, col): - self.vmat = vmat - self.col = col - self.cur_row = 0 - - def __iter__(self): - return self - - def next(self): - if self.cur_row==self.vmat.length: - raise StopIteration - val = self.vmat[self.cur_row, self.col] - self.cur_row += 1 - return val - -####### VMat classes ######################################################## - -class VMat: - def __iter__(self): - return VMatIt(self) - - def __getitem__( self, key ): - if isinstance( key, slice ): - start, stop, step = key.start, key.stop, key.step - if step!=None: - raise IndexError('Extended slice with step not currently supported') - - if start is None: - start = 0 - - l = self.length - if stop is None or stop > l: - stop = l - - return self.getRows(start,stop-start) - - elif isinstance( key, tuple ): - # Basically returns a SubVMatrix - assert len(key) == 2 - rows = self.__getitem__( key[0] ) - - shape = rows.shape - if len(shape) == 1: - return rows[ key[1] ] - - cols = key[1] - if isinstance(cols, slice): - start, stop, step = cols.start, cols.stop, cols.step - if start is None: - start = 0 - - if stop is None: - stop = self.width - elif stop < 0: - stop = self.width+stop - - cols = slice(start, stop, step) - - return array_columns(rows, cols) - - elif isinstance( key, str ): - # The key is considered to be a fieldname and a column is - # returned. - try: - return array_columns( self.getRows(0,self.length), - self.fieldnames.index(key) ) - except ValueError: - print >>sys.stderr, "Key is '%s' while fieldnames are:" % key - print >>sys.stderr, self.fieldnames - raise - - else: - if key<0: key+=self.length - return self.getRow(key) - - def getFieldIndex(self, fieldname): - try: - return self.fieldnames.index(fieldname) - except ValueError: - raise ValueError( "VMat has no field named %s. 
Field names: %s" - %(fieldname, ','.join(self.fieldnames)) ) - -class PMat( VMat ): - - def __init__(self, fname, openmode='r', fieldnames=[], elemtype='d', - inputsize=-1, targetsize=-1, weightsize=-1, array = None): - self.fname = fname - self.inputsize = inputsize - self.targetsize = targetsize - self.weightsize = weightsize - if openmode=='r': - self.f = open(fname,'rb') - self.read_and_parse_header() - self.load_fieldnames() - - elif openmode=='w': - self.f = open(fname,'w+b') - self.fieldnames = fieldnames - self.save_fieldnames() - self.length = 0 - self.width = len(fieldnames) - self.elemtype = elemtype - self.swap_bytes = False - self.write_header() - - elif openmode=='a': - self.f = open(fname,'r+b') - self.read_and_parse_header() - self.load_fieldnames() - - else: - raise ValueError("Currently only supported openmodes are 'r', 'w' and 'a': "+repr(openmode)+" is not supported") - - if array is not None: - shape = array.shape - if len(shape) == 1: - row_format = lambda r: [ r ] - elif len(shape) == 2: - row_format = lambda r: r - - for row in array: - self.appendRow( row_format(row) ) - - def __del__(self): - self.close() - - def write_header(self): - header = 'MATRIX ' + str(self.length) + ' ' + str(self.width) + ' ' - - if self.elemtype=='d': - header += 'DOUBLE ' - self.elemsize = 8 - elif self.elemtype=='f': - header += 'FLOAT ' - self.elemsize = 4 - else: - raise TypeError('Unsupported elemtype: '+repr(elemtype)) - self.rowsize = self.elemsize*self.width - - if sys.byteorder=='little': - header += 'LITTLE_ENDIAN ' - elif sys.byteorder=='big': - header += 'BIG_ENDIAN ' - else: - raise TypeError('Unsupported sys.byteorder: '+repr(sys.byteorder)) - - header += ' '*(63-len(header))+'\n' - - self.f.seek(0) - self.f.write(header) - - def read_and_parse_header(self): - header = self.f.read(64) - mat_type, l, w, data_type, endianness = header.split() - if mat_type!='MATRIX': - raise ValueError('Invalid file header (should start with MATRIX)') - self.length = int(l) - self.width = int(w) - if endianness=='LITTLE_ENDIAN': - byteorder = 'little' - elif endianness=='BIG_ENDIAN': - byteorder = 'big' - else: - raise ValueError('Invalid endianness in file header: '+endianness) - self.swap_bytes = (byteorder!=sys.byteorder) - - if data_type=='DOUBLE': - self.elemtype = 'd' - self.elemsize = 8 - elif data_type=='FLOAT': - self.elemtype = 'f' - self.elemsize = 4 - else: - raise ValueError('Invalid data type in file header: '+data_type) - self.rowsize = self.elemsize*self.width - - def load_fieldnames(self): - self.fieldnames = [] - fieldnamefile = os.path.join(self.fname+'.metadata','fieldnames') - if os.path.isfile(fieldnamefile): - f = open(fieldnamefile) - for row in f: - row = row.split() - if len(row)>0: - self.fieldnames.append(row[0]) - f.close() - else: - self.fieldnames = [ "field_"+str(i) for i in range(self.width) ] - - def save_fieldnames(self): - metadatadir = self.fname+'.metadata' - if not os.path.isdir(metadatadir): - os.mkdir(metadatadir) - fieldnamefile = os.path.join(metadatadir,'fieldnames') - f = open(fieldnamefile,'wb') - for name in self.fieldnames: - f.write(name+'\t0\n') - f.close() - - def getRow(self,i): - if i<0 or i>=self.length: - raise IndexError('PMat index out of range') - self.f.seek(64+i*self.rowsize) - data = self.f.read(self.rowsize) - ar = numpy.numarray.fromstring(data, self.elemtype, (self.width,)) - if self.swap_bytes: - ar.byteswap(True) - return ar - - def getRows(self,i,l): - if i<0 or l<0 or i+l>self.length: - raise IndexError('PMat index out of 
range') - self.f.seek(64+i*self.rowsize) - data = self.f.read(l*self.rowsize) - ar = numpy.numarray.fromstring(data, self.elemtype, (l,self.width)) - if self.swap_bytes: - ar.byteswap(True) - return ar - - def checkzerorow(self,i): - if i<0 or i>self.length: - raise IndexError('PMat index out of range') - self.f.seek(64+i*self.rowsize) - data = self.f.read(self.rowsize) - ar = numpy.numarray.fromstring(data, self.elemtype, (len(data)/self.elemsize,)) - if self.swap_bytes: - ar.byteswap(True) - for elem in ar: - if elem!=0: - return False - return True - - def putRow(self,i,row): - if i<0 or i>=self.length: - raise IndexError('PMat index out of range') - if len(row)!=self.width: - raise TypeError('length of row ('+str(len(row))+ ') differs from matrix width ('+str(self.width)+')') - if i<0 or i>=self.length: - raise IndexError - if self.swap_bytes: # must make a copy and swap bytes - ar = numpy.numarray.numarray(row,type=self.elemtype) - ar.byteswap(True) - else: # asarray makes a copy if not already a numarray of the right type - ar = numpy.numarray.asarray(row,type=self.elemtype) - self.f.seek(64+i*self.rowsize) - self.f.write(ar.tostring()) - - def appendRow(self,row): - if len(row)!=self.width: - raise TypeError('length of row ('+str(len(row))+ ') differs from matrix width ('+str(self.width)+')') - if self.swap_bytes: # must make a copy and swap bytes - ar = numpy.numarray.numarray(row,type=self.elemtype) - ar.byteswap(True) - else: # asarray makes a copy if not already a numarray of the right type - ar = numpy.numarray.asarray(row,type=self.elemtype) - - self.f.seek(64+self.length*self.rowsize) - self.f.write(ar.tostring()) - self.length += 1 - self.write_header() # update length in header - - def flush(self): - self.f.flush() - - def close(self): - if hasattr(self, 'f'): - self.f.close() - - def append(self,row): - self.appendRow(row) - - def __setitem__(self, i, row): - l = self.length - if i<0: i+=l - self.putRow(i,row) - - def __len__(self): - return self.length - - - -#copied from PLEARNDIR:python_modules/plearn/vmat/readAMat.py -def safefloat(str): - """Convert the given string to its float value. It is 'safe' in the sense - that missing values ('nan') will be properly converted to the corresponding - float value under all platforms, contrarily to 'float(str)'. - """ - if str.lower() == 'nan': - return fpconst.NaN - else: - return float(str) - -#copied from PLEARNDIR:python_modules/plearn/vmat/readAMat.py -def readAMat(amatname): - """Read a PLearn .amat file and return it as a numarray Array. - - Return a tuple, with as the first argument the array itself, and as - the second argument the fieldnames (list of strings). - """ - ### NOTE: this version is much faster than first creating the array and - ### updating each row as it is read... Bizarrely enough - f = open(amatname) - a = [] - fieldnames = [] - for line in f: - if line.startswith("#size:"): - (length,width) = line[6:].strip().split() - elif line.startswith("#sizes:"): # ignore input/target/weight/extra sizes - continue - - elif line.startswith("#:"): - fieldnames = line[2:].strip().split() - pass - elif not line.startswith('#'): - # Add all non-comment lines. 
- row = [ safefloat(x) for x in line.strip().split() ] - if row: - a.append(row) - - f.close() - return numpy.numarray.array(a), fieldnames - - -if __name__ == '__main__': - pmat = PMat( 'tmp.pmat', 'w', fieldnames=['F1', 'F2'] ) - pmat.append( [1, 2] ) - pmat.append( [3, 4] ) - pmat.close() - - pmat = PMat( 'tmp.pmat', 'r' ) - ar=load_pmat_as_array('tmp.pmat') - ds=load_pmat_as_array_dataset('tmp.pmat') - - print "PMat",pmat - print "PMat",pmat[:] - print "array",ar - print "ArrayDataSet",ds - for i in ds: - print i - save_array_dataset_as_pmat("tmp2.pmat",ds) - ds2=load_pmat_as_array_dataset('tmp2.pmat') - for i in ds2: - print i - # print "+++ tmp.pmat contains: " - # os.system( 'plearn vmat cat tmp.pmat' ) - import shutil - for fname in ["tmp.pmat", "tmp2.pmat"]: - os.remove( fname ) - if os.path.exists( fname+'.metadata' ): - shutil.rmtree( fname+'.metadata' ) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/__init__.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,6 @@ +#import exceptions + +def __src_version__(): + #todo - this is vulnerable to the bug in theano ticket #160 + return version.src_version(__name__) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/aa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/aa.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,108 @@ + +import theano +from theano import tensor as T +from theano.tensor import nnet as NN +import numpy as N + +class AutoEncoder(theano.Module): + + def __init__(self, input = None, regularize = True, tie_weights = True): + super(AutoEncoder, self).__init__() + + # MODEL CONFIGURATION + self.regularize = regularize + self.tie_weights = tie_weights + + # ACQUIRE/MAKE INPUT + if not input: + input = T.matrix('input') + self.input = theano.External(input) + + # HYPER-PARAMETERS + self.lr = theano.Member(T.scalar()) + + # PARAMETERS + self.w1 = theano.Member(T.matrix()) + if not tie_weights: + self.w2 = theano.Member(T.matrix()) + else: + self.w2 = self.w1.T + self.b1 = theano.Member(T.vector()) + self.b2 = theano.Member(T.vector()) + + # HIDDEN LAYER + self.hidden_activation = T.dot(input, self.w1) + self.b1 + self.hidden = self.build_hidden() + + # RECONSTRUCTION LAYER + self.output_activation = T.dot(self.hidden, self.w2) + self.b2 + self.output = self.build_output() + + # RECONSTRUCTION COST + self.reconstruction_cost = self.build_reconstruction_cost() + + # REGULARIZATION COST + self.regularization = self.build_regularization() + + # TOTAL COST + self.cost = self.reconstruction_cost + if self.regularize: + self.cost = self.cost + self.regularization + + # GRADIENTS AND UPDATES + if self.tie_weights: + self.params = self.w1, self.b1, self.b2 + else: + self.params = self.w1, self.w2, self.b1, self.b2 + gradients = T.grad(self.cost, self.params) + updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) + + # INTERFACE METHODS + self.update = theano.Method(input, self.cost, updates) + self.reconstruction = theano.Method(input, self.output) + self.representation = theano.Method(input, self.hidden) + + def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): + if (input_size is None) ^ (hidden_size is None): + raise ValueError("Must specify hidden_size and input_size or neither.") + super(AutoEncoder, self)._instance_initialize(obj, **init) + if seed is not None: + R = N.random.RandomState(seed) + else: + R 
= N.random + if input_size is not None: + sz = (input_size, hidden_size) + range = 1/N.sqrt(input_size) + obj.w1 = R.uniform(size = sz, low = -range, high = range) + if not self.tie_weights: + obj.w2 = R.uniform(size = list(reversed(sz)), low = -range, high = range) + obj.b1 = N.zeros(hidden_size) + obj.b2 = N.zeros(input_size) + + def build_regularization(self): + return T.zero() # no regularization! + + +class SigmoidXEAutoEncoder(AutoEncoder): + + def build_hidden(self): + return NN.sigmoid(self.hidden_activation) + + def build_output(self): + return NN.sigmoid(self.output_activation) + + def build_reconstruction_cost(self): + self.reconstruction_cost_matrix = self.input * T.log(self.output) + (1.0 - self.input) * T.log(1.0 - self.output) + self.reconstruction_costs = -T.sum(self.reconstruction_cost_matrix, axis=1) + return T.sum(self.reconstruction_costs) + + def build_regularization(self): + self.l2_coef = theano.Member(T.scalar()) + if self.tie_weights: + return self.l2_coef * T.sum(self.w1 * self.w1) + else: + return self.l2_coef * T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2) + + def _instance_initialize(self, obj, input_size = None, hidden_size = None, **init): + init.setdefault('l2_coef', 0) + super(SigmoidXEAutoEncoder, self)._instance_initialize(obj, input_size, hidden_size, **init) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/cost.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/cost.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,33 @@ +""" +Cost functions. + +@note: All of these functions return one cost per example. So it is your +job to perform a tensor.sum over the individual example losses. + +@todo: Make a Cost class, with a particular contract. + +@todo: It would be nice to implement a hinge loss, with a particular margin. +""" + +import theano.tensor as T +from theano.tensor.xlogx import xlogx + +def quadratic(target, output, axis=1): + return T.mean(T.sqr(target - output), axis=axis) + +def cross_entropy(target, output, mean_axis=0, sum_axis=1): + """ + @todo: This is essentially duplicated as nnet_ops.binary_crossentropy + @warning: OUTPUT and TARGET are reversed in nnet_ops.binary_crossentropy + """ + XE = target * T.log(output) + (1 - target) * T.log(1 - output) + return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis) + +def KL_divergence(target, output): + """ + @note: We do not compute the mean, because if target and output have + different shapes then the result will be garbled. + """ + return -(target * T.log(output) + (1 - target) * T.log(1 - output)) \ + + (xlogx(target) + xlogx(1 - target)) +# return cross_entropy(target, output, axis) - cross_entropy(target, target, axis) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/daa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/daa.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,188 @@ + +import theano +from theano import tensor as T +from theano.tensor import nnet as NN +from theano.tensor.deprecated import rmodule + +import numpy as N + +from pylearn.algorithms import cost + +class DenoisingAA(rmodule.RModule): + """De-noising Auto-encoder + + WRITEME + + Abstract base class. Requires subclass with functions: + + - build_corrupted_input() + + Introductory article about this model WRITEME. 
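(Illustrative sketch, not part of the original docstring: the kind of corruption build_corrupted_input() is expected to implement, here the binomial masking used by SigmoidXEDenoisingAA below, which zeroes each input entry with probability noise_level.)

    import numpy
    rng = numpy.random.RandomState(0)
    noise_level = 0.25
    x = rng.rand(4, 6)                                           # a small minibatch
    mask = rng.binomial(n=1, p=1.0 - noise_level, size=x.shape)  # 1 keeps, 0 drops
    corrupted = mask * x                                         # about 25% of entries forced to 0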
+ + + """ + + def __init__(self, input = None, regularize = True, tie_weights = True, + activation_function=NN.sigmoid, reconstruction_cost_function=cost.cross_entropy): + """ + :param input: WRITEME + + :param regularize: WRITEME + + :param tie_weights: WRITEME + + :param activation_function: WRITEME + + :param reconstruction_cost: Should return one cost per example (row) + + :todo: Default noise level for all daa levels + + """ + super(DenoisingAA, self).__init__() + + # MODEL CONFIGURATION + self.regularize = regularize + self.tie_weights = tie_weights + self.activation_function = activation_function + self.reconstruction_cost_function = reconstruction_cost_function + + # ACQUIRE/MAKE INPUT + if not input: + input = T.matrix('input') + self.input = theano.External(input) + + # HYPER-PARAMETERS + self.lr = theano.Member(T.scalar()) + + # PARAMETERS + self.w1 = theano.Member(T.matrix()) + if not tie_weights: + self.w2 = theano.Member(T.matrix()) + else: + self.w2 = self.w1.T + self.b1 = theano.Member(T.vector()) + self.b2 = theano.Member(T.vector()) + + + # REGULARIZATION COST + self.regularization = self.build_regularization() + + + ### NOISELESS ### + + # HIDDEN LAYER + self.hidden_activation = T.dot(self.input, self.w1) + self.b1 + self.hidden = self.hid_activation_function(self.hidden_activation) + + # RECONSTRUCTION LAYER + self.output_activation = T.dot(self.hidden, self.w2) + self.b2 + self.output = self.out_activation_function(self.output_activation) + + # RECONSTRUCTION COST + self.reconstruction_costs = self.build_reconstruction_costs(self.output) + self.reconstruction_cost = T.mean(self.reconstruction_costs) + + # TOTAL COST + self.cost = self.reconstruction_cost + if self.regularize: + self.cost = self.cost + self.regularization + + + ### WITH NOISE ### + self.corrupted_input = self.build_corrupted_input() + + # HIDDEN LAYER + self.nhidden_activation = T.dot(self.corrupted_input, self.w1) + self.b1 + self.nhidden = self.hid_activation_function(self.nhidden_activation) + + # RECONSTRUCTION LAYER + self.noutput_activation = T.dot(self.nhidden, self.w2) + self.b2 + self.noutput = self.out_activation_function(self.noutput_activation) + + # RECONSTRUCTION COST + self.nreconstruction_costs = self.build_reconstruction_costs(self.noutput) + self.nreconstruction_cost = T.mean(self.nreconstruction_costs) + + # TOTAL COST + self.ncost = self.nreconstruction_cost + if self.regularize: + self.ncost = self.ncost + self.regularization + + + # GRADIENTS AND UPDATES + if self.tie_weights: + self.params = self.w1, self.b1, self.b2 + else: + self.params = self.w1, self.w2, self.b1, self.b2 + gradients = T.grad(self.ncost, self.params) + updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) + + # INTERFACE METHODS + self.update = theano.Method(self.input, self.ncost, updates) + self.compute_cost = theano.Method(self.input, self.cost) + self.noisify = theano.Method(self.input, self.corrupted_input) + self.reconstruction = theano.Method(self.input, self.output) + self.representation = theano.Method(self.input, self.hidden) + self.reconstruction_through_noise = theano.Method(self.input, [self.corrupted_input, self.noutput]) + + self.validate = theano.Method(self.input, [self.cost, self.output]) + + def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): + if (input_size is None) ^ (hidden_size is None): + raise ValueError("Must specify input_size and hidden_size or neither.") + super(DenoisingAA, self)._instance_initialize(obj, **init) 
+ if seed is not None: + R = N.random.RandomState(seed) + else: + R = N.random + if input_size is not None: + sz = (input_size, hidden_size) + inf = 1/N.sqrt(input_size) + hif = 1/N.sqrt(hidden_size) + obj.w1 = R.uniform(size = sz, low = -inf, high = inf) + if not self.tie_weights: + obj.w2 = R.uniform(size = list(reversed(sz)), low = -hif, high = hif) + obj.b1 = N.zeros(hidden_size) + obj.b2 = N.zeros(input_size) + if seed is not None: + obj.seed(seed) + obj.__hide__ = ['params'] + + def build_regularization(self): + """ + @todo: Why do we need this function? + """ + return T.zero() # no regularization! + + +class SigmoidXEDenoisingAA(DenoisingAA): + """ + @todo: Merge this into the above. + @todo: Default noise level for all daa levels + """ + + def build_corrupted_input(self): + self.noise_level = theano.Member(T.scalar()) + return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input + + def hid_activation_function(self, activation): + return self.activation_function(activation) + + def out_activation_function(self, activation): + return self.activation_function(activation) + + def build_reconstruction_costs(self, output): + return self.reconstruction_cost_function(self.input, output) + + def build_regularization(self): + self.l2_coef = theano.Member(T.scalar()) + if self.tie_weights: + return self.l2_coef * T.sum(self.w1 * self.w1) + else: + return self.l2_coef * (T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2)) + + def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init): + init.setdefault('noise_level', 0) + init.setdefault('l2_coef', 0) + super(SigmoidXEDenoisingAA, self)._instance_initialize(obj, input_size, hidden_size, seed, **init) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/kernel_regression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/kernel_regression.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,231 @@ +""" +Implementation of kernel regression: +""" + +from pylearn.learner import OfflineLearningAlgorithm +from theano import tensor as T +from theano.tensor.nnet import prepend_1_to_each_row +from theano.scalar import as_scalar +from common.autoname import AutoName +import theano +import numpy + +# map a N-vector to a 1xN matrix +row_vector = theano.tensor.DimShuffle((False,),['x',0]) +# map a N-vector to a Nx1 matrix +col_vector = theano.tensor.DimShuffle((False,),[0,'x']) + +class KernelRegression(OfflineLearningAlgorithm): + """ +Implementation of kernel regression: +* the data are n (x_t,y_t) pairs and we want to estimate E[y|x] +* the predictor computes + f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t) + with free parameters b and alpha, training inputs x_t, + and kernel function K (gaussian by default). + Clearly, each prediction involves O(n) computations. +* the learner chooses b and alpha to minimize + lambda alpha' G' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2 + where G is the matrix with entries G_ij = K(x_i,x_j). + The first (L2 regularization) term is the squared L2 + norm of the primal weights w = \sum_t \alpha_t phi(x_t) + where phi is the function s.t. K(u,v)=phi(u).phi(v). +* this involves solving a linear system with (n+1,n+1) + matrix, which is an O(n^3) computation. In addition, + that linear system matrix requires O(n^2) memory. + So this learning algorithm should be used only for + small datasets. 
+* the linear system is + (M + lambda I_n) theta = (1, y)' + where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity + except with a 0 at (0,0), M is the matrix with G in the sub-matrix starting + at (1,1), 1's in column 0, except for a value of n at (0,0), and sum_i G_{i,j} + in the rest of row 0. + +Note that this is gives an estimate of E[y|x,training_set] that is the +same as obtained with a Gaussian process regression. The GP +regression would also provide a Bayesian Var[y|x,training_set]. +It corresponds to an assumption that f is a random variable +with Gaussian (process) prior distribution with covariance +function K. Because we assume Gaussian noise we obtain a Gaussian +posterior for f (whose mean is computed here). + + + Usage: + + kernel_regressor=KernelRegression(L2_regularizer=0.1,gamma=0.5) (kernel=GaussianKernel(gamma=0.5)) + kernel_predictor=kernel_regressor(training_set) + all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" field + outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays + outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets) + errors = kernel_predictor.compute_errors(inputs,targets) + mse = kernel_predictor.compute_mse(inputs,targets) + + + + The training_set must have fields "input" and "target". + The test_set must have field "input", and needs "target" if + we want to compute the squared errors. + + The predictor parameters are obtained analytically from the training set. + Training is only done on a whole training set rather than on minibatches + (no online implementation). + + The dataset fields expected and produced by the learning algorithm and the trained model + are the following: + + - Input and output dataset fields (example-wise quantities): + + - 'input' (always expected as an input_dataset field) + - 'target' (always expected by the learning algorithm, optional for learned model) + - 'output' (always produced by learned model) + - 'squared_error' (optionally produced by learned model if 'target' is provided) + = example-wise squared error + """ + def __init__(self, kernel=None, L2_regularizer=0, gamma=1, use_bias=False): + # THE VERSION WITH BIAS DOES NOT SEEM RIGHT + self.kernel = kernel + self.L2_regularizer=L2_regularizer + self.use_bias=use_bias + self.gamma = gamma # until we fix things, the kernel type is fixed, Gaussian + self.equations = KernelRegressionEquations() + + def __call__(self,trainset): + n_examples = len(trainset) + first_example = trainset[0] + n_inputs = first_example['input'].size + n_outputs = first_example['target'].size + b1=1 if self.use_bias else 0 + M = numpy.zeros((n_examples+b1,n_examples+b1)) + Y = numpy.zeros((n_examples+b1,n_outputs)) + for i in xrange(n_examples): + M[i+b1,i+b1]=self.L2_regularizer + data = trainset.fields() + train_inputs = numpy.array(data['input']) + if self.use_bias: + Y[0]=1 + Y[b1:,:] = numpy.array(data['target']) + train_inputs_square,sumG,G=self.equations.compute_system_matrix(train_inputs,self.gamma) + M[b1:,b1:] += G + if self.use_bias: + M[0,1:] = sumG + M[1:,0] = 1 + M[0,0] = M.shape[0] + self.M=M + self.Y=Y + theta=numpy.linalg.solve(M,Y) + return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square) + +class KernelPredictorEquations(AutoName): + train_inputs = T.matrix() # n_examples x n_inputs + train_inputs_square = T.vector() # n_examples + inputs = T.matrix() # minibatchsize x n_inputs + targets = T.matrix() # minibatchsize x 
n_outputs + theta = T.matrix() # (n_examples+1) x n_outputs + b1 = T.shape(train_inputs_square)[0] 1: + print 'estimated train cost', cost_j + #TODO: consult iter[0] for periodic saving to cwd (model, minimizer, and stopper) + + def check(): + validate = logreg.validate(valid.x, valid.y) + if verbose > 0: + print 'iter', iter[0], 'validate', validate + sys.stdout.flush() + iter[0] += 1 + return validate[0] + + def save(): + return copy.deepcopy(logreg) + + stopper = make_stopper(**state.subdict(prefix='stopper_')) + stopper.find_min(step, check, save) + + state.train_01, state.train_rcost, state.train_cost = logreg.validate(train.x, train.y) + state.valid_01, state.valid_rcost, state.valid_cost = logreg.validate(valid.x, valid.y) + state.test_01, state.test_rcost, state.test_cost = logreg.validate(test.x, test.y) + + state.n_train = len(train.y) + state.n_valid = len(valid.y) + state.n_test = len(test.y) + +class LogReg2(module.FancyModule): + def __init__(self, input=None, targ=None, w=None, b=None, lr=None, regularize=False): + super(LogReg2, self).__init__() #boilerplate + + self.input = (input) if input is not None else T.matrix('input') + self.targ = (targ) if targ is not None else T.lcol() + + self.w = (w) if w is not None else (T.dmatrix()) + self.b = (b) if b is not None else (T.dvector()) + self.lr = (lr) if lr is not None else (T.dscalar()) + + self.params = [p for p in [self.w, self.b] if p.owner is None] + + output = nnet.sigmoid(T.dot(self.x, self.w) + self.b) + xent = -self.targ * T.log(output) - (1.0 - self.targ) * T.log(1.0 - output) + sum_xent = T.sum(xent) + + self.output = output + self.xent = xent + self.sum_xent = sum_xent + self.cost = sum_xent + + #define the apply method + self.pred = (T.dot(self.input, self.w) + self.b) > 0.0 + self.apply = module.Method([self.input], self.pred) + + #if this module has any internal parameters, define an update function for them + if self.params: + gparams = T.grad(sum_xent, self.params) + self.update = module.Method([self.input, self.targ], sum_xent, + updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gparams))) + + +class classification: #this would go to a file called pylearn/algorithms/classification.py + + @staticmethod + def xent(p, q): + """cross-entropy (row-wise) + + :type p: M x N symbolic matrix (sparse or dense) + + :param p: each row is a true distribution over N things + + :type q: M x N symbolic matrix (sparse or dense) + + :param q: each row is an approximating distribution over N things + + :rtype: symbolic vector of length M + + :returns: the cross entropy between each row of p and the corresponding row of q. + + + Hint: To sum row-wise costs into a scalar value, use "xent(p, q).sum()" + """ + return (p * tensor.log(q)).sum(axis=1) + + @staticmethod + def errors(target, prediction): + """classification error (row-wise) + + :type p: M x N symbolic matrix (sparse or dense) + + :param p: each row is a true distribution over N things + + :type q: M x N symbolic matrix (sparse or dense) + + :param q: each row is an approximating distribution over N things + + :rtype: symbolic vector of length M + + :returns: a vector with 0 for every row pair that has a maximum in the same position, + and 1 for every other row pair. 
+ + + Hint: Count errors with "errors(prediction, target).sum()", and get the error-rate with + "errors(prediction, target).mean()" + """ + return tensor.neq( + tensor.argmax(prediction, axis=1), + tensor.argmax(target, axis=1)) + +class LogReg_New(module.FancyModule): + """A symbolic module for performing multi-class logistic regression.""" + + params = property( + lambda self: [p for p in [self.w, self.b] if p.owner is None], + doc="WRITEME" + ) + + def __init__(self, n_in=None, n_out=None, w=None, b=None): + super(LogRegNew, self).__init__() #boilerplate + + self.n_in = n_in + self.n_out = n_out + + self.w = w if w is not None else (T.dmatrix()) + self.b = b if b is not None else (T.dvector()) + + def _instance_initialize(self, obj): + obj.w = N.zeros((self.n_in, self.n_out)) + obj.b = N.zeros(self.n_out) + obj.__pp_hide__ = ['params'] + + + def l1(self): + return abs(self.w).sum() + + def l2(self): + return (self.w**2).sum() + + def activation(self, input): + return theano.dot(input, self.w) + self.b + + def softmax(self, input): + return nnet.softmax(self.activation(input)) + + def argmax(self, input): + return tensor.argmax(self.activation(input)) + + def xent(self, input, target): + return classification.xent(target, self.softmax(input)) + + def errors(self, input, target): + return classification.errors(target, self.softmax(input)) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/minimizer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/minimizer.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,36 @@ +"""Define the interface and factory for gradient-based minimizers. +""" +import theano + +class DummyMinimizer(theano.Module): + """ The idea of a minimizer is that it provides an `step` function that will + eventually converge toward (maybe realize?) the minimum of a cost function. + + The step_cost function takes a step and returns the cost associated with either + the current or previous parameter values (return whichever is easiest to compute, it's + meant for user feedback.) + + """ + def __init__(self, args, cost, parameters, gradients=None): + super(DummyMinimizer, self).__init__() + + def _instance_step(self, obj, *args): + """Move the parameters toward the minimum of a cost + + :param args: The arguments here should be values for the Variables that were in the + `args` argument to the constructor. + + :Return: None + """ + pass + + def _instance_step_cost(self, obj, *args): + """Move the parameters toward the minimum of a cost, and compute the cost + + :param args: The arguments here should be values for the Variables that were in the + `args` argument to the constructor. + + :Return: The current cost value. + """ + pass + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/rbm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/rbm.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,100 @@ +import sys, copy +import theano +from theano import tensor as T +from theano.tensor.deprecated import rmodule +from theano.tensor.nnet import sigmoid +from theano.compile import module +from theano import printing, pprint +from theano import compile + +import numpy as N + +from ..datasets import make_dataset +from .minimizer import make_minimizer +from .stopper import make_stopper + +class RBM(rmodule.RModule): + + # is it really necessary to pass ALL of these ? 
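In plain NumPy, the quantities that classification.xent, classification.errors and the LogReg_New methods build symbolically look as follows. These are hypothetical helpers for illustration; note that the symbolic xent above returns (p * log q).sum(axis=1) without a leading minus sign, while the conventional cross-entropy shown here is its negation.

import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=1, keepdims=True))    # row-wise, numerically stabilized
    return e / e.sum(axis=1, keepdims=True)

def xent(p, q):
    # cross-entropy between each true distribution p[i] and prediction q[i]
    return -(p * np.log(q)).sum(axis=1)

def errors(target, prediction):
    # 0/1 loss per row: do the argmaxes disagree?
    return (prediction.argmax(axis=1) != target.argmax(axis=1)).astype(float)

# error_rate = errors(targets_onehot, softmax(np.dot(x, w) + b)).mean()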
- GD + def __init__(self, + nvis=None, nhid=None, + input=None, + w=None, hidb=None, visb=None, + seed=0, lr=0.1): + + super(RBM, self).__init__() + self.nhid, self.nvis = nhid, nvis + self.lr = lr + + # symbolic theano stuff + # what about multidimensional inputs/outputs ? do they have to be + # flattened or should we used tensors instead ? + self.w = w if w is not None else module.Member(T.dmatrix()) + self.visb = visb if visb is not None else module.Member(T.dvector()) + self.hidb = hidb if hidb is not None else module.Member(T.dvector()) + self.seed = seed; + + # 1-step Markov chain + vis = T.dmatrix() + hid = sigmoid(T.dot(vis, self.w) + self.hidb) + hid_sample = self.random.binomial(T.shape(hid), 1, hid) + neg_vis = sigmoid(T.dot(hid_sample, self.w.T) + self.visb) + neg_vis_sample = self.random.binomial(T.shape(neg_vis), 1, neg_vis) + neg_hid = sigmoid(T.dot(neg_vis_sample, self.w) + self.hidb) + + # function which execute 1-step Markov chain (with and without cd updates) + self.updownup = module.Method([vis], [hid, neg_vis_sample, neg_hid]) + + # function to perform manual cd update given 2 visible and 2 hidden values + vistemp = T.dmatrix() + hidtemp = T.dmatrix() + nvistemp = T.dmatrix() + nhidtemp = T.dmatrix() + self.cd_update = module.Method([vistemp, hidtemp, nvistemp, nhidtemp], + [], + updates = {self.w: self.w + self.lr * + (T.dot(vistemp.T, hidtemp) - + T.dot(nvistemp.T, nhidtemp)), + self.visb: self.visb + self.lr * + (T.sum(vistemp - nvistemp,axis=0)), + self.hidb: self.hidb + self.lr * + (T.sum(hidtemp - nhidtemp,axis=0))}); + + # TODO: add parameter for weigth initialization + def _instance_initialize(self, obj): + obj.w = N.random.standard_normal((self.nvis,self.nhid)) + obj.visb = N.zeros(self.nvis) + obj.hidb = N.zeros(self.nhid) + obj.seed(self.seed); + + def _instance_cd1(self, obj, input, k=1): + poshid, negvissample, neghid = obj.updownup(input) + for i in xrange(k-1): + ahid, negvissample, neghid = obj.updownup(negvissample) + # CD-k update + obj.cd_update(input, poshid, negvissample, neghid) + + +def train_rbm(state, channel=lambda *args, **kwargs:None): + dataset = make_dataset(**state.dataset) + train = dataset.train + + rbm_module = RBM( + nvis=train.x.shape[1], + nhid=state['nhid']) + rbm = rbm_module.make() + + batchsize = state.get('batchsize', 1) + verbose = state.get('verbose', 1) + iter = [0] + + while iter[0] != state['max_iters']: + for j in xrange(0,len(train.x)-batchsize+1,batchsize): + rbm.cd1(train.x[j:j+batchsize]) + if verbose > 1: + print 'estimated train cost...' 
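The three half-steps computed by updownup and the parameter changes applied by cd_update correspond to one step of contrastive divergence (CD-1); a NumPy sketch (cd1_update is a hypothetical name, not part of the module) is:

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def cd1_update(v0, W, visb, hidb, lr, rng):
    h0 = sigmoid(np.dot(v0, W) + hidb)             # up
    h0_sample = rng.binomial(1, h0)
    v1 = sigmoid(np.dot(h0_sample, W.T) + visb)    # down
    v1_sample = rng.binomial(1, v1)
    h1 = sigmoid(np.dot(v1_sample, W) + hidb)      # up again
    # positive phase uses the data, negative phase uses the reconstruction sample
    W = W + lr * (np.dot(v0.T, h0) - np.dot(v1_sample.T, h1))
    visb = visb + lr * (v0 - v1_sample).sum(axis=0)
    hidb = hidb + lr * (h0 - h1).sum(axis=0)
    return W, visb, hidb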
+ if iter[0] == state['max_iters']: + break + else: + iter[0] += 1 + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/regressor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/regressor.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,104 @@ + +import theano +from theano import tensor as T +from theano.tensor import nnet as NN +import numpy as N + +class Regressor(theano.FancyModule): + + def __init__(self, input = None, target = None, regularize = True): + super(Regressor, self).__init__() + + # MODEL CONFIGURATION + self.regularize = regularize + + # ACQUIRE/MAKE INPUT AND TARGET + self.input = theano.External(input) if input else T.matrix('input') + self.target = theano.External(target) if target else T.matrix('target') + + # HYPER-PARAMETERS + self.lr = theano.Member(T.scalar()) + + # PARAMETERS + self.w = theano.Member(T.matrix()) + self.b = theano.Member(T.vector()) + + # OUTPUT + self.output_activation = T.dot(self.input, self.w) + self.b + self.output = self.build_output() + + # REGRESSION COST + self.regression_cost = self.build_regression_cost() + + # REGULARIZATION COST + self.regularization = self.build_regularization() + + # TOTAL COST + self.cost = self.regression_cost + if self.regularize: + self.cost = self.cost + self.regularization + + # GRADIENTS AND UPDATES + self.params = self.w, self.b + gradients = T.grad(self.cost, self.params) + updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gradients)) + + # INTERFACE METHODS + self.update = theano.Method([self.input, self.target], self.cost, updates) + self.get_cost = theano.Method([self.input, self.target], self.cost) + self.predict = theano.Method(self.input, self.output) + + self.build_extensions() + + def _instance_initialize(self, obj, input_size = None, output_size = None, seed = None, **init): + if seed is not None: + R = N.random.RandomState(seed) + else: + R = N.random + if (input_size is None) ^ (output_size is None): + raise ValueError("Must specify input_size and output_size or neither.") + super(Regressor, self)._instance_initialize(obj, **init) + if input_size is not None: + sz = (input_size, output_size) + range = 1/N.sqrt(input_size) + obj.w = R.uniform(size = sz, low = -range, high = range) + obj.b = N.zeros(output_size) + obj.__hide__ = ['params'] + + def _instance_flops_approx(self, obj): + return obj.w.size + + def build_extensions(self): + pass + + def build_output(self): + raise NotImplementedError('override in subclass') + + def build_regression_cost(self): + raise NotImplementedError('override in subclass') + + def build_regularization(self): + return T.zero() # no regularization! 
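Typical use of the Regressor family, mirroring pylearn/algorithms/tests/test_regressor.py further down in this changeset (BinRegressor is the concrete subclass defined next):

import numpy
import pylearn.algorithms as models

reg = models.BinRegressor(regularize=False)
model = reg.make(lr=0.01, input_size=100, seed=10)

R = numpy.random.RandomState(100)
data = R.random_integers(0, 1, size=(10, 100))
targets = data[:, 6].reshape((10, 1))   # learn to copy input feature 6
cost = model.update(data, targets)      # one SGD step, returns the cost
preds = model.classify(data)            # iround(sigmoid(x W + b))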
+ + +class BinRegressor(Regressor): + + def build_extensions(self): + self.classes = T.iround(self.output) + self.classify = theano.Method(self.input, self.classes) + + def build_output(self): + return NN.sigmoid(self.output_activation) + + def build_regression_cost(self): + self.regression_cost_matrix = self.target * T.log(self.output) + (1.0 - self.target) * T.log(1.0 - self.output) + self.regression_costs = -T.sum(self.regression_cost_matrix, axis=1) + return T.mean(self.regression_costs) + + def build_regularization(self): + self.l2_coef = theano.Member(T.scalar()) + return self.l2_coef * T.sum(self.w * self.w) + + def _instance_initialize(self, obj, input_size = None, output_size = 1, seed = None, **init): + init.setdefault('l2_coef', 0) + super(BinRegressor, self)._instance_initialize(obj, input_size, output_size, seed, **init) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/rnn.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/rnn.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,247 @@ +#!/usr/bin/env python +import numpy as N +from theano import Op, Apply, tensor as T, Module, Member, Method, Mode, compile +from theano.gof import OpSub, TopoOptimizer + +from minimizer import make_minimizer # minimizer +from theano.printing import Print +import sgd #until Olivier's module-import thing works better + +#################### +# Library-type stuff +#################### + +class TanhRnn(Op): + """ + This class implements the recurrent part of a recurrent neural network. + + There is not a neat way to include this in a more fine-grained way in Theano at the moment, + so to get something working, I'm implementing a relatively complicated Op that could be + broken down later into constituents. + + Anyway, this Op implements recursive computation of the form: + + .. latex-eqn: + z_t &= \tanh( z_{t-1} A + x_{t-1}) + + For z0 a vector, and x a TxM matrix, it returns a matrix z of shape (T+1, M), + in which z[0] = z0. 
+ + """ + + def make_node(self, x, z0, A): + """ + :type x: matrix (each row is an x_t) (shape: (T, M)) + :type z0: vector (the first row of output) (shape: M) + :type A: matrix (M by M) + + """ + x = T.as_tensor(x) + z0 = T.as_tensor(z0) + A = T.as_tensor(A) + z = x.type() #make a new symbolic result with the same type as x + return Apply(self, [x, z0, A], [z]) + + def perform(self, node, (x,z0,A), out): + T,M = x.shape + z = N.zeros((T+1, M)) + z[0] = z0 + for i in xrange(T): + z[i+1] = N.tanh(N.dot(z[i], A) + x[i]) + out[0][0] = z + + def grad(self, (x, z0, A), (gz,)): + z = tanh_rnn(x, z0, A) + gz_incl_rnn, gx = tanh_rnn_grad(A, z, gz) + return [gx, gz_incl_rnn[0], (T.dot(z[:-1].T, gx))] +tanh_rnn = TanhRnn() + +class TanhRnnGrad(Op): + """Gradient calculation for TanhRnn""" + + def __init__(self, inplace): + self.inplace = inplace + + if self.inplace: + self.destroy_map = {0: [2]} + + def __eq__(self, other): + return (type(self) == type(other)) and (self.inplace == other.inplace) + + def __hash__(self, other): + return hash(type(self)) ^ hash(self.inplace) + + def make_node(self, A, z, gz): + return Apply(self, [A,z,gz], (z.type(), gz.type())) + + def perform(self, node, (A, z, gz), out): + Tp1,M = z.shape + T = Tp1 - 1 + gx = N.zeros((T, M)) + + if not self.inplace: + gz = gz.copy() + + for i in xrange(T-1, -1, -1): + #back through the tanh + gx[i] = gz[i+1] * (1.0 - z[i+1] * z[i+1]) + gz[i] += N.dot(A, gx[i]) + + out[0][0] = gz + out[1][0] = gx + + def __str__(self): + if self.inplace: + return 'Inplace' + super(TanhRnnGrad, self).__str__() + else: + return super(TanhRnnGrad, self).__str__() + +tanh_rnn_grad = TanhRnnGrad(inplace=False) +tanh_rnn_grad_inplace = TanhRnnGrad(inplace=True) + +compile.optdb.register('inplace_rnngrad', TopoOptimizer(OpSub(tanh_rnn_grad, tanh_rnn_grad_inplace)), 60, 'fast_run', 'inplace') + + +####################### +# Experiment-type stuff +####################### + + + +class ExampleRNN(Module): + + def __init__(self, n_vis, n_hid, n_out, minimizer): + super(ExampleRNN, self).__init__() + + def affine(weight, bias): + return (lambda a : T.dot(a, weight) + bias) + + self.n_vis = n_vis + self.n_hid = n_hid + self.n_out = n_out + + #affine transformatoin x -> latent space + self.v, self.b = Member(T.dmatrix()), Member(T.dvector()) + input_transform = affine(self.v, self.b) + + #recurrent weight matrix in latent space + self.z0 = Member(T.dvector()) + self.w = Member(T.dmatrix()) + + #affine transformation latent -> output space + self.u, self.c = Member(T.dmatrix()), Member(T.dvector()) + output_transform = affine(self.u, self.c) + + self.params = [self.v, self.b, self.w, self.u, self.c] + + #input and target + x, y = T.dmatrix(), T.dmatrix() + + z = tanh_rnn(input_transform(x), self.z0, self.w) + yhat = output_transform(z[1:]) + self.cost = T.sum((y - yhat)**2) + + self.blah = Method([x,y], self.cost) + + # using the make_minimizer protocol + self.minimizer = minimizer([x, y], self.cost, self.params) + + def _instance_initialize(self, obj): + n_vis = self.n_vis + n_hid = self.n_hid + n_out = self.n_out + + rng = N.random.RandomState(2342) + + obj.z0 = N.zeros(n_hid) + obj.v = rng.randn(n_vis, n_hid) * 0.01 + obj.b = N.zeros(n_hid) + obj.w = rng.randn(n_hid, n_hid) * 0.01 + obj.u = rng.randn(n_hid, n_out) * 0.01 + obj.c = N.zeros(n_out) + obj.minimizer.initialize() + def _instance__eq__(self, other): + if not isinstance(other.component, ExampleRNN): + raise NotImplemented + #we compare the member. 
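For readers who prefer to see the recursion outside the Op machinery, TanhRnn.perform and TanhRnnGrad.perform translate to the following NumPy (function names hypothetical):

import numpy as np

def tanh_rnn_forward(x, z0, A):
    # z[0] = z0, z[t+1] = tanh(z[t] A + x[t])
    T, M = x.shape
    z = np.zeros((T + 1, M))
    z[0] = z0
    for t in range(T):
        z[t + 1] = np.tanh(np.dot(z[t], A) + x[t])
    return z

def tanh_rnn_backward(A, z, gz):
    # backprop through the tanh and the recurrence, newest time step first
    T = z.shape[0] - 1
    gx = np.zeros((T, z.shape[1]))
    gz = gz.copy()
    for t in range(T - 1, -1, -1):
        gx[t] = gz[t + 1] * (1.0 - z[t + 1] ** 2)
        gz[t] += np.dot(A, gx[t])
    return gz, gx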
+# if self.n_vis != other.n_vis or slef.n_hid != other.n_hid or self.n_out != other.n_out: +# return False + if (N.abs(self.z0-other.z0)<1e-8).all() and (N.abs(self.v-other.v)<1e-8).all() and (N.abs(self.b-other.b)<1e-8).all() and (N.abs(self.w-other.w)<1e-8).all() and (N.abs(self.u-other.u)<1e-8).all() and (N.abs(self.c-other.c)<1e-8).all() and (N.abs(self.z0-other.z0)<1e-8).all(): + return True + return False + + def _instance__hash__(self): + raise NotImplemented + +def test_example_rnn(): + minimizer_fn = make_minimizer('sgd', stepsize = 0.001) + + n_vis = 5 + n_out = 3 + n_hid = 4 + rnn_module = ExampleRNN(n_vis, n_hid, n_out, minimizer_fn) + + rnn = rnn_module.make(mode='FAST_RUN') + + rng = N.random.RandomState(7722342) + x = rng.randn(10,n_vis) + y = rng.randn(10,n_out) + + #set y to be like x with a lag of LAG + LAG = 4 + y[LAG:] = x[:-LAG, 0:n_out] + + if 1: + for i, node in enumerate(rnn.minimizer.step_cost.maker.env.toposort()): + print i, node + + niter=1500 + for i in xrange(niter): + if i % 100 == 0: + print i, rnn.minimizer.step_cost(x, y), rnn.minimizer.stepsize + else: + rnn.minimizer.step_cost(x, y) + +def test_WEIRD_STUFF(): + n_vis = 5 + n_out = 3 + n_hid = 4 + rng = N.random.RandomState(7722342) + x = rng.randn(10,n_vis) + y = rng.randn(10,n_out) + + #set y to be like x with a lag of LAG + LAG = 4 + y[LAG:] = x[:-LAG, 0:n_out] + + minimizer_fn1 = make_minimizer('sgd', stepsize = 0.001) + minimizer_fn2 = make_minimizer('sgd', stepsize = 0.001) + rnn_module1 = ExampleRNN(n_vis, n_hid, n_out, minimizer_fn1) + rnn_module2 = ExampleRNN(n_vis, n_hid, n_out, minimizer_fn2) + rnn1 = rnn_module2.make(mode='FAST_RUN') + rnn2 = rnn_module1.make(mode='FAST_COMPILE') + if 0: + topo1=rnn1.minimizer.step_cost.maker.env.toposort() + topo2=rnn2.minimizer.step_cost.maker.env.toposort() + for i in range(len(topo1)): + print '1',i, topo1[i] + print '2',i, topo2[i] + + + + niter=50 + for i in xrange(niter): + rnn1.minimizer.step(x, y) + rnn2.minimizer.step(x, y) + + # assert rnn1.n_vis != rnn2.n_vis or slef.n_hid != rnn2.n_hid or rnn1.n_out != rnn2.n_out + assert (N.abs(rnn1.z0-rnn2.z0)<1e-8).all() + assert (N.abs(rnn1.v-rnn2.v)<1e-8).all() and (N.abs(rnn1.b-rnn2.b)<1e-8).all() and (N.abs(rnn1.w-rnn2.w)<1e-8).all() and (N.abs(rnn1.u-rnn2.u)<1e-8).all() and (N.abs(rnn1.c-rnn2.c)<1e-8).all() + + # assert b + +if __name__ == '__main__': +# from theano.tests import main +# main(__file__) + test_example_rnn() + test_WEIRD_STUFF() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/_test_onehotop.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/_test_onehotop.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,21 @@ +from onehotop import one_hot + +import unittest +from theano import compile +from theano import gradient +from theano import function +from theano.tensor import as_tensor + +import random +import numpy.random + +class T_OneHot(unittest.TestCase): + def test0(self): + x = as_tensor([3, 2, 1]) + y = as_tensor(5) + o = one_hot(x, y) + f = function([],o) + self.failUnless(numpy.all(f() == numpy.asarray([[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]]))) + +if __name__ == '__main__': + unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/cost.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/cost.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,154 @@ +""" +Cost functions. 
+ +@note: All of these functions return one cost per example. So it is your +job to perform a tensor.sum over the individual example losses. +""" + +import theano as T +from theano import tensor, scalar +import numpy + +class UndefinedGradient(Exception): + """ + Raised by UndefinedGradientOp to indicate that the gradient is undefined mathematically. + """ + pass +from theano import gof +class UndefinedGradientOp(gof.Op): + def perform(self, x=None): + if x is not None: raise UndefinedGradient(x) + else: raise UndefinedGradient(x) +undefined_gradient = UndefinedGradientOp() + +class LogFactorial(scalar.UnaryScalarOp): + """ + Compute log x!. + @todo: Rewrite so that it uses INTs not FLOATs. + @todo: Move this to Theano. + @todo: This function is slow, probably want to cache the values. + """ + @staticmethod + def st_impl(x): + if not isinstance(x, int) and not isinstance(x, long): + raise TypeError('type(x) = %s, must be int or long' % type(x)) + if x == 0.0: + return 0.0 + v = 0.0 + for i in range(x): + v += numpy.log(x) + return v + def impl(self, x): + return LogFactorial.st_impl(x) + def grad(self, (x,), (gz,)): + undefined_gradient(self) +# def grad(self, (x,), (gz,)): +# raise NotImplementedError('gradient not defined over discrete values') +# return None +# return [gz * (1 + scalar.log(x))] +# def c_code(self, node, name, (x,), (z,), sub): +# if node.inputs[0].type in [scalar.float32, scalar.float64]: +# return """%(z)s = +# %(x)s == 0.0 +# ? 0.0 +# : %(x)s * log(%(x)s);""" % locals() +# raise NotImplementedError('only floatingpoint is implemented') +scalar_logfactorial = LogFactorial(scalar.upgrade_to_float, name='scalar_logfactoral') +logfactorial = tensor.Elemwise(scalar_logfactorial, name='logfactorial') + + +def poissonlambda(unscaled_output, doclen, beta_scale): + """ + A continuous parameter lambda_i which is the expected number of + occurence of word i in the document. Note how this must be positive, + and that is why Ranzato and Szummer (2008) use an exponential. + + Yoshua: I don't like exponentials to guarantee positivity. softplus + is numerically much better behaved (but you might want to try both + to see if it makes a difference). + + @todo: Maybe there are more sensible ways to set the beta_scale. + """ + beta = beta_scale * doclen + return beta * tensor.exp(unscaled_output) + +def nlpoisson(target, output, beta_scale=1, axis=0, sumloss=True, zerothreshold=0): + """ + The negative log Poisson regression probability. + From Ranzato and Szummer (2008). + + Output should be of the form Weight*code+bias, i.e. unsquashed. + NB this is different than the formulation in Salakhutdinov and Hinton + (2007), in which the output is softmax'ed and multiplied by the input + document length. That is also what Welling et. al (2005) do. It would + be useful to try the softmax, because it is more well-behaved. + + There is a beta term that is proportional to document length. We + are not sure what beta scale is used by the authors. We use 1 as + the default, but this value might be inappropriate. + For numerical reasons, Yoshua recommends choosing beta such that + the lambda is expected to be around 1 for words that have a non-zero count. + So he would take: + + beta = document_size / unique_words_per_document + + I am not sure the above math is correct, I need to talk to him. + + Yoshua notes that ``there is a x_i log(beta) term missing, if you + compare with eqn 2 (i.e., take the log). 
They did not include in + 3 because it does not depend on the parameters, so the gradient + wrt it would be 0. But if you really want log-likelihood it should + be included.'' If you want a true log-likelihood, you probably should + actually compute the derivative of the entire eqn 2. + + Axis is the axis along which we sum the target values, to obtain + the document length. + + If sumloss, we sum the loss along axis. + + If zerothreshold is non-zero, we threshold the loss: + If this target dimension is zero and beta * tensor.exp(output) + < zerothreshold, let this loss be zero. + + @todo: Include logfactorial term + """ +# from theano.printing import Print +# print dtype(target) # make sure dtype is int32 or int64 +# print target.dtype + doclen = tensor.sum(target, axis=axis) + lambdav = poissonlambda(output, doclen, beta_scale) + lossterms = lambdav - target*output + if sumloss: + return tensor.sum(lossterms, axis=axis) + else: + return lossterms +# return tensor.sum(beta * tensor.exp(output) - target*output + logfactorial(target), axis=axis) + + +#import numpy +#def nlpoisson_nontheano(target, output, beta_scale=1, axis=0): +# doclen = numpy.sum(target, axis=axis) +# print "doclen", doclen +# beta = beta_scale * doclen +# print "beta", beta +# print "exp", numpy.exp(output) +# print "beta * exp", beta * numpy.exp(output) +# print "x * y", target * output +# +# import theano.tensor as TT +# x = TT.as_tensor(target) +# o = logfactorial(x) +# f = T.function([],o) +# logf = f() +# print "log factorial(x)", logf +# print "beta * exp - dot + log factorial", beta * numpy.exp(output) - target*output + f() +# print "total loss", numpy.sum(beta * numpy.exp(output) - target*output + f(), axis=axis) +# +## return beta * numpy.exp(output) - numpy.dot(target, output) +## #+ logfactorial(target) +# +#import numpy +#target = numpy.array([0, 0, 1, 1, 2, 2, 100, 100]) +##output = numpy.array([0., 0.5, 1., 0.5, 2., 0.5, 100., 0.5]) +#output = numpy.array([0., 1, 1., 0, 1, 0, 5, 1]) +#nlpoisson_nontheano(target, output) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/kalman.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/kalman.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,57 @@ + +""" +Modules and misc. code related to the Kalman Filter. + + +Kalman filter algorithm as presented in "Probabilistic Robotics" + +x_t is the state + +u_t is a control vector + +z_t is the observation vector + +\epsilon_t is a random noise term with zero mean and covariance R_t. + +\delta_t is a random noise term with zero mean and covariance Q_t. + +state (x_t) evolves according to + + x_t = A_t x_{t-1} + B_t u_t + \epsilon_t + +Observation z_t is made according to + + z_t = C_t x_t + \delta_t + +Assume that the distribution over initial states is a Gaussian. + +With these linear/Gaussian assumptions, the belief about the state all times t is Gaussian, so +we can represent it compactly by the mean (mu) and the covariance (sigma). + +""" + +class KalmanModule(Module): + """ + """ + def __init__(self): + + self.mu = Member() + self.sigma = Member() + + u, z = vector(), vector() + + # the formulas here work for A, B, R, C matrix or sparse matrix. + # ... anything that supports dot, +, -, dotinv, and transpose. + + A, B, C= matrix(), matrix(), matrix() + R, Q = matrix(), matrix() + + #algo from Probabilistic Robotics pg. 
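The commented-out prototype at the bottom of this file computes the same loss without Theano; a compact version, which like nlpoisson ignores the log-factorial term, is:

import numpy as np

def nlpoisson_numpy(target, output, beta_scale=1, axis=0):
    doclen = target.sum(axis=axis)
    lam = beta_scale * doclen * np.exp(output)   # same role as poissonlambda above
    return (lam - target * output).sum(axis=axis)

target = np.array([0, 0, 1, 1, 2, 2, 100, 100], dtype=float)
output = np.array([0., 1, 1., 0, 1, 0, 5, 1])
print(nlpoisson_numpy(target, output))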
42 + mu_bar = dot(A, self.mu) + dot(B, u) + sigma_bar = dot(A, self.sigma, A.T) + R + K = dot(sigma_bar, C.T, dotinv(dot(C, sigma_bar, C.T) + Q)) + mu_t = mu_bar + dot(K, z - dot(C,mu_bar)) + sigma_t = dot(ident - dot(K,C), sigma_bar) + + self.update = Method([u, z, A, B, C, R, Q], [], updates = {self.mu:mu_t, self.sigma:sigma_t}) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/onehotop.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/onehotop.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,58 @@ +""" +One hot Op +""" + +#from theano import tensor +from theano.tensor import as_tensor, Tensor +from theano.gof import op +from theano.gof.graph import Apply + +import numpy + +class OneHot(op.Op): + """ + Construct a one-hot vector, x out of y. + + @todo: Document inputs and outputs + @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64! + @todo: Use 'bool' as output dtype, not 'int64' ? + @todo: Allow this to operate on column vectors (Tensor) + @todo: Describe better. + """ + + def make_node(self, x, y): + """ + @type x: Vector L{Tensor} of integers + @param x: The entries of the one-hot vector to be one. + @type y: Integer scalar L{Tensor} + @param y: The length (#columns) of the one-hot vectors. + @return: A L{Tensor} of one-hot vectors + + @precondition: x < y for all entries of x + @todo: Check that x and y are int types + """ + x = as_tensor(x) + y = as_tensor(y) + #assert x.dtype[0:3] == "int" + #assert y.dtype[0:3] == "int" + inputs = [x, y] + ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])] + #outputs = [tensor.Tensor("float64", broadcastable=[False, False])] + #outputs = [Tensor("int64", broadcastable=[False, False])] + outputs = [Tensor("float64", broadcastable=[False, False]).make_result()] + node = Apply(op = self, inputs = inputs, outputs = outputs) + return node + + def perform(self, node, (x, y), (out, )): + assert x.dtype == "int64" or x.dtype == "int32" + assert x.ndim == 1 + assert y.dtype == "int64" or x.dtype == "int32" + assert y.ndim == 0 + out[0] = numpy.zeros((x.shape[0], y), dtype="float64") + for c in range(x.shape[0]): + assert x[c] < y + out[0][c, x[c]] = 1 + + def grad(self, (x, y), (out_gradient, )): + return None, None +one_hot = OneHot() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/stat_ops.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/stat_ops.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,92 @@ + +import theano +from theano import gof +from theano import tensor +import numpy + + +class ExampleWiseMean(gof.Op): + + def __init__(self): + self.destroy_map = {0: [1, 2]} + + def make_node(self, x): + return gof.Apply(self, + [x, tensor.value(float('nan')), tensor.value(0)], + [tensor.Tensor(dtype = 'float64', + broadcastable = x.type.broadcastable)()]) + + def perform(self, node, (x, sum, n), (out,)): + if numpy.isnan(sum).any(): + sum.resize(x.shape, refcheck=0) + sum[:] = x + else: + sum += x + n += 1 + out[0] = sum / n + + def c_code(self, name, node, (x, sum, n), (out, ), sub): + return """ + PyObject* multi; + int nelems; + if (isnan(((double*)(%(sum)s->data))[0])) { + PyArray_Dims dims; + dims.len = %(x)s->nd; + dims.ptr = %(x)s->dimensions; + PyArray_Resize(%(sum)s, &dims, 0, PyArray_CORDER); + multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s); + nelems = PyArray_SIZE(%(sum)s); + while (nelems--) { + // Copy %(x)s in %(sum)s + *(double*)PyArray_MultiIter_DATA(multi, 0) = *(double*)PyArray_MultiIter_DATA(multi, 1); + 
PyArray_MultiIter_NEXT(multi); + } + } + else { + // Add some error checking on the size of x + multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s); + nelems = PyArray_SIZE(%(sum)s); + while (nelems--) { + // Add %(x)s to %(sum)s + *(double*)PyArray_MultiIter_DATA(multi, 0) += *(double*)PyArray_MultiIter_DATA(multi, 1); + PyArray_MultiIter_NEXT(multi); + } + } + ((npy_int64*)(%(n)s->data))[0]++; + int n = ((npy_int64*)(%(n)s->data))[0]; + if (%(out)s == NULL) { + %(out)s = (PyArrayObject*)PyArray_EMPTY(%(sum)s->nd, %(sum)s->dimensions, NPY_FLOAT64, 0); + } + multi = PyArray_MultiIterNew(2, %(sum)s, %(out)s); + nelems = PyArray_SIZE(%(sum)s); + while (nelems--) { + // %(out)s <- %(sum)s / %(n)s + *(double*)PyArray_MultiIter_DATA(multi, 1) = *(double*)PyArray_MultiIter_DATA(multi, 0) / n; + PyArray_MultiIter_NEXT(multi); + } + """ % dict(locals(), **sub) + + + +if __name__ == '__main__': + + vectors = numpy.random.RandomState(666).rand(10, 2) + + x = tensor.dvector() + e = ExampleWiseMean()(x) + + # f = theano.function([x], [e], linker = 'py') + + # for i, v in enumerate(vectors): + # print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0) + + # print + + f = theano.function([x], [e], linker = 'c|py') + + for i, v in enumerate(vectors): + print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0) + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/sandbox/test_cost.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sandbox/test_cost.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,53 @@ +import pylearn.algorithms.sandbox.cost as cost + +import unittest +import theano as T +import theano.tensor as TT +import numpy + +class T_logfactorial(unittest.TestCase): + def test(self): + x = TT.as_tensor(range(10)) + o = cost.logfactorial(x) + f = T.function([],o) + self.failUnless(numpy.all(f() - numpy.asarray([0., 0., 1.38629436, 3.29583687, 5.54517744, 8.04718956, 10.75055682, 13.62137104, 16.63553233, 19.7750212])) < 1e-5) + + def test_float(self): + """ + This should fail because we can't use floats in logfactorial + """ + x = TT.as_tensor([0.5, 2.7]) + o = cost.logfactorial(x) + f = T.function([],o) +# print repr(f()) + self.failUnless(numpy.all(f() == numpy.asarray([0., 0., 1.38629436, 3.29583687, 5.54517744, 8.04718956, 10.75055682, 13.62137104, 16.63553233, 19.7750212]))) + +class T_nlpoisson(unittest.TestCase): + def test(self): + target = TT.as_tensor([0, 0, 1, 1, 2, 2, 100, 100]) + output = TT.as_tensor([0., 1, 1., 0, 1, 0, 5, 1]) + o = cost.nlpoisson(target, output) + f = T.function([],o) + self.failUnless(f() - 33751.7816277 < 1e-5) + + def test_gradient(self): + target = TT.as_tensor([0, 0, 1, 1, 2, 2, 100, 100]) + output = TT.as_tensor([0., 1, 1., 0, 1, 0, 5, 1]) + loss = cost.nlpoisson(target, output) + (goutput) = TT.grad(loss, [output]) +# (goutput) = TT.grad(loss, [target]) + f = T.function([], goutput) + print f() + self.failUnless(numpy.all(f() - numpy.asarray([206., 559.96605666, 558.96605666, 205., 557.96605666, 204., 30473.11077513, 459.96605666] < 1e-5))) + + def test_gradient_fail(self): + target = TT.as_tensor([0, 0, 1, 1, 2, 2, 100, 100]) + output = TT.as_tensor([0., 1, 1., 0, 1, 0, 5, 1]) + loss = cost.nlpoisson(target, output) + (goutput) = TT.grad(loss, [target]) + f = T.function([], goutput) + print f() + self.failUnless(numpy.all(f() - numpy.asarray([206., 559.96605666, 558.96605666, 205., 557.96605666, 204., 30473.11077513, 459.96605666] < 1e-5))) + +if __name__ == '__main__': + unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c 
pylearn/algorithms/sgd.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sgd.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,48 @@ +"""A stochastic gradient descent minimizer. (Possibly the simplest minimizer.) +""" + +import theano + +class StochasticGradientDescent(theano.Module): + """Fixed stepsize gradient descent""" + def __init__(self, args, cost, params, gradients=None, stepsize=None): + """ + :param stepsize: the step to take in (negative) gradient direction + :type stepsize: None, scalar value, or scalar TensorVariable + """ + super(StochasticGradientDescent, self).__init__() + self.stepsize_init = None + + if stepsize is None: + self.stepsize = theano.tensor.dscalar() + elif isinstance(stepsize, theano.tensor.TensorVariable): + self.stepsize = stepsize + else: + self.stepsize = (theano.tensor.as_tensor_variable(stepsize)) + + if self.stepsize.ndim != 0: + raise TypeError('stepsize must be a scalar', stepsize) + + self.params = params + self.gparams = theano.tensor.grad(cost, self.params) if gradients is None else gradients + + self.updates = dict((p, p - self.stepsize * g) for p, g in zip(self.params, self.gparams)) + + self.step = theano.Method( + args, [], + updates=self.updates) + self.step_cost = theano.Method( + args, cost, + updates=self.updates) + + def _instance_initialize(self, obj): + pass + +def sgd_minimizer(stepsize=None): + """Curry the stepsize argument to StochasticGradientDescent, providing standard minimizer interface + + :returns: standard minimizer constructor f(args, cost, params, gradient=None) + """ + def f(args, cost, params, gradient=None): + return StochasticGradientDescent(args, cost, params, gradient, stepsize) + return f diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/stacker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/stacker.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,102 @@ + +# for example in examples: +# repr = example +# for layer in stacked.layers: +# layer.update(repr) +# repr = layer.representation(repr) + +import theano +from theano import tensor as T +from theano.tensor.deprecated import rmodule +import sys +import numpy as N + +class Stacker(rmodule.RModule): + """ + @note: Assumes some names in the layers: input, cost, lr, and update + @todo: Maybe compile functions on demand, rather than immediately. 
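Usage of the StochasticGradientDescent module added above mirrors pylearn/algorithms/tests/test_sgd.py later in this changeset: minimize (1 - x*y)^2 over y for a fixed input x, with a constant stepsize.

import theano
from pylearn.algorithms import sgd

x = theano.tensor.dscalar('x')
y = theano.tensor.dscalar('y')

M = sgd.StochasticGradientDescent([x], (1.0 - x * y) ** 2, [y], stepsize=0.01)
M.y = y                      # expose y so the instance can be initialized
m = M.make()
m.y = 5.0
for i in range(100):
    cost = m.step_cost(3.0)  # one update of y, returning the current cost
# y converges towards 1/3; the test asserts |y - 1/3| < 1e-4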
+ """ + + def __init__(self, submodules, input = None, regularize = False): + super(Stacker, self).__init__() + + current = input + layers = [] + for i, (submodule, outname) in enumerate(submodules): + layer = submodule(current, regularize = regularize) + layers.append(layer) + current = layer[outname] + self.layers = layers + + self.input = self.layers[0].input + self.output = current + + representation = [] + local_update = [] + global_update = [] + to_update = [] + all_kits = [] + for layer, (submodule, outname) in zip(layers, submodules): + u = layer.update + u.resolve_all() + to_update += u.updates.keys() + all_kits += u.kits + # the input is the whole deep model's input instead of the layer's own + # input (which is previous_layer[outname]) + inputs = [self.input] + u.inputs[1:] + method = theano.Method(inputs, u.outputs, u.updates, u.kits) + local_update.append(method) + global_update.append( + theano.Method(inputs, + u.outputs, + # we update the params of the previous layers too but wrt + # this layer's cost + dict((param, param - layer.lr * T.grad(layer.cost, param)) + for param in to_update), + list(all_kits))) + representation.append(theano.Method(self.input, layer[outname])) + +# @todo: Add diagnostics +# self.diagnose_from_input = Method([self.input], self.layers[0].diagnose.outputs + self.layers[1].diagnose.outputs ... + + self.local_update = local_update + self.global_update = global_update + self.representation = representation + self.update = self.global_update[-1] + self.compute = theano.Method(self.input, self.output) + ll = self.layers[-1] + for name, method in ll.components_map(): + if isinstance(method, theano.Method) and not hasattr(self, name): + m = method.dup() + m.resolve_all() + m.inputs = [self.input if x is ll.input else x for x in m.inputs] + setattr(self, name, m) + + def _instance_initialize(self, obj, nunits = None, lr = 0.01, seed = None, **kwargs): + super(Stacker, self)._instance_initialize(obj, **kwargs) + if seed is not None: + R = N.random.RandomState(seed) + else: + R = N.random + for layer in obj.layers: + if layer.lr is None: + layer.lr = lr + if nunits: + obj.input_dimension = nunits[0] + obj.output_dimension = nunits[-1] + if len(nunits) != len(obj.layers) + 1: + raise ValueError('You should give exactly one more unit numbers as there are layers.') + for ni, no, layer in zip(nunits[:-1], nunits[1:], obj.layers): + if seed is not None: + layer.initialize(ni, no, seed = R.random_integers(sys.maxint - 1)) + else: + layer.initialize(ni, no) + if seed is not None: + obj.seed(seed) + + def _instance_flops_approx(self, obj): + rval = 0 + for layer in obj.layers: + rval += layer.flops_approx() + return rval + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/stopper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/stopper.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,125 @@ +"""Early stopping iterators + +The idea here is to supply early-stopping heuristics that can be used in the +form: + + stopper = SomeEarlyStopper() + + for i in stopper(): + # train from data + if i.set_score: + i.score = validation_score + + +So far I only have one heuristic, so maybe this won't scale. 
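The training protocol intended for the Stacker above (greedy unsupervised passes through local_update, then supervised updates through the whole stack) is exercised by pylearn/algorithms/tests/test_daa.py later in this changeset; in outline:

import pylearn.algorithms as models

ndaa = 3
daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa
                     + [(models.BinRegressor, 'output')],
                     regularize=False)
model = daa.make([4, 20, 20, 20, 1], lr=0.01, seed=10)
for l in range(ndaa):
    model.layers[l].noise_level = 0.3

inputs = ([[0, 1, 0, 1]], [[1, 0, 1, 0]])
targets = ([[1]], [[0]])
for l in range(ndaa):                  # unsupervised, one layer at a time
    for x in inputs:
        model.local_update[l](x)
for x, y in zip(inputs, targets):      # supervised pass through the whole stack
    model.update(x, y)
print(model.classify([[0, 1, 0, 1]]))

(The test repeats each of these passes ten times before checking the predictions.)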
+""" + +class Stopper(object): + + def train(self, data, update_rows_fn, update, validate, save=None): + """Return the best model trained on data + + Parameters: + data - a thing that accepts getitem(), or a tuple of such things + update_rows_fn - fn : int --> + update - fn: update an internal model from elements of data + validate - fn: evaluate an internal model based on elements of data + save - fn: return a copy of the internal model + + The body of this function exhausts the iterator, and trains a + model using early stopping in the process. + """ + + best = None + for stp in self: + i = stp.iter + + # call update on some training set rows + t_rows = update_rows_fn(i) + if isinstance(data, (tuple, list)): + update(*[d[t_rows] for d in data]) + else: + update(data[t_rows]) + + if stp.set_score: + stp.score = validate() + if (stp.score < stp.best_score) and save: + best = save() + return best + + def find_min(self, step, check, save): + best = None + for stp in self: + step() + if stp.set_score: + stp.score = check() + if (stp.score < stp.best_score) and save: + best = (save(), stp.iter, stp.score) + return best + +class ICML08Stopper(Stopper): + @staticmethod + def icml08(ntrain, batchsize): + """Some setting similar to what I used for ICML08 submission""" + #TODO: what did I actually use? put that in here. + return ICML08Stopper(30*ntrain/batchsize, + ntrain/batchsize, 0.96, 2.0, 100000000) + + def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit): + self.initial_wait = i_wait + self.set_score_interval = v_int + self.min_improvement = min_improvement + self.patience = patience + self.hard_limit = hard_limit + + self.best_score = float('inf') + self.best_iter = -1 + self.iter = -1 + + self.set_score = False + self.score = None + + def __iter__(self): + return self + + E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score' + def next(self): + + #print 'ICML08 stopper, were doing a next' + + if self.set_score: #left over from last time + if self.score is None: + raise Exception(ICML08Stopper.E_set_score) + if self.score < (self.best_score * self.min_improvement): + (self.best_score, self.best_iter) = (self.score, self.iter) + self.score = None #un-set it + + + starting = self.iter < self.initial_wait + waiting = self.iter < (self.patience * self.best_iter) + if starting or waiting: + # continue to iterate + self.iter += 1 + if self.iter == self.hard_limit: + raise StopIteration + self.set_score = (self.iter % self.set_score_interval == 0) + return self + + raise StopIteration + +class NStages(ICML08Stopper): + """Run for a fixed number of steps, checking validation set every so + often.""" + def __init__(self, hard_limit, v_int): + ICML08Stopper.__init__(self, hard_limit, v_int, 1.0, 1.0, hard_limit) + + #TODO: could optimize next() function. 
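A minimal, self-contained use of find_min; the quadratic "model" below is only a stand-in to exercise the stopping logic, not part of the library.

from pylearn.algorithms.stopper import ICML08Stopper

theta = [0.0]                           # toy parameter, walked towards 1.5

def step():
    theta[0] += 0.05

def check():
    return (theta[0] - 1.5) ** 2        # validation score: smaller is better

def save():
    return theta[0]                     # snapshot of the best parameter so far

stopper = ICML08Stopper(i_wait=30, v_int=5, min_improvement=0.99,
                        patience=2.0, hard_limit=500)
best = stopper.find_min(step, check, save)   # (best_snapshot, iter, score), or None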
Most of what's in ICML08Stopper.next() + #is not necessary + +def geometric_patience(i_wait, v_int, min_improvement, patience, hard_limit): + return ICML08Stopper(i_wait, v_int, min_improvement, patience, hard_limit) + +def nstages(hard_limit, v_int): + return ICML08Stopper(hard_limit, v_int, 1.0, 1.0, hard_limit) + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_aa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_aa.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,51 @@ +#from __future__ import absolute_imports + +from pylearn.algorithms import aa as models +import theano +import numpy +import time + + +def test_train(mode = theano.Mode('c|py', 'fast_run')): + + aa = models.SigmoidXEAutoEncoder(regularize = False) +# print aa.update.pretty(mode = theano.Mode('py', 'fast_run').excluding('inplace')) + + model = aa.make(lr = 0.01, + input_size = 100, + hidden_size = 1000, + mode = mode) + + data = [[0, 1, 0, 0, 1, 1, 1, 0, 1, 0]*10]*10 + #data = numpy.random.rand(10, 100) + + t1 = time.time() + for i in xrange(1001): + cost = model.update(data) + if i % 100 == 0: + print i, cost + t2 = time.time() + return t2 - t1 + +if __name__ == '__main__': + numpy.random.seed(10) + print 'sanity check:' + t1 = test_train('SANITY_CHECK') +# t1 = test_train([theano.Mode('c|py', 'fast_compile'), +# theano.Mode('c|py', 'fast_run')]) + print 'time:',t1 + print + + numpy.random.seed(10) + print 'optimized:' + t1 = test_train(theano.Mode('c|py', 'fast_run')) + print 'time:',t1 + print + + numpy.random.seed(10) + print 'not optimized:' + t2 = test_train(theano.Mode('c|py', 'fast_compile')) + print 'time:',t2 + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_daa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_daa.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,90 @@ +#!/usr/bin/python + +from pylearn import algorithms as models +import theano +import numpy +import time + +import pylearn.algorithms.logistic_regression + +def test_train_daa(mode = theano.Mode('c|py', 'fast_run')): + + ndaa = 3 + daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(models.BinRegressor, 'output')], + regularize = False) + + model = daa.make([4, 20, 20, 20, 1], + lr = 0.01, + mode = mode, + seed = 10) + + model.layers[0].noise_level = 0.3 + model.layers[1].noise_level = 0.3 + model.layers[2].noise_level = 0.3 + + # Update the first hidden layer + for l in range(3): + for i in range(10): + model.local_update[l]([[0, 1, 0, 1]]) + model.local_update[l]([[1, 0, 1, 0]]) + + for i in range(10): + model.update([[0, 1, 0, 1]], [[1]]) + model.update([[1, 0, 1, 0]], [[0]]) + print model.classify([[0, 1, 0, 1]]) + print model.classify([[1, 0, 1, 0]]) + + +def test_train_daa2(mode = theano.Mode('c|py', 'fast_run')): + + ndaa = 3 + daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')], + regularize = False) + + model = daa.make([4] + [20] * ndaa + [10], + lr = 0.01, + mode = mode, + seed = 10) + + for l in range(ndaa): model.layers[l].noise_level = 0.3 + + instances = [([[0, 1, 0, 1]], [1]), ([[1, 0, 1, 0]], [0])] + + for l in range(ndaa): + for i in range(10): + for (input, output) in instances: + model.local_update[l](input) + + for i in range(10): + for (input, output) in instances: +# model.update(input, output) + print "OLD:", + print model.validate(input, output) + oldloss = model.update(input, output) + print oldloss + print "NEW:" 
+ print model.validate(input, output) + print + + print model.apply([[0, 1, 0, 1]]) + print model.apply([[1, 0, 1, 0]]) + + + + +if __name__ == '__main__': +# print 'optimized:' +# t1 = test_train_daa(theano.Mode('py', 'fast_compile')) +# t1 = test_train_daa(theano.Mode('c|py', 'fast_run')) +# print 'time:',t1 +# print + +# print 'not optimized:' +# t2 = test_train_daa(theano.Mode('c|py', 'fast_compile')) +## print 'time:',t2 + +# test_train_daa(theano.compile.Mode('c&py', 'merge')) +# test_train_daa(theano.compile.Mode('c|py', 'merge')) + test_train_daa(theano.compile.Mode('py', 'merge')) + + test_train_daa2(theano.compile.Mode('c|py', 'merge')) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_linear_regression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_linear_regression.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,38 @@ + +import unittest +from pylearn.algorithms.linear_regression import * +from make_test_datasets import * +import numpy + +class test_linear_regression(unittest.TestCase): + + def test1(self): + trainset,testset,theta=make_artificial_datasets_from_function(n_inputs=3, + n_targets=2, + n_examples=100, + f=linear_predictor) + + assert trainset.fields()['input'].shape==(50,3) + assert testset.fields()['target'].shape==(50,2) + regressor = LinearRegression(L2_regularizer=0.1) + predictor = regressor(trainset) + test_data = testset.fields() + mse = predictor.compute_mse(test_data['input'],test_data['target']) + print 'mse = ',mse + +if __name__ == '__main__': + import sys + + if len(sys.argv)==1: + unittest.main() + else: + assert sys.argv[1]=="--debug" + tests = [] + for arg in sys.argv[2:]: + tests.append(arg) + if tests: + unittest.TestSuite(map(T_DataSet, tests)).debug() + else: + module = __import__("_test_linear_regression") + tests = unittest.TestLoader().loadTestsFromModule(module) + tests.debug() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_logistic_regression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_logistic_regression.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,60 @@ +from pylearn.algorithms.logistic_regression import * +import sys, time + +if __name__ == '__main__': + pprint.assign(nnet.crossentropy_softmax_1hot_with_bias_dx, printing.FunctionPrinter('xsoftmaxdx')) + pprint.assign(nnet.crossentropy_softmax_argmax_1hot_with_bias, printing.FunctionPrinter('nll', 'softmax', 'argmax')) + if 1: + lrc = Module_Nclass() + + print '================' + print lrc.update.pretty() + print '================' + print lrc.update.pretty(mode = theano.Mode('py', 'fast_run')) + print '================' +# print lrc.update.pretty(mode = compile.FAST_RUN.excluding('inplace')) +# print '================' + +# sys.exit(0) + + lr = lrc.make(10, 2, mode=theano.Mode('c|py', 'fast_run')) + #lr = lrc.make(10, 2, mode=compile.FAST_RUN.excluding('fast_run')) + #lr = lrc.make(10, 2, mode=theano.Mode('py', 'merge')) #'FAST_RUN') + + data_x = N.random.randn(5, 10) + data_y = (N.random.randn(5) > 0) + + t = time.time() + for i in xrange(10000): + lr.lr = 0.02 + xe = lr.update(data_x, data_y) + #if i % 100 == 0: + # print i, xe + + print 'training time:', time.time() - t + print 'final error', xe + + #print + #print 'TRAINED MODEL:' + #print lr + + if 0: + lrc = Module() + + lr = lrc.make(10, mode=theano.Mode('c|py', 'merge')) #'FAST_RUN') + + data_x = N.random.randn(5, 10) + data_y = (N.random.randn(5, 1) > 0) + + for i in xrange(10000): + xe = lr.update(data_x, data_y) + if 
i % 100 == 0: + print i, xe + + print + print 'TRAINED MODEL:' + print lr + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_regressor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_regressor.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,46 @@ + + +import pylearn.algorithms as models +import theano +import numpy +import time + + +def test_train(mode = theano.Mode('c|py', 'fast_run')): + + reg = models.BinRegressor(regularize = False) + + model = reg.make(lr = 0.01, + input_size = 100, + mode = mode, + seed = 10) + +# data = [[0, 1, 0, 0, 1, 1, 1, 0, 1, 0]*10]*10 +# targets = [[1]]*10 + #data = numpy.random.rand(10, 100) + + R = numpy.random.RandomState(100) + t1 = time.time() + for i in xrange(1001): + data = R.random_integers(0, 1, size = (10, 100)) + targets = data[:, 6].reshape((10, 1)) + cost = model.update(data, targets) + if i % 100 == 0: + print i, '\t', cost, '\t', 1*(targets.T == model.classify(data).T) + t2 = time.time() + return t2 - t1 + +if __name__ == '__main__': + print 'optimized:' + t1 = test_train(theano.Mode('c|py', 'fast_run')) + print 'time:',t1 + print + + print 'not optimized:' + t2 = test_train(theano.Mode('c|py', 'fast_compile')) + print 'time:',t2 + + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_sgd.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_sgd.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,68 @@ +import theano +from pylearn.algorithms import sgd + +def test_sgd0(): + + x = theano.tensor.dscalar('x') + y = theano.tensor.dscalar('y') + + M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=0.01) + M.y = y + m = M.make() + m.y = 5.0 + for i in xrange(100): + c = m.step_cost(3.0) + # print c, m.y + + assert c < 1.0e-5 + assert abs(m.y - (1.0 / 3)) < 1.0e-4 + +def test_sgd_stepsize_variable(): + + x = theano.tensor.dscalar('x') + y = theano.tensor.dscalar('y') + lr = theano.tensor.dscalar('lr') + + M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=lr) + M.y = y + M.lr = lr + m = M.make() + m.y = 5.0 + m.lr = 0.01 + for i in xrange(100): + c = m.step_cost(3.0) + # print c, m.y + + assert c < 1.0e-5 + assert abs(m.y - (1.0 / 3)) < 1.0e-4 + + + #test that changing the lr has impact + + m.y = 5.0 + m.lr = 0.0 + for i in xrange(10): + c = m.step_cost(3.0) + # print c, m.y + + assert m.y == 5.0 + +def test_sgd_stepsize_none(): + + x = theano.tensor.dscalar('x') + y = theano.tensor.dscalar('y') + + M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y]) + M.y = y + m = M.make() + m.y = 5.0 + #there should be a learning rate here by default + assert m.stepsize is None + m.stepsize = 0.01 + for i in xrange(100): + c = m.step_cost(3.0) + # print c, m.y + + assert c < 1.0e-5 + assert abs(m.y - (1.0 / 3)) < 1.0e-4 + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/tests/test_stacker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_stacker.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,43 @@ + +import pylearn.algorithms as models +import theano +import numpy +import time + + +def test_train(mode = theano.Mode('c|py', 'fast_run')): + + reg = models.Stacker([(models.BinRegressor, 'output'), (models.BinRegressor, 'output')], + regularize = False) + #print reg.global_update[1].pretty(mode = mode.excluding('inplace')) + + model = reg.make([100, 200, 1], + lr = 0.01, + mode = mode, + seed = 10) + + R = numpy.random.RandomState(100) + t1 = time.time() + for i in xrange(1001): + 
data = R.random_integers(0, 1, size = (10, 100)) + targets = data[:, 6].reshape((10, 1)) + cost = model.update(data, targets) + if i % 100 == 0: + print i, '\t', cost, '\t', 1*(targets.T == model.classify(data).T) + t2 = time.time() + return t2 - t1 + +if __name__ == '__main__': + print 'optimized:' + t1 = test_train(theano.Mode('c|py', 'fast_run')) + print 'time:',t1 + print + + print 'not optimized:' + t2 = test_train(theano.Mode('c|py', 'fast_compile')) + print 'time:',t2 + + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/algorithms/weights.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/weights.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,40 @@ +""" +Routine to initialize weights. + +@note: We assume that numpy.random.seed() has already been performed. +""" + +from math import pow, sqrt +import numpy.random + +sqrt3 = sqrt(3.0) +def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5): + """ + Generate an initial weight matrix with nin inputs (rows) and nout + outputs (cols). + Each weight is chosen uniformly at random to be in range: + [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)] + @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3 + power=0.5 is strongly recommended (see below). + + Suppose these weights w are used in dot products as follows: + output = w' input + If w ~ Uniform(-r,r) and Var[input_i]=1 and the input_i's are independent, then + Var[w] = r^2/3 + Var[output] = Var[ sum_{i=1}^d w_i input_i] = d r^2 / 3 + To make sure that variance is not changed after the dot product, + we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice + corresponds to scale_by=1 and power=0.5 (note that the default here is scale_by=1./sqrt3; see below). + More generally we see that Var[output] = Var[input] * scale_by^2. + + Now, if these are weights in a deep multi-layer neural network, + we would like the top layers to be initially more linear, so as to let + gradients flow back more easily (this is an explanation by Ronan Collobert). + To achieve this we want scale_by smaller than 1. + Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1 + in the experiment of his ICML'2008 paper. + Note that if we have a multi-layer network, ignoring the effect of the tanh non-linearity, + the variance of the layer outputs would go down roughly by a factor scale_by^2 at each + layer (making the layers more linear as we go up towards the output). + """ + return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin,power) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/MNIST.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/MNIST.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,81 @@ +""" +Various routines to load/access MNIST data. +""" +from __future__ import absolute_import + +import os +import numpy + +from ..io.amat import AMat +from .config import data_root # config +from .dataset import Dataset + +def head(n=10, path=None): + """Load the first N MNIST examples. + + Returns two matrices: x, y. x has N rows of 784 columns. Each row of x represents the + 28x28 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] + is the label of the i'th row of x.
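# A quick numerical check of the variance argument in random_weights() above, as a
# minimal sketch using only numpy; the layer sizes and sample count are arbitrary
# choices for illustration, not values used anywhere in pylearn.
import numpy

nin, nout, scale_by, power = 500, 200, 1.0, 0.5
r = scale_by * numpy.sqrt(3.0) / nin ** power        # half-width of the uniform range
W = (numpy.random.rand(nin, nout) * 2.0 - 1.0) * r   # same recipe as random_weights()
x = numpy.random.randn(5000, nin)                    # inputs with Var[input_i] = 1
out = numpy.dot(x, W)
# empirical Var[output] should be close to scale_by**2 * Var[input] = 1.0 here
print numpy.var(out), scale_by ** 2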
+ + """ + path = os.path.join(data_root(), 'mnist','mnist_with_header.amat') if path is None else path + + dat = AMat(path=path, head=n) + + try: + assert dat.input.shape[0] == n + assert dat.target.shape[0] == n + except Exception , e: + raise Exception("failed to read MNIST data", (dat, e)) + + return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0]) + +def all(path=None): + return head(n=None, path=path) + +def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None): + all_x, all_targ = head(ntrain+nvalid+ntest, path=path) + + rval = Dataset() + + rval.train = Dataset.Obj(x=all_x[0:ntrain], + y=all_targ[0:ntrain]) + rval.valid = Dataset.Obj(x=all_x[ntrain:ntrain+nvalid], + y=all_targ[ntrain:ntrain+nvalid]) + rval.test = Dataset.Obj(x=all_x[ntrain+nvalid:ntrain+nvalid+ntest], + y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest]) + + rval.n_classes = 10 + rval.img_shape = (28,28) + return rval + + +def full(): + return train_valid_test() + +#usefull for test, keep it +def first_10(): + return train_valid_test(ntrain=10, nvalid=10, ntest=10) + +#usefull for test, keep it +def first_100(): + return train_valid_test(ntrain=100, nvalid=100, ntest=100) + +def first_1k(): + return train_valid_test(ntrain=1000, nvalid=200, ntest=200) + +def first_10k(): + return train_valid_test(ntrain=10000, nvalid=2000, ntest=2000) + +#old method from factory idea days... delete when ready -JB20090119 +def mnist_factory(variant="", ntrain=None, nvalid=None, ntest=None): + if variant=="": + return full() + elif variant=="1k": + return first_1k() + elif variant=="10k": + return first_10k() + elif variant=="custom": + return train_valid_test(ntrain=ntrain, nvalid=nvalid, ntest=ntest) + else: + raise Exception('Unknown MNIST variant', variant) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/__init__.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,1 @@ +from dataset import make_dataset, Dataset diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/config.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/config.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,18 @@ +"""Configuration options for datasets + + +Especially, the locations of data files. +""" + +import os, sys +def env_get(key, default, key2 = None): + if key2 and os.getenv(key) is None: + key=key2 + if os.getenv(key) is None: + print >> sys.stderr, "WARNING: Environment variable", key, + print >> sys.stderr, "is not set. Using default of", default + return default if os.getenv(key) is None else os.getenv(key) + +def data_root(): + return env_get('PYLEARN_DATA_ROOT', os.getenv('HOME')+'/data', 'DBPATH') + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/dataset.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,120 @@ +"""The dataset-from-descriptor mechanism.""" + +_datasets = {} + +def add_dataset_factory(family, fn): + """Add `fn` as the handler for descriptors whose first token is `family`. + + :returns: None + + """ + if family in _datasets: + raise Exception('dataset identifier already in use:', family) + else: + _datasets[family] = fn + +def dataset_factory(family): + """Register a function as the handler for a given kind of dataset, identified by `family`. 
+ + When someone calls dataset_from_descr('kind_of_dataset option1 option2, etc.', approx=1), + then the handler registered for 'kind_of_dataset' will be called with the same arguments as + dataset_from_descr. + + .. code-block:: python + + @dataset_factory('MNIST') + def mnist_related_dataset(descr, **kwargs): + ... + + :returns: `decorator` + """ + def decorator(fn): + add_dataset_factory(family, fn) + return fn + return decorator + +def make_dataset(family, **kwargs): + """Return the dataset registered under `family`, constructed with the given keyword arguments. + + :param family: a dataset family identifier + :type family: str + :returns: `Dataset` + + """ + return _datasets[family](**kwargs) + + +class Dataset(object): + class Obj(object): + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + """Dataset is a generic container for pylearn datasets. + + It is not intended to put any restriction whatsoever on its contents. + + It is intended to encourage certain conventions, described below. Conventions should arise + naturally among datasets in PyLearn. When a few datasets adhere to a new convention, then + describe it here and make it more official. + + If no particular convention applies, create your own object to store the dataset, and + assign it to the `data` attribute. + """ + data = None + + """ + SIMPLE REGRESSION / CLASSIFICATION + ---------------------------------- + + In this setting, you are aiming to do vector classification or vector regression + where your train, valid and test sets fit in memory. + The convention is to put your data into numpy ndarray instances. Put training data in the + `train` attribute, validation data in the `valid` attribute and test data in the `test` + attribute. + Each of those attributes should be an instance that defines at least two attributes: `x` for the + input matrix and `y` for the target matrix. The `x` ndarray should be one example per + leading index (row for matrices). + The `y` ndarray should be one target per leading index (entry for vectors, row for matrices). + If `y` is a classification target, then it should be a vector with numpy dtype 'int32'. + + If there are weights associated with different examples, then create a 'weights' attribute whose + value is a vector with one floating-point value (typically double-precision) per example. + + If the task is classification, then the classes should be mapped to the integers + 0,1,...,N-1. + The number of classes (here, N) should be stored in the `n_classes` attribute. + + """ + train = None #instance with .x, .y + + valid = None #instance with .x, .y + + test = None #instance with .x, .y + + n_classes = None #int + + """ + WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES + ------------------------------------------- + + In this setting we typically encode images as vectors, by enumerating the pixel values in + left-to-right, top-to-bottom order. Pixel values should be in floating-point, and + normalized between 0 and 1. + + The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows, + cols). + + """ + + img_shape = None # (rows, cols) + + + """ + TIMESERIES + ---------- + + When dealing with examples which are themselves timeseries, put each example timeseries in a + tensor and make a list of them.
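# A minimal sketch of the simple classification convention described above, using the
# Dataset and Dataset.Obj containers from this file; the random arrays stand in for
# real data and assume the pylearn package is importable.
import numpy
from pylearn.datasets.dataset import Dataset

rng = numpy.random.RandomState(0)
ds = Dataset()
ds.train = Dataset.Obj(x=rng.rand(80, 5), y=rng.randint(0, 3, 80).astype('int32'))
ds.valid = Dataset.Obj(x=rng.rand(10, 5), y=rng.randint(0, 3, 10).astype('int32'))
ds.test = Dataset.Obj(x=rng.rand(10, 5), y=rng.randint(0, 3, 10).astype('int32'))
ds.n_classes = 3
print ds.train.x.shape, ds.train.y.dtype, ds.n_classes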
Generally use tensors, and resort to lists or arrays + wherever different + """ + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/README.txt Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,3 @@ +Messy scripts for working with Jason + Ronan's embeddings. + +Parameters are given in parameters.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/__init__.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,1 @@ +from process import * diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/convert.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/convert.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,15 @@ +#!/usr/bin/python +""" +Convert stdin sentences to word embeddings, and output YAML. +""" + +import sys, string +import read +import yaml + +output = [] +for l in sys.stdin: + l = string.strip(l) + output.append((l, read.convert_string(l))) + +print yaml.dump(output) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/one-per-line.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/one-per-line.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/python + +import string +#import psyco + +weightsfile = "lm-weights.txt" +vocabfile = "words.asc" +size = 30000 +dimensions = 50 + +import numpy, math +import sys +from percent import percent + +word_to_vector = {} + +f = open(weightsfile) +f.readline() +vals = [float(v) for v in string.split(f.readline())] +assert len(vals) == size * dimensions +vals.reverse() +#for i in range(size): +r = range(size) +r.reverse() +for i in r: + l = vals[dimensions*i:dimensions*(i+1)] + print string.join([`s` for s in l], "\t") diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/parameters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/parameters.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,10 @@ +""" +Locations of the embedding data files. 
+""" +WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt" +VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc" +#WEIGHTSFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt" +#VOCABFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc" +NUMBER_OF_WORDS = 30000 +DIMENSIONS = 50 +UNKNOWN = "UNKNOWN" diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/percent.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/percent.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,9 @@ +def percent(a, b): + """ + Return percentage string of a and b, e.g.: + "1 of 10 (10%)" + """ + assert a <= b + assert a >= 0 + assert b > 0 + return "%s of %s (%.2f%%)" % (a, b, 100.*a/b) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/process.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/process.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,136 @@ +""" +Read in the weights file +""" + +import string +import sys + +from parameters import * + +__words = None +__word_to_embedding = None +__read = False + +def length(): + """ + @return: The length of embeddings + """ + return len(__word_to_embedding[__words[0]]) + +def word_to_embedding(w): + read_embeddings() + return __word_to_embedding[w] + +def read_embeddings(): + global __words + global __word_to_embedding + global __read + if __read: return + + __words = [string.strip(w) for w in open(VOCABFILE).readlines()] + assert len(__words) == NUMBER_OF_WORDS + + import numpy, math + from percent import percent + + __word_to_embedding = {} + + sys.stderr.write("Reading %s...\n" % WEIGHTSFILE) + f = open(WEIGHTSFILE) + f.readline() + vals = [float(v) for v in string.split(f.readline())] + assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS + for i in range(NUMBER_OF_WORDS): + l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)] + w = __words[i] + __word_to_embedding[w] = l + __read = True + for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w]) + sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) + +import re +numberre = re.compile("[0-9]") +slashre = re.compile("\\\/") + +def preprocess_word(origw): + """ + Convert a word so that it can be embedded directly. + Returned the preprocessed sequence. + @note: Preprocessing is appropriate for Penn Treebank style documents. + #@note: Perhaps run L{common.penntreebank.preprocess} on the word first. + """ + read_embeddings() + if origw == "-LRB-": w = "(" + elif origw == "-RRB-": w = ")" + elif origw == "-LCB-": w = "{" + elif origw == "-RCB-": w = "}" + elif origw == "-LSB-": w = "[" + elif origw == "-RSB-": w = "]" + else: + w = origw + if w not in __word_to_embedding: + w = string.lower(w) + w = slashre.sub("/", w) + w = numberre.sub("NUMBER", w) +# if w not in __word_to_embedding: +# w = string.lower(w) +# w = numberre.sub("NUMBER", w) + if w not in __word_to_embedding: +# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) + w = UNKNOWN + assert w in __word_to_embedding + return w + +def preprocess_seq(l): + """ + Convert a sequence so that it can be embedded directly. + Returned the preprocessed sequence. + @note: Preprocessing is appropriate for Penn Treebank style documents. 
+ """ + read_embeddings() + lnew = [] + for origw in l: + w = preprocess_word(origw) + lnew.append(w) + return lnew + +#def convert_string(s, strict=False): +# """ +# Convert a string to a sequence of embeddings. +# @param strict: If strict, then words *must* be in the vocabulary. +# @todo: DEPRECATED Remove this function. +# """ +# read_embeddings() +# e = [] +# for origw in string.split(string.lower(s)): +# w = numberre.sub("NUMBER", origw) +# if w in __word_to_embedding: +# e.append(__word_to_embedding[w]) +# else: +# sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) +# assert not strict +# e.append(__word_to_embedding[UNKNOWN]) +# return e + +#def test(): +# """ +# Debugging code. +# """ +# read_embeddings() +# for w in __word_to_embedding: +# assert len(__word_to_embedding[w]) == 50 +# import numpy +# for w1 in __words: +# e1 = numpy.asarray(__word_to_embedding[w1]) +# lst = [] +# print w1, numpy.dot(e1, e1) +# for w2 in __word_to_embedding: +# if w1 >= w2: continue +# e2 = numpy.asarray(__word_to_embedding[w2]) +# d = (e1 - e2) +# l2 = numpy.dot(d, d) +# lst.append((l2, w1, w2)) +# lst.sort() +# print lst[:10] +# +#test() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/embeddings/read-original.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/embeddings/read-original.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,47 @@ +#!/usr/bin/python + +import string +#import psyco + +weightsfile = "lm-weights.txt" +vocabfile = "words.asc" +size = 30000 +dimensions = 50 + +words = [string.strip(w) for w in open(vocabfile).readlines()] +assert len(words) == 30000 + +import numpy, math +import sys +from percent import percent + +word_to_vector = {} + +f = open(weightsfile) +f.readline() +vals = [float(v) for v in string.split(f.readline())] +assert len(vals) == size * dimensions +vals.reverse() +for i in range(size): + l = vals[dimensions*i:dimensions*(i+1)] + w = words[i] + word_to_vector[w] = l + +# l2 = numpy.asarray(l) +# print math.fabs(50 - numpy.sum(l2*l2)), w + +cnt = 0 +for i1 in range(len(words)): + for i2 in range(len(words)): + w1 = words[i1] + w2 = words[i2] + cnt += 1 + if i1 <= i2: continue + l1 = numpy.asarray(word_to_vector[w1]) + l2 = numpy.asarray(word_to_vector[w2]) + d = l2 - l1 + dist = numpy.sum(d * d) + if dist < 50: + print numpy.sum(d * d), w1, w2, i1, i2 + if cnt % 1000 == 0: + sys.stderr.write("%s done...\n" % percent(cnt, len(word_to_vector) * len(word_to_vector))) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/flickr.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/flickr.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,71 @@ +""" +Routines to load variations on the Flickr image dataset. +""" +from __future__ import absolute_import + +import os +import numpy + +from ..io import filetensor +from .config import data_root +from .dataset import Dataset + + +path_test_10class ='flickr_10classes_test.ft' + +path_train_10class = 'flickr_10classes_train.ft' + +path_valid_10class = 'flickr_10classes_valid.ft' + +def basic_10class(folder = None): + """Return the basic flickr image classification problem. + The images are 75x75, and there are 7500 training examples. 
+ """ + root = os.path.join(data_root(), 'flickr') if folder is None else folder + train = filetensor.read(open(os.path.join(root, path_train_10class))) + valid = filetensor.read(open(os.path.join(root, path_valid_10class))) + test = filetensor.read(open(os.path.join(root, path_test_10class))) + + assert train.shape[1] == 75*75 +1 + assert valid.shape[1] == 75*75 +1 + assert test.shape[1] == 75*75 +1 + + rval = Dataset() + + rval.train = Dataset.Obj( + x=train[:, 0:-1], + y=numpy.asarray(train[:, -1], dtype='int64')) + rval.valid = Dataset.Obj( + x=valid[:, 0:-1], + y=numpy.asarray(valid[:, -1], dtype='int64')) + rval.test = Dataset.Obj( + x=test[:, 0:-1], + y=numpy.asarray(test[:, -1], dtype='int64')) + + rval.n_classes = 10 + rval.img_shape = (75,75) + + return rval + +def translations_10class(): + raise NotImplementedError('TODO') + + +def render_a_few_images(n=10, prefix='flickr_img', suffix='png'): + #TODO: document this and move it to a more common + # place where other datasets can use it + from PIL import Image + root = os.path.join(data_root(), 'flickr') + valid = filetensor.read(open(os.path.join(root, path_valid_10class))) + assert valid.shape == (1000,75*75+1) + for i in xrange(n): + pixelarray = valid[i,0:-1].reshape((75,75)).T + assert numpy.all(pixelarray >= 0) + assert numpy.all(pixelarray <= 1) + + pixel_uint8 = numpy.asarray( pixelarray * 255.0, dtype='uint8') + im = Image.frombuffer('L', pixel_uint8.shape, pixel_uint8.data, 'raw', 'L', 0, 1) + im.save(prefix + str(i) + '.' + suffix) + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/make_test_datasets.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/make_test_datasets.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,118 @@ +from dataset import ArrayDataSet +from shapeset.dset import Polygons +from linear_regression import linear_predictor +from kernel_regression import kernel_predictor +from numpy import * + +""" +General-purpose code to generate artificial datasets that can be used +to test different learning algorithms. +""" + + +def make_triangles_rectangles_online_dataset(image_size=(10,10)): + """ + Make a binary classification dataset to discriminate triangle images from rectangle images. + """ + def convert_dataset(dset): + # convert the n_vert==3 into target==0 and n_vert==4 into target==1 + def mapf(images,n_vertices): + n=len(n_vertices) + targets = ndarray((n,1),dtype='float64') + for i in xrange(n): + targets[i,0] = array([0. if n_vertices[i]==3 else 1.],dtype='float64') + return images.reshape(len(images),images[0].size).astype('float64'),targets + return dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]) + + p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) + trainset=convert_dataset(p) + return trainset + + +def make_triangles_rectangles_dataset(n_examples=600,image_size=(10,10), cache = True): + """ + Make a binary classification dataset to discriminate triangle images from rectangle images. + """ + def convert_dataset(dset): + # convert the n_vert==3 into target==0 and n_vert==4 into target==1 + def mapf(images,n_vertices): + n=len(n_vertices) + targets = ndarray((n,1),dtype='float64') + for i in xrange(n): + targets[i,0] = array([0. 
if n_vertices[i]==3 else 1.],dtype='float64') + return images.reshape(len(images),images[0].size).astype('float64'),targets + return dataset.CachedDataSet(dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]),cache) + + p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) + data = p.subset[0:n_examples] + trainset=convert_dataset(data.subset[0:n_examples]) + return trainset + + +def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10), cache = True): + """ + Make two binary classification datasets to discriminate triangle images from rectangle images. + The first one is the training set, the second is the test set. + """ + data = make_triangles_rectangles_dataset(n_examples=n_examples,image_size=image_size, cache = cache) + n_train = int(n_examples*train_frac) + trainset=convert_dataset(data.subset[0:n_train]) + testset=convert_dataset(data.subset[n_train:n_examples]) + return trainset,testset + + +def make_artificial_datasets_from_function(n_inputs=1, + n_targets=1, + n_examples=20, + train_frac=0.5, + noise_level=0.1, # add Gaussian noise, noise_level=sigma + params_shape=None, + f=None, # function computing E[Y|X] + otherargs=None, # extra args to f + b=None): # force theta[0] with this value + """ + Make regression data of the form + Y | X ~ Normal(f(X,theta,otherargs),noise_level^2) + If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval. + Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently). + The parameters theta is a matrix of shape params_shape that is sampled from Normal(0,1). + Optionally theta[0] is set to the argument 'b', if b is provided. + + Return a training set and a test set, by splitting the generated n_examples + according to the 'train_frac'tion. 
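# The generative process described in the docstring above, written out inline for the
# linear case as a hedged sketch: it does not call linear_predictor (whose exact calling
# convention is defined elsewhere); theta[0] is simply taken as the bias row here.
import numpy

n_examples, n_inputs, n_targets, noise_level = 20, 3, 1, 0.1
X = numpy.random.normal(size=(n_examples, n_inputs))
theta = numpy.random.normal(size=(n_inputs + 1, n_targets))   # theta[0] acts as the bias
Y = numpy.dot(X, theta[1:]) + theta[0] \
    + numpy.random.normal(scale=noise_level, size=(n_examples, n_targets))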
+ """ + n_train=int(train_frac*n_examples) + n_test=n_examples-n_train + if n_inputs==1: + delta1=2./n_train + delta2=2./n_test + inputs = vstack((array(zip(range(n_train)))*delta1-1, + 0.5*delta2+array(zip(range(n_test)))*delta2-1)) + else: + inputs = random.normal(size=(n_examples,n_inputs)) + if not f: + f = linear_predictor + if f==kernel_predictor and not otherargs[1]: + otherargs=(otherargs[0],inputs[0:n_train]) + if not params_shape: + if f==linear_predictor: + params_shape = (n_inputs+1,n_targets) + elif f==kernel_predictor: + params_shape = (otherargs[1].shape[0]+1,n_targets) + theta = random.normal(size=params_shape) if params_shape else None + if b: + theta[0]=b + outputs = f(inputs,theta,otherargs) + targets = outputs + random.normal(scale=noise_level,size=(n_examples,n_targets)) + # the | stacking creates a strange bug in LookupList constructor: + # trainset = ArrayDataSet(inputs[0:n_examples/2],{'input':slice(0,n_inputs)}) | \ + # ArrayDataSet(targets[0:n_examples/2],{'target':slice(0,n_targets)}) + # testset = ArrayDataSet(inputs[n_examples/2:],{'input':slice(0,n_inputs)}) | \ + # ArrayDataSet(targets[n_examples/2:],{'target':slice(0,n_targets)}) + data = hstack((inputs,targets)) + + trainset = ArrayDataSet(data[0:n_train], + {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) + testset = ArrayDataSet(data[n_train:], + {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) + return trainset,testset,theta diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/shapeset1.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/shapeset1.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,77 @@ +""" +Routines to load/access Shapeset1 +""" + +from __future__ import absolute_import + +import os +import numpy + +from ..io.amat import AMat +from .config import data_root +from .dataset import Dataset + +def _head(path, n): + dat = AMat(path=path, head=n) + + try: + assert dat.input.shape[0] == n + assert dat.target.shape[0] == n + except Exception , e: + raise Exception("failed to read %i lines from file %s" % (n, path)) + + return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0]) + + +def head_train(n=10000): + """Load the first Shapeset1 training examples. + + Returns two matrices: x, y. + x has N rows of 1024 columns. + Each row of x represents the 32x32 grey-scale pixels in raster order. + y is a vector of N integers between 0 and 2. + Each element y[i] is the label of the i'th row of x. + """ + path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.10000.train.shape.amat') + return _head(path, n) + +def head_valid(n=5000): + """Load the first Shapeset1 validation examples. + + Returns two matrices: x, y. + x has N rows of 1024 columns. + Each row of x represents the 32x32 grey-scale pixels in raster order. + y is a vector of N integers between 0 and 2. + Each element y[i] is the label of the i'th row of x. + """ + path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.5000.valid.shape.amat') + return _head(path, n) + +def head_test(n=5000): + """Load the first Shapeset1 testing examples. + + Returns two matrices: x, y. + x has N rows of 1024 columns. + Each row of x represents the 32x32 grey-scale pixels in raster order. + y is a vector of N integers between 0 and 2. + Each element y[i] is the label of the i'th row of x. 
+ """ + path = os.path.join(data_root(), 'shapeset1','shapeset1_1cspo_2_3.5000.test.shape.amat') + return _head(path, n) + +def train_valid_test(ntrain=10000, nvalid=5000, ntest=5000): + train_x, train_y = head_train(n=ntrain) + valid_x, valid_y = head_valid(n=nvalid) + test_x, test_y = head_test(n=ntest) + + rval = Dataset() + rval.train = Dataset.Obj(x = train_x, y = train_y) + rval.valid = Dataset.Obj(x = valid_x, y = valid_y) + rval.test = Dataset.Obj(x = test_x, y = test_y) + + rval.n_classes = 3 + rval.img_shape = (32, 32) + + return rval + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/smallNorb.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/smallNorb.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,108 @@ +import os +import numpy +from ..io.filetensor import read +from .config import data_root + +#Path = '/u/bergstrj/pub/data/smallnorb' +#Path = '/home/fringant2/lisa/louradoj/data/smallnorb' +#Path = '/home/louradou/data/norb' + +class Paths(object): + """File-related operations on smallNorb + """ + def __init__(self): + smallnorb = [data_root(), 'smallnorb'] + self.train_dat = os.path.join(*\ + smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat']) + self.test_dat = os.path.join(*\ + smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat']) + self.train_cat = os.path.join(*\ + smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat']) + self.test_cat = os.path.join(*\ + smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat']) + self.train_info = os.path.join(*\ + smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-info.mat']) + self.test_info = os.path.join(*\ + smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-info.mat']) + + def load_append_train_test(self, normalize_pixels=True, downsample_amt=1, dtype='uint8'): + """ Load the smallNorb data into numpy matrices. + + normalize_pixels True will divide the values by 255, which makes sense in conjunction + with dtype=float32 or dtype=float64. + + """ + def downsample(dataset): + return dataset[:, 0, ::downsample_amt, ::downsample_amt] + + samples = downsample(read(open(self.train_dat))) + samples = numpy.vstack((samples, downsample(read(open(self.test_dat))))) + samples = numpy.asarray(samples, dtype=dtype) + if normalize_pixels: + samples *= (1.0 / 255.0) + + labels = read(open(self.train_cat)) + labels = numpy.hstack((labels, read(open(self.test_cat)))) + + infos = read(open(self.train_info)) + infos = numpy.vstack((infos, read(open(self.test_info)))) + + return samples, labels, infos + +def smallnorb_iid(ntrain=29160, nvalid=9720, ntest=9720, dtype='float64', normalize_pixels=True): + """Variation of the smallNorb task in which we randomly shuffle all the object instances + together before dividing into train/valid/test. + + The default train/valid/test sizes correspond to 60/20/20 split of the entire dataset. 
+ + :returns: 5, (train_x, train_labels), (valid_x, valid_labels), (test_x, test_labels) + + """ + # cut from /u/louradoj/theano/hpu/expcode1.py + rng = numpy.random.RandomState(1) + samples, labels, infos = Paths().load_append_train_test(downsample_amt=3, dtype=dtype, normalize_pixels=normalize_pixels) + + nsamples = samples.shape[0] + if ntrain + nvalid + ntest > nsamples: + raise Exception("ntrain+nvalid+ntest exceeds number of samples (%i)" % nsamples, + (ntrain, nvalid, ntest)) + i0 = 0 + i1 = ntrain + i2 = ntrain + nvalid + i3 = ntrain + nvalid + ntest + + indices = rng.permutation(nsamples) + train_rows = indices[i0:i1] + valid_rows = indices[i1:i2] + test_rows = indices[i2:i3] + + n_labels = 5 + + def _pick_rows(rows): + a = numpy.array([samples[i].flatten() for i in rows]) + b = numpy.array([labels[i] for i in rows]) + return a, b + + return [_pick_rows(r) for r in (train_rows, valid_rows, test_rows)] + +def smallnorb_azSplit(): + # cut from /u/louradoj/theano/hpu/expcode1.py + # WARNING NOT NECESSARILY WORKING CODE + + samples, labels, infos = _load_append_train_test() + train_rows, valid_rows, test_rows = [], [], [] + train_rows_azimuth = [] + for instance in range(10): + az_min = 4*instance + az_max = 4*instance + 18 + train_rows_azimuth.append( [a % 36 for a in range(az_min,az_max,2)] ) + #print "train_rows_azimuth", train_rows_azimuth + for i, info in enumerate(infos): + if info[2] in train_rows_azimuth[info[0]]: + train_rows.append(i) + elif info[2] / 2 % 2 == 0: + test_rows.append(i) + else: + valid_rows.append(i) + + return [_pick_rows(samples, labels, r) for r in (train_rows, valid_rows, test_rows)] diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/testDataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/testDataset.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,43 @@ +""" +Various routines to load/access MNIST data. 
+""" +from __future__ import absolute_import + +import os +import numpy + +from ..io.amat import AMat +from .config import data_root +from .dataset import dataset_factory, Dataset + +VALSEQ, VALRAND = range(2) + +@dataset_factory('DEBUG') +def mnist_factory(variant='', ntrain=10, nvalid=10, ntest=10, \ + nclass=2, ndim=1, dshape=None, valtype=VALSEQ): + + temp = [] + [temp.append(5) for i in range(ndim)] + dshape = temp if dshape is None else dshape + + rval = Dataset() + rval.n_classes = nclass + rval.img_shape = dshape + + dsize = numpy.prod(dshape); + + print ntrain, nvalid, ntest, nclass, dshape, valtype + + ntot = ntrain + nvalid + ntest + xdata = numpy.arange(ntot*numpy.prod(dshape)).reshape((ntot,dsize)) \ + if valtype is VALSEQ else \ + numpy.random.random((ntot,dsize)); + ydata = numpy.round(numpy.random.random(ntot)); + + rval.train = Dataset.Obj(x=xdata[0:ntrain],y=ydata[0:ntrain]) + rval.valid = Dataset.Obj(x=xdata[ntrain:ntrain+nvalid],\ + y=ydata[ntrain:ntrain+nvalid]) + rval.test = Dataset.Obj(x=xdata[ntrain+nvalid:ntrain+nvalid+ntest], + y=ydata[ntrain+nvalid:ntrain+nvalid+ntest]) + + return rval diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/test_tzanetakis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/test_tzanetakis.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,22 @@ +import theano + +from pylearn.io import wavread +from pylearn.datasets import tzanetakis + +def test_tzanetakis(): + idx = theano.tensor.lscalar() + + path, label = tzanetakis.tzanetakis_example(idx) + print path, label + + f = theano.function([idx], [path, label]) + + for i in xrange(len(tzanetakis.tzanetakis_example)): + print i, f(i) + + wav,sr = wavread.wav_read_int16(path) + + f = theano.function([idx], wav) + for i in xrange(len(tzanetakis.tzanetakis_example)): + print i, f(i).shape + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/datasets/tzanetakis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/tzanetakis.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,100 @@ +""" +Load Tzanetakis' genre-classification dataset. 
+ +""" +from __future__ import absolute_import + +import os +import numpy + +from ..io.amat import AMat +from .config import data_root +from .dataset import dataset_factory, Dataset + +def centre_data(x, inplace=False): + rval = x if inplace else x.copy() + #zero-mean + rval -= numpy.mean(rval, axis=0) + #unit-variance + rval *= 1.0 / (1.0e-6 + numpy.std(rval, axis=0)) + return rval + +def mfcc16(segments_per_song = 1, include_covariance = True, random_split = 0, + ntrain = 700, nvalid = 100, ntest = 200, + normalize=True): + if segments_per_song != 1: + raise NotImplementedError() + + path = os.path.join(data_root(), 'tzanetakis','feat_mfcc16_540_1.stat.amat') + dat = AMat(path=path) + all_input = dat.input + assert all_input.shape == (1000 * segments_per_song, 152) + all_targ = numpy.tile(numpy.arange(10).reshape(10,1), 100 * segments_per_song)\ + .reshape(1000 * segments_per_song) + + if not include_covariance: + all_input = all_input[:,0:16] + + #shuffle the data according to the random split + assert all_input.shape[0] == all_targ.shape[0] + seed = random_split + 1 + numpy.random.RandomState(seed).shuffle(all_input) + numpy.random.RandomState(seed).shuffle(all_targ) + + #construct a dataset to return + rval = Dataset() + + def prepx(x): + return centre_data(x, inplace=True) if normalize else x + + rval.train = Dataset.Obj(x=prepx(all_input[0:ntrain]), + y=all_targ[0:ntrain]) + rval.valid = Dataset.Obj(x=prepx(all_input[ntrain:ntrain+nvalid]), + y=all_targ[ntrain:ntrain+nvalid]) + rval.test = Dataset.Obj(x=prepx(all_input[ntrain+nvalid:ntrain+nvalid+ntest]), + y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest]) + + rval.n_classes = 10 + + return rval + +import theano + +class TzanetakisExample(theano.Op): + @staticmethod + def read_tzanetakis_file(): + tracklist = open(data_root() + '/tzanetakis/tracklist.txt') + path = [] + label = [] + for line in tracklist: + toks = line.split() + try: + path.append(toks[0]) + label.append(toks[1]) + except: + print 'BAD LINE IN TZANETAKIS TRACKLIST' + print line, toks + raise + assert len(path) == 1000 + return path, label + + def __init__(self): + self.path, self.label = self.read_tzanetakis_file() + + def __len__(self): + return len(self.path) + + def make_node(self, idx): + return theano.Apply(self, + [theano.tensor.as_tensor_variable(idx)], + [theano.generic(), theano.generic()]) + + def perform(self, node, (idx,), outputs): + assert len(outputs) == 2 + outputs[0][0] = self.path[idx] + outputs[1][0] = self.label[idx] + + def grad(self, inputs, g_output): + return [None for i in inputs] +tzanetakis_example = TzanetakisExample() + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/examples/linear_classifier.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/examples/linear_classifier.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,224 @@ +#! /usr/bin/env python +""" +T. Bertin-Mahieux (2008) University of Montreal +bertinmt@iro.umontreal.ca + +linear_classifier.py +Simple script that creates a linear_classifier, and +learns the parameters using backpropagation. + +This is to illustrate how to use theano/pylearn. +Anyone who knows how to make this script simpler/clearer is welcome to +make the modifications. 
+""" + + +import os +import sys +import time +import copy +import pickle +import numpy +import numpy as N +import numpy.random as NR +from pylearn import cost +import theano +from theano import tensor as T + + +def cost_function(*args,**kwargs) : + """ default cost function, quadratic """ + return cost.quadratic(*args,**kwargs) + + +class modelgraph() : + """ class that contains the graph of the model """ + lr = T.scalar() # learning rate + inputs = T.matrix() # inputs (one example per line) + true_outputs = T.matrix() # outputs (one example per line) + W = T.matrix() # weights input * W + b= output + b = T.vector() # bias + outputs = T.dot(inputs,W) + b # output, one per line + costs = cost_function(true_outputs,outputs) # costs + g_W = T.grad(costs,W) # gradient of W + g_b = T.grad(costs,b) # gradient of b + new_W = T.sub_inplace(W, lr * g_W) # update inplace of W + new_b = T.sub_inplace(b, lr * g_b) # update inplace of b + + +class model() : + """ + The model! + Contains needed matrices, needed functions, and a link to the model graph. + """ + + def __init__(self,input_size,output_size) : + """ init matrix and bias, creates the graph, create a dict of compiled functions """ + # graph + self.graph = modelgraph() + # weights and bias, saved in self.params + seed = 666 + r = NR.RandomState(seed) + W = r.uniform(size = [input_size, output_size], low = -1/N.sqrt(input_size), high = 1/N.sqrt(input_size)) + b = numpy.zeros((output_size, )) + self.params = [W,b] + # dictionary of compiled functions + self.func_dict = dict() + # keep some init_infos (may not be necessary) + self.init_params = [input_size,output_size] + + + def update(self,lr,true_inputs,true_outputs) : + """ does an update of the model, one gradient descent """ + # do we already have the proper theano function? + if self.func_dict.has_key('update_func') : + self.func_dict['update_func'](lr,true_inputs,true_outputs,self.params[0],self.params[1]) + return + else : + # create the theano function, tell him what are the inputs and outputs) + func = theano.function([self.graph.lr,self.graph.inputs,self.graph.true_outputs, + self.graph.W, self.graph.b], + [self.graph.new_W,self.graph.new_b]) + # add function to dictionary, so we don't compile it again + self.func_dict['update_func'] = func + # use this function + func(lr,true_inputs,true_outputs,self.params[0],self.params[1]) + return + + def costs(self,true_inputs,true_outputs) : + """ get the costs for given examples, don't update """ + # do we already have the proper theano function? + if self.func_dict.has_key('costs_func') : + return self.func_dict['costs_func'](true_inputs,true_outputs,self.params[0],self.params[1]) + else : + # create the theano function, tell him what are the inputs and outputs) + func = theano.function([self.graph.inputs,self.graph.true_outputs,self.graph.W,self.graph.b], + [self.graph.costs]) + # add function to dictionary, se we don't compile it again + self.func_dict['costs_func'] = func + # use this function + return func(true_inputs,true_outputs,self.params[0],self.params[1]) + + def outputs(self,true_inputs) : + """ get the output for a set of examples (could be called 'predict') """ + # do we already have the proper theano function? 
+ if self.func_dict.has_key('outputs_func') : + return self.func_dict['outputs_func'](true_inputs,self.params[0],self.params[1]) + else : + # create the theano function, tell him what are the inputs and outputs) + func = theano.function([self.graph.inputs, self.graph.W, self.graph.b], + [self.graph.outputs]) + # add function to dictionary, se we don't compile it again + self.func_dict['outputs_func'] = func + # use this function + return func(true_inputs,self.params[0],self.params[1]) + + def __getitem__(self,inputs) : + """ for simplicity, we can use the model this way: predictions = model[inputs] """ + return self.outputs(inputs) + + def __getstate__(self) : + """ + To save/copy the model, used by pickle.dump() and by copy.deepcopy(). + @return a dictionnary with the params (matrix + bias) + """ + d = dict() + d['params'] = self.params + d['init_params'] = self.init_params + return d + + def __setstate__(self,d) : + """ + Get the dictionary created by __getstate__(), use it to recreate the model. + """ + self.params = d['params'] + self.init_params = d['init_params'] + self.graph = modelgraph() # we did not save the model graph + + def __str__(self) : + """ returns a string representing the model """ + res = "Linear regressor, input size =",str(self.init_params[0]) + res += ", output size =", str(self.init_params[1]) + return res + + def __equal__(self,other) : + """ + Compares the model based on the params. + @return True if the params are the same, False otherwise + """ + # class + if not isinstance(other,model) : + return False + # input size + if self.params[0].shape[0] != other.params[0].shape[0] : + return False + # output size + if self.params[0].shape[1] != other.params[0].shape[1] : + return False + # actual values + if not (self.params[0] == other.params[0]).all(): + return False + if not (self.params[1] == other.params[1]).all(): + return False + # all good + return True + + +def die_with_usage() : + """ help menu """ + print 'simple script to illustrate how to use theano/pylearn' + print 'to launch:' + print ' python linear_classifier.py -launch' + sys.exit(0) + + + +#************************************************************ +# main + +if __name__ == '__main__' : + + if len(sys.argv) < 2 : + die_with_usage() + + # print create data + inputs = numpy.array([[.1,.2], + [.2,.8], + [.9,.3], + [.6,.5]]) + outputs = numpy.array([[0], + [0], + [1], + [1]]) + assert inputs.shape[0] == outputs.shape[0] + + # create model + m = model(2,1) + + # predict + print 'prediction before training:' + print m[inputs] + + # update it for 100 iterations + for k in range(50) : + m.update(.1,inputs,outputs) + + # predict + print 'prediction after training:' + print m[inputs] + + # show points + import pylab as P + colors = outputs.flatten().tolist() + x = inputs[:,0] + y = inputs[:,1] + P.plot(x[numpy.where(outputs==0)[0]],y[numpy.where(outputs==0)[0]],'r+') + P.plot(x[numpy.where(outputs==1)[0]],y[numpy.where(outputs==1)[0]],'b+') + # decision line + p1 = (.5 - m.params[1] * 1.) / m.params[0][1,0] # abs = 0 + p2 = (.5 - m.params[1] * 1.) 
/ m.params[0][0,0] # ord = 0 + P.plot((0,p2[0],2*p2[0]),(p1[0],0,-p1[0]),'g-') + # show + P.axis([-1,2,-1,2]) + P.show() + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/examples/theano_update.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/examples/theano_update.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,56 @@ +import theano +from theano import tensor + +import numpy + +# Two scalar symbolic variables +a = tensor.scalar() +b = tensor.scalar() + +# Definition of output symbolic variable +c = a * b +# Definition of the function computing it +fprop = theano.function([a,b], [c]) + +# Initialize numerical variables +a_val = numpy.array(12.) +b_val = numpy.array(2.) +print 'a_val =', a_val +print 'b_val =', b_val + +# Numerical value of output is returned by the call to "fprop" +c_val = fprop(a_val, b_val) +print 'c_val =', c_val + + +# Definition of simple update (increment by one) +new_b = b + 1 +update = theano.function([b], [new_b]) + +# New numerical value of b is returned by the call to "update" +b_val = update(b_val) +print 'new b_val =', b_val +# We can use the new value in "fprop" +c_val = fprop(a_val, b_val) +print 'c_val =', c_val + + +# Definition of in-place update (increment by one) +re_new_b = tensor.add_inplace(b, 1.) +re_update = theano.function([b], [re_new_b]) + +# "re_update" can be used the same way as "update" +b_val = re_update(b_val) +print 'new b_val =', b_val +# We can use the new value in "fprop" +c_val = fprop(a_val, b_val) +print 'c_val =', c_val + +# It is not necessary to keep the return value when the update is done in place +re_update(b_val) +print 'new b_val =', b_val +c_val = fprop(a_val, b_val) +print 'c_val =', c_val + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/exceptions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/exceptions.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,7 @@ +""" +Common exceptions. +@todo: This file should be part of a common/ python package. +""" + +class AbstractFunction (Exception): """Derived class must override this function""" +class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/external/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/external/wrap_libsvm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/external/wrap_libsvm.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,100 @@ +"""Run an experiment using libsvm. +""" +import numpy +from ..datasets import make_dataset + +# libsvm currently has no python installation instructions/convention. +# +# This module uses a specific convention for libsvm's installation. +# I base this on installing libsvm-2.88. +# To install libsvm's python module, do three things: +# 1. Build libsvm (run make in both the root dir and the python subdir). +# 2. touch a '__init__.py' file in the python subdir +# 3. add a symbolic link to a PYTHONPATH location that looks like this: +# libsvm -> /libsvm-2.88/python/ +# +# That is the sort of thing that this module expects from 'import libsvm' + +import libsvm + +def score_01(x, y, model): + assert len(x) == len(y) + size = len(x) + errors = 0 + for i in range(size): + prediction = model.predict(x[i]) + #probability = model.predict_probability + if (y[i] != prediction): + errors = errors + 1 + return float(errors)/size + +#this is the dbdict experiment interface... 
if you happen to use dbdict +class State(object): + #TODO: parametrize to get all the kernel types, not hardcode for RBF + dataset = 'MNIST_1k' + C = 10.0 + kernel = 'RBF' + # rel_gamma is related to the procedure Jerome used. He mentioned why in + # quadratic_neurons/neuropaper/draft3.pdf. + rel_gamma = 1.0 + + def __init__(self, **kwargs): + for k, v in kwargs: + setattr(self, k, type(getattr(self, k))(v)) + + +def dbdict_run_svm_experiment(state, channel=lambda *args, **kwargs:None): + """Parameters are described in state, and returned in state. + + :param state: object instance to store parameters and return values + :param channel: not used + + :returns: None + + This is the kind of function that dbdict-run can use. + + """ + dataset = make_dataset(**state.dataset) + + + #libsvm needs stuff in int32 on a 32bit machine + #TODO: test this on a 64bit machine + # -> Both int32 and int64 (default) seem to be OK + train_y = numpy.asarray(dataset.train.y, dtype='int32') + valid_y = numpy.asarray(dataset.valid.y, dtype='int32') + test_y = numpy.asarray(dataset.test.y, dtype='int32') + problem = libsvm.svm_problem(train_y, dataset.train.x); + + gamma0 = 0.5 / numpy.sum(numpy.var(dataset.train.x, axis=0)) + + param = libsvm.svm_parameter(C=state['C'], + kernel_type=getattr(libsvm, state['kernel']), + gamma=state['rel_gamma'] * gamma0) + + model = libsvm.svm_model(problem, param) #this is the expensive part + + state['train_01'] = score_01(dataset.train.x, train_y, model) + state['valid_01'] = score_01(dataset.valid.x, valid_y, model) + state['test_01'] = score_01(dataset.test.x, test_y, model) + + state['n_train'] = len(train_y) + state['n_valid'] = len(valid_y) + state['n_test'] = len(test_y) + +def run_svm_experiment(**kwargs): + """Python-friendly interface to dbdict_run_svm_experiment + + Parameters are used to construct a `State` instance, which is returned after running + `dbdict_run_svm_experiment` on it. + + .. code-block:: python + results = run_svm_experiment(dataset='MNIST_1k', C=100.0, rel_gamma=0.01) + print results.n_train + # 1000 + print results.valid_01, results.test_01 + # 0.14, 0.10 #.. or something... + + """ + state_run_svm_experiment(state=kwargs) + return kwargs + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/amat.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/amat.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,138 @@ +"""load PLearn AMat files + + +An AMat file is an ascii format for dense matrices. + +The format is not precisely defined, so I'll describe here a single recipe for making a valid +file. + +.. code-block:: text + + #size: + #sizes: + number number number .... + number number number .... + + +Tabs and spaces are both valid delimiters. Newlines separate consecutive rows. + +""" + +import sys, numpy, array + +class AMat: + """DataSource to access a plearn amat file as a periodic unrandomized stream. + + Attributes: + + input -- all columns of input + target -- all columns of target + weight -- all columns of weight + extra -- all columns of extra + + all -- the entire data contents of the amat file + n_examples -- the number of training examples in the file + + AMat stands for Ascii Matri[x,ces] + + """ + + marker_size = '#size:' + marker_sizes = '#sizes:' + marker_col_names = '#:' + + def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout): + + """Load the amat at into memory. 
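# A tiny end-to-end illustration of the header conventions described above; the file
# contents and temporary path are made up for the example.
import os, tempfile
from pylearn.io.amat import AMat

text = ('#size: 3 4\n'
        '#sizes: 3 1 0 0\n'
        '0 0 1 0\n'
        '0 1 0 1\n'
        '1 0 0 2\n')
path = os.path.join(tempfile.mkdtemp(), 'tiny.amat')
open(path, 'w').write(text)
m = AMat(path)
print m.all.shape      # (3, 4)
print m.input.shape    # (3, 3): the first 3 columns, as declared by '#sizes'
print m.target.shape   # (3, 1)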
+ + path - str: location of amat file + head - int: stop reading after this many data rows + update_interval - int: print '.' to ofile every lines + ofile - file: print status, msgs, etc. to this file + + """ + self.all = None + self.input = None + self.target = None + self.weight = None + self.extra = None + + self.header = False + self.header_size = None + self.header_rows = None + self.header_cols = None + self.header_sizes = None + self.header_col_names = [] + + data_started = False + data = array.array('d') + + f = open(path) + n_data_lines = 0 + len_float_line = None + + for i,line in enumerate(f): + if n_data_lines == head: + #we've read enough data, + # break even if there's more in the file + break + if len(line) == 0 or line == '\n': + continue + if line[0] == '#': + if not data_started: + #the condition means that the file has a header, and we're on + # some header line + self.header = True + if line.startswith(AMat.marker_size): + info = line[len(AMat.marker_size):] + self.header_size = [int(s) for s in info.split()] + self.header_rows, self.header_cols = self.header_size + if line.startswith(AMat.marker_col_names): + info = line[len(AMat.marker_col_names):] + self.header_col_names = info.split() + elif line.startswith(AMat.marker_sizes): + info = line[len(AMat.marker_sizes):] + self.header_sizes = [int(s) for s in info.split()] + else: + #the first non-commented line tells us that the header is done + data_started = True + float_line = [float(s) for s in line.split()] + if len_float_line is None: + len_float_line = len(float_line) + if (self.header_cols is not None) \ + and self.header_cols != len_float_line: + print >> sys.stderr, \ + 'WARNING: header declared %i cols but first line has %i, using %i',\ + self.header_cols, len_float_line, len_float_line + else: + if len_float_line != len(float_line): + raise IOError('wrong line length', i, line) + data.extend(float_line) + n_data_lines += 1 + + if update_interval > 0 and (ofile is not None) \ + and n_data_lines % update_interval == 0: + ofile.write('.') + ofile.flush() + + if update_interval > 0: + ofile.write('\n') + f.close() + + # convert from array.array to numpy.ndarray + nshape = (len(data) / len_float_line, len_float_line) + self.all = numpy.frombuffer(data).reshape(nshape) + self.n_examples = self.all.shape[0] + + # assign + if self.header_sizes is not None: + if len(self.header_sizes) > 4: + print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path + leftmost = 0 + #here we make use of the fact that if header_sizes has len < 4 + # the loop will exit before 4 iterations + attrlist = ['input', 'target', 'weight', 'extra'] + for attr, ncols in zip(attrlist, self.header_sizes): + setattr(self, attr, self.all[:, leftmost:leftmost+ncols]) + leftmost += ncols + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/filetensor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/filetensor.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,141 @@ +""" +Read and write the matrix file format described at +U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} + +The format is for dense tensors: + + - magic number indicating type and endianness - 4bytes + - rank of tensor - int32 + - dimensions - int32, int32, int32, ... + - + +The number of dimensions and rank is slightly tricky: + - for scalar: rank=0, dimensions = [1, 1, 1] + - for vector: rank=1, dimensions = [?, 1, 1] + - for matrix: rank=2, dimensions = [?, ?, 1] + +For rank >= 3, the number of dimensions matches the rank exactly. 
+ + +@todo: add complex type support + +""" +import sys +import numpy + +def _prod(lst): + p = 1 + for l in lst: + p *= l + return p + +_magic_dtype = { + 0x1E3D4C51 : ('float32', 4), + #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? + 0x1E3D4C53 : ('float64', 8), + 0x1E3D4C54 : ('int32', 4), + 0x1E3D4C55 : ('uint8', 1), + 0x1E3D4C56 : ('int16', 2), + } +_dtype_magic = { + 'float32': 0x1E3D4C51, + #'packed matrix': 0x1E3D4C52, + 'float64': 0x1E3D4C53, + 'int32': 0x1E3D4C54, + 'uint8': 0x1E3D4C55, + 'int16': 0x1E3D4C56 + } + +# +# TODO: implement item selection: +# e.g. load('some mat', subtensor=(:6, 2:5)) +# +# This function should be memory efficient by: +# - allocating an output matrix at the beginning +# - seeking through the file, reading subtensors from multiple places +def read(f, subtensor=None, debug=False): + """Load all or part of file 'f' into a numpy ndarray + + @param f: file from which to read + @type f: file-like object + + If subtensor is not None, it should be like the argument to + numpy.ndarray.__getitem__. The following two expressions should return + equivalent ndarray objects, but the one on the left may be faster and more + memory efficient if the underlying file f is big. + + read(f, subtensor) <===> read(f)[*subtensor] + + Support for subtensors is currently spotty, so check the code to see if your + particular type of subtensor is supported. + + """ + def _read_int32(f): + s = f.read(4) + s_array = numpy.fromstring(s, dtype='int32') + return s_array.item() + + #what is the data type of this matrix? + #magic_s = f.read(4) + #magic = numpy.fromstring(magic_s, dtype='int32') + magic = _read_int32(f) + magic_t, elsize = _magic_dtype[magic] + if debug: + print 'header magic', magic, magic_t, elsize + if magic_t == 'packed matrix': + raise NotImplementedError('packed matrix not supported') + + #what is the rank of the tensor? + ndim = _read_int32(f) + if debug: print 'header ndim', ndim + + #what are the dimensions of the tensor? + dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] + dim_size = _prod(dim) + if debug: print 'header dim', dim, dim_size + + rval = None + if subtensor is None: + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + elif isinstance(subtensor, slice): + if subtensor.step not in (None, 1): + raise NotImplementedError('slice with step', subtensor.step) + if subtensor.start not in (None, 0): + bytes_per_row = _prod(dim[1:]) * elsize + raise NotImplementedError('slice with start', subtensor.start) + dim[0] = min(dim[0], subtensor.stop) + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + else: + raise NotImplementedError('subtensor access not written yet:', subtensor) + + return rval + +def write(f, mat): + """Write a numpy.ndarray to file. 
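# A write/read round trip through the format described above; a sketch using a temporary
# file, with arbitrary array contents.
import os, tempfile
import numpy
from pylearn.io import filetensor

mat = numpy.arange(12, dtype='float64').reshape(3, 4)
path = os.path.join(tempfile.mkdtemp(), 'tiny.ft')
f = open(path, 'wb')
filetensor.write(f, mat)     # magic for float64, rank 2, dims (3, 4, 1), then raw data
f.close()
mat2 = filetensor.read(open(path, 'rb'))
assert mat2.shape == (3, 4) and (mat2 == mat).all()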
+ + @param f: file into which to write + @type f: file-like object + + @param mat: array to write to file + @type mat: numpy ndarray or compatible + + """ + def _write_int32(f, i): + i_array = numpy.asarray(i, dtype='int32') + if 0: print 'writing int32', i, i_array + i_array.tofile(f) + + try: + _write_int32(f, _dtype_magic[str(mat.dtype)]) + except KeyError: + raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) + + _write_int32(f, len(mat.shape)) + shape = mat.shape + if len(shape) < 3: + shape = list(shape) + [1] * (3 - len(shape)) + if 0: print 'writing shape =', shape + for sh in shape: + _write_int32(f, sh) + mat.tofile(f) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/image_tiling.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/image_tiling.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,84 @@ +""" +Illustrate filters (or data) in a grid of small image-shaped tiles. +""" + +import numpy +from PIL import Image + +def scale_to_unit_interval(ndar): + ndar = ndar.copy() + ndar -= ndar.min() + ndar *= 1.0 / ndar.max() + return ndar + +def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0,0), + scale_rows_to_unit_interval=True, + output_pixel_vals=True + ): + """ + Transform an array with one flattened image per row, into an array in which images are + reshaped and layed out like tiles on a floor. + + This function is useful for visualizing datasets whose rows are images, and also columns of + matrices for transforming those rows (such as the first layer of a neural net). + + :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can be 2-D ndarrays or None + :param X: a 2-D array in which every row is a flattened image. + :type img_shape: tuple; (height, width) + :param img_shape: the original shape of each image + :type tile_shape: tuple; (rows, cols) + :param tile_shape: the number of images to tile (rows, cols) + + :returns: array suitable for viewing as an image. (See:`PIL.Image.fromarray`.) + :rtype: a 2-d array with same dtype as X. + + """ + assert len(img_shape) == 2 + assert len(tile_shape) == 2 + assert len(tile_spacing) == 2 + + out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp + in zip(img_shape, tile_shape, tile_spacing)] + + if isinstance(X, tuple): + assert len(X) == 4 + if output_pixel_vals: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') + else: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) + + #colors default to 0, alpha defaults to 1 (opaque) + if output_pixel_vals: + channel_defaults = [0,0,0,255] + else: + channel_defaults = [0.,0.,0.,1.] 
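# Example use of tile_raster_images for a plain 2-D array of flattened images; a sketch
# assuming PIL is installed, with random placeholder "filters".
import numpy
from PIL import Image
from pylearn.io.image_tiling import tile_raster_images

X = numpy.random.rand(25, 64)                       # 25 flattened 8x8 images
tiles = tile_raster_images(X, img_shape=(8, 8),
                           tile_shape=(5, 5), tile_spacing=(1, 1))
Image.fromarray(tiles).save('tiles.png')            # uint8 output, viewable directly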
+ + for i in xrange(4): + if X[i] is None: + out_array[:,:,i] = numpy.zeros(out_shape, + dtype='uint8' if output_pixel_vals else out_array.dtype + )+channel_defaults[i] + else: + out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) + return out_array + + else: + H, W = img_shape + Hs, Ws = tile_spacing + + out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) + for tile_row in xrange(tile_shape[0]): + for tile_col in xrange(tile_shape[1]): + if tile_row * tile_shape[1] + tile_col < X.shape[0]: + if scale_rows_to_unit_interval: + this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) + else: + this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) + out_array[ + tile_row * (H+Hs):tile_row*(H+Hs)+H, + tile_col * (W+Ws):tile_col*(W+Ws)+W + ] \ + = this_img * (255 if output_pixel_vals else 1) + return out_array + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/pmat.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/pmat.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,526 @@ +## Automatically adapted for numpy.numarray Jun 13, 2007 by python_numarray_to_numpy (-xsm) + +# PMat.py +# Copyright (C) 2005 Pascal Vincent +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. The name of the authors may not be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN +# NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# This file is part of the PLearn library. 
For more information on the PLearn +# library, go to the PLearn Web site at www.plearn.org + + +# Author: Pascal Vincent + +#import numarray, sys, os, os.path +import numpy.numarray, sys, os, os.path +import fpconst + +def array_columns( a, cols ): + indices = None + if isinstance( cols, int ): + indices = [ cols ] + elif isinstance( cols, slice ): + #print cols + indices = range( *cols.indices(cols.stop) ) + else: + indices = list( cols ) + + return numpy.numarray.take(a, indices, axis=1) + +def load_pmat_as_array(fname): + s = file(fname,'rb').read() + formatstr = s[0:64] + datastr = s[64:] + structuretype, l, w, data_type, endianness = formatstr.split() + + if data_type=='DOUBLE': + elemtype = 'd' + elif data_type=='FLOAT': + elemtype = 'f' + else: + raise ValueError('Invalid data type in file header: '+data_type) + + if endianness=='LITTLE_ENDIAN': + byteorder = 'little' + elif endianness=='BIG_ENDIAN': + byteorder = 'big' + else: + raise ValueError('Invalid endianness in file header: '+endianness) + + l = int(l) + w = int(w) + X = numpy.numarray.fromstring(datastr,elemtype, shape=(l,w) ) + if byteorder!=sys.byteorder: + X.byteswap(True) + return X + +def load_pmat_as_array_dataset(fname): + import dataset,lookup_list + + #load the pmat as array + a=load_pmat_as_array(fname) + + #load the fieldnames + fieldnames = [] + fieldnamefile = os.path.join(fname+'.metadata','fieldnames') + if os.path.isfile(fieldnamefile): + f = open(fieldnamefile) + for row in f: + row = row.split() + if len(row)>0: + fieldnames.append(row[0]) + f.close() + else: + self.fieldnames = [ "field_"+str(i) for i in range(a.shape[1]) ] + + return dataset.ArrayDataSet(a,lookup_list.LookupList(fieldnames,[x for x in range(a.shape[1])])) + +def load_amat_as_array_dataset(fname): + import dataset,lookup_list + + #load the amat as array + (a,fieldnames)=readAMat(fname) + + #load the fieldnames + if len(fieldnames)==0: + self.fieldnames = [ "field_"+str(i) for i in range(a.shape[1]) ] + + return dataset.ArrayDataSet(a,lookup_list.LookupList(fieldnames,[x for x in range(a.shape[1])])) + +def save_array_dataset_as_pmat(fname,ds): + ar=ds.data + save_array_as_pmat(fname,ar,ds.fieldNames()) + +def save_array_as_pmat( fname, ar, fieldnames=[] ): + s = file(fname,'wb') + + length, width = ar.shape + if fieldnames: + assert len(fieldnames) == width + metadatadir = fname+'.metadata' + if not os.path.isdir(metadatadir): + os.mkdir(metadatadir) + fieldnamefile = os.path.join(metadatadir,'fieldnames') + f = open(fieldnamefile,'wb') + for name in fieldnames: + f.write(name+'\t0\n') + f.close() + + header = 'MATRIX ' + str(length) + ' ' + str(width) + ' ' + if ar.dtype.char=='d': + header += 'DOUBLE ' + elemsize = 8 + + elif ar.dtype.char=='f': + header += 'FLOAT ' + elemsize = 4 + + else: + raise TypeError('Unsupported typecode: %s' % ar.dtype.char) + + rowsize = elemsize*width + + if sys.byteorder=='little': + header += 'LITTLE_ENDIAN ' + elif sys.byteorder=='big': + header += 'BIG_ENDIAN ' + else: + raise TypeError('Unsupported sys.byteorder: '+repr(sys.byteorder)) + + header += ' '*(63-len(header))+'\n' + s.write( header ) + s.write( ar.tostring() ) + s.close() + + +####### Iterators ########################################################### + +class VMatIt: + def __init__(self, vmat): + self.vmat = vmat + self.cur_row = 0 + + def __iter__(self): + return self + + def next(self): + if self.cur_row==self.vmat.length: + raise StopIteration + row = self.vmat.getRow(self.cur_row) + self.cur_row += 1 + return row + +class ColumnIt: + 
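+    # Iterates over the values of a single column ('col') of a VMat, one row
+    # at a time, in the same way VMatIt above iterates over whole rows.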
def __init__(self, vmat, col): + self.vmat = vmat + self.col = col + self.cur_row = 0 + + def __iter__(self): + return self + + def next(self): + if self.cur_row==self.vmat.length: + raise StopIteration + val = self.vmat[self.cur_row, self.col] + self.cur_row += 1 + return val + +####### VMat classes ######################################################## + +class VMat: + def __iter__(self): + return VMatIt(self) + + def __getitem__( self, key ): + if isinstance( key, slice ): + start, stop, step = key.start, key.stop, key.step + if step!=None: + raise IndexError('Extended slice with step not currently supported') + + if start is None: + start = 0 + + l = self.length + if stop is None or stop > l: + stop = l + + return self.getRows(start,stop-start) + + elif isinstance( key, tuple ): + # Basically returns a SubVMatrix + assert len(key) == 2 + rows = self.__getitem__( key[0] ) + + shape = rows.shape + if len(shape) == 1: + return rows[ key[1] ] + + cols = key[1] + if isinstance(cols, slice): + start, stop, step = cols.start, cols.stop, cols.step + if start is None: + start = 0 + + if stop is None: + stop = self.width + elif stop < 0: + stop = self.width+stop + + cols = slice(start, stop, step) + + return array_columns(rows, cols) + + elif isinstance( key, str ): + # The key is considered to be a fieldname and a column is + # returned. + try: + return array_columns( self.getRows(0,self.length), + self.fieldnames.index(key) ) + except ValueError: + print >>sys.stderr, "Key is '%s' while fieldnames are:" % key + print >>sys.stderr, self.fieldnames + raise + + else: + if key<0: key+=self.length + return self.getRow(key) + + def getFieldIndex(self, fieldname): + try: + return self.fieldnames.index(fieldname) + except ValueError: + raise ValueError( "VMat has no field named %s. 
Field names: %s" + %(fieldname, ','.join(self.fieldnames)) ) + +class PMat( VMat ): + + def __init__(self, fname, openmode='r', fieldnames=[], elemtype='d', + inputsize=-1, targetsize=-1, weightsize=-1, array = None): + self.fname = fname + self.inputsize = inputsize + self.targetsize = targetsize + self.weightsize = weightsize + if openmode=='r': + self.f = open(fname,'rb') + self.read_and_parse_header() + self.load_fieldnames() + + elif openmode=='w': + self.f = open(fname,'w+b') + self.fieldnames = fieldnames + self.save_fieldnames() + self.length = 0 + self.width = len(fieldnames) + self.elemtype = elemtype + self.swap_bytes = False + self.write_header() + + elif openmode=='a': + self.f = open(fname,'r+b') + self.read_and_parse_header() + self.load_fieldnames() + + else: + raise ValueError("Currently only supported openmodes are 'r', 'w' and 'a': "+repr(openmode)+" is not supported") + + if array is not None: + shape = array.shape + if len(shape) == 1: + row_format = lambda r: [ r ] + elif len(shape) == 2: + row_format = lambda r: r + + for row in array: + self.appendRow( row_format(row) ) + + def __del__(self): + self.close() + + def write_header(self): + header = 'MATRIX ' + str(self.length) + ' ' + str(self.width) + ' ' + + if self.elemtype=='d': + header += 'DOUBLE ' + self.elemsize = 8 + elif self.elemtype=='f': + header += 'FLOAT ' + self.elemsize = 4 + else: + raise TypeError('Unsupported elemtype: '+repr(elemtype)) + self.rowsize = self.elemsize*self.width + + if sys.byteorder=='little': + header += 'LITTLE_ENDIAN ' + elif sys.byteorder=='big': + header += 'BIG_ENDIAN ' + else: + raise TypeError('Unsupported sys.byteorder: '+repr(sys.byteorder)) + + header += ' '*(63-len(header))+'\n' + + self.f.seek(0) + self.f.write(header) + + def read_and_parse_header(self): + header = self.f.read(64) + mat_type, l, w, data_type, endianness = header.split() + if mat_type!='MATRIX': + raise ValueError('Invalid file header (should start with MATRIX)') + self.length = int(l) + self.width = int(w) + if endianness=='LITTLE_ENDIAN': + byteorder = 'little' + elif endianness=='BIG_ENDIAN': + byteorder = 'big' + else: + raise ValueError('Invalid endianness in file header: '+endianness) + self.swap_bytes = (byteorder!=sys.byteorder) + + if data_type=='DOUBLE': + self.elemtype = 'd' + self.elemsize = 8 + elif data_type=='FLOAT': + self.elemtype = 'f' + self.elemsize = 4 + else: + raise ValueError('Invalid data type in file header: '+data_type) + self.rowsize = self.elemsize*self.width + + def load_fieldnames(self): + self.fieldnames = [] + fieldnamefile = os.path.join(self.fname+'.metadata','fieldnames') + if os.path.isfile(fieldnamefile): + f = open(fieldnamefile) + for row in f: + row = row.split() + if len(row)>0: + self.fieldnames.append(row[0]) + f.close() + else: + self.fieldnames = [ "field_"+str(i) for i in range(self.width) ] + + def save_fieldnames(self): + metadatadir = self.fname+'.metadata' + if not os.path.isdir(metadatadir): + os.mkdir(metadatadir) + fieldnamefile = os.path.join(metadatadir,'fieldnames') + f = open(fieldnamefile,'wb') + for name in self.fieldnames: + f.write(name+'\t0\n') + f.close() + + def getRow(self,i): + if i<0 or i>=self.length: + raise IndexError('PMat index out of range') + self.f.seek(64+i*self.rowsize) + data = self.f.read(self.rowsize) + ar = numpy.numarray.fromstring(data, self.elemtype, (self.width,)) + if self.swap_bytes: + ar.byteswap(True) + return ar + + def getRows(self,i,l): + if i<0 or l<0 or i+l>self.length: + raise IndexError('PMat index out of 
range') + self.f.seek(64+i*self.rowsize) + data = self.f.read(l*self.rowsize) + ar = numpy.numarray.fromstring(data, self.elemtype, (l,self.width)) + if self.swap_bytes: + ar.byteswap(True) + return ar + + def checkzerorow(self,i): + if i<0 or i>self.length: + raise IndexError('PMat index out of range') + self.f.seek(64+i*self.rowsize) + data = self.f.read(self.rowsize) + ar = numpy.numarray.fromstring(data, self.elemtype, (len(data)/self.elemsize,)) + if self.swap_bytes: + ar.byteswap(True) + for elem in ar: + if elem!=0: + return False + return True + + def putRow(self,i,row): + if i<0 or i>=self.length: + raise IndexError('PMat index out of range') + if len(row)!=self.width: + raise TypeError('length of row ('+str(len(row))+ ') differs from matrix width ('+str(self.width)+')') + if i<0 or i>=self.length: + raise IndexError + if self.swap_bytes: # must make a copy and swap bytes + ar = numpy.numarray.numarray(row,type=self.elemtype) + ar.byteswap(True) + else: # asarray makes a copy if not already a numarray of the right type + ar = numpy.numarray.asarray(row,type=self.elemtype) + self.f.seek(64+i*self.rowsize) + self.f.write(ar.tostring()) + + def appendRow(self,row): + if len(row)!=self.width: + raise TypeError('length of row ('+str(len(row))+ ') differs from matrix width ('+str(self.width)+')') + if self.swap_bytes: # must make a copy and swap bytes + ar = numpy.numarray.numarray(row,type=self.elemtype) + ar.byteswap(True) + else: # asarray makes a copy if not already a numarray of the right type + ar = numpy.numarray.asarray(row,type=self.elemtype) + + self.f.seek(64+self.length*self.rowsize) + self.f.write(ar.tostring()) + self.length += 1 + self.write_header() # update length in header + + def flush(self): + self.f.flush() + + def close(self): + if hasattr(self, 'f'): + self.f.close() + + def append(self,row): + self.appendRow(row) + + def __setitem__(self, i, row): + l = self.length + if i<0: i+=l + self.putRow(i,row) + + def __len__(self): + return self.length + + + +#copied from PLEARNDIR:python_modules/plearn/vmat/readAMat.py +def safefloat(str): + """Convert the given string to its float value. It is 'safe' in the sense + that missing values ('nan') will be properly converted to the corresponding + float value under all platforms, contrarily to 'float(str)'. + """ + if str.lower() == 'nan': + return fpconst.NaN + else: + return float(str) + +#copied from PLEARNDIR:python_modules/plearn/vmat/readAMat.py +def readAMat(amatname): + """Read a PLearn .amat file and return it as a numarray Array. + + Return a tuple, with as the first argument the array itself, and as + the second argument the fieldnames (list of strings). + """ + ### NOTE: this version is much faster than first creating the array and + ### updating each row as it is read... Bizarrely enough + f = open(amatname) + a = [] + fieldnames = [] + for line in f: + if line.startswith("#size:"): + (length,width) = line[6:].strip().split() + elif line.startswith("#sizes:"): # ignore input/target/weight/extra sizes + continue + + elif line.startswith("#:"): + fieldnames = line[2:].strip().split() + pass + elif not line.startswith('#'): + # Add all non-comment lines. 
+ row = [ safefloat(x) for x in line.strip().split() ] + if row: + a.append(row) + + f.close() + return numpy.numarray.array(a), fieldnames + + +if __name__ == '__main__': + pmat = PMat( 'tmp.pmat', 'w', fieldnames=['F1', 'F2'] ) + pmat.append( [1, 2] ) + pmat.append( [3, 4] ) + pmat.close() + + pmat = PMat( 'tmp.pmat', 'r' ) + ar=load_pmat_as_array('tmp.pmat') + ds=load_pmat_as_array_dataset('tmp.pmat') + + print "PMat",pmat + print "PMat",pmat[:] + print "array",ar + print "ArrayDataSet",ds + for i in ds: + print i + save_array_dataset_as_pmat("tmp2.pmat",ds) + ds2=load_pmat_as_array_dataset('tmp2.pmat') + for i in ds2: + print i + # print "+++ tmp.pmat contains: " + # os.system( 'plearn vmat cat tmp.pmat' ) + import shutil + for fname in ["tmp.pmat", "tmp2.pmat"]: + os.remove( fname ) + if os.path.exists( fname+'.metadata' ): + shutil.rmtree( fname+'.metadata' ) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/tests/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/tests/__init__.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,2 @@ + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/tests/test_filetensor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/tests/test_filetensor.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,121 @@ + +from pylearn.io import filetensor +import numpy + +import unittest +import os + +class T(unittest.TestCase): + fname = '/tmp/some_mat' + + def setUp(self): + #TODO: test that /tmp/some_mat does not exist + try: + os.stat(self.fname) + except OSError: + return #assume file was not found + raise Exception('autotest file "%s" exists!' % self.fname) + + def tearDown(self): + os.remove(self.fname) + + def test_file(self): + gen = numpy.random.rand(1) + f = file(self.fname, 'w'); + filetensor.write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = filetensor.read(f, None, debug=False) #load from filename + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_filename(self): + gen = numpy.random.rand(1) + f = file(self.fname, 'w') + filetensor.write(f, gen) + f.close() + f = file(self.fname, 'r') + mat = filetensor.read(f, None, debug=False) #load from filename + f.close() + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def testNd(self): + """shape and values are stored correctly for tensors of rank 0 to 5""" + whole_shape = [5, 6, 7, 8, 9] + for i in xrange(5): + gen = numpy.asarray(numpy.random.rand(*whole_shape[:i])) + f = file(self.fname, 'w'); + filetensor.write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = filetensor.read(f, None, debug=False) #load from filename + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_dtypes(self): + """shape and values are stored correctly for all dtypes """ + for dtype in filetensor._dtype_magic: + gen = numpy.asarray( + numpy.random.rand(4, 5, 2, 1) * 100, + dtype=dtype) + f = file(self.fname, 'w'); + filetensor.write(f, gen) + f.flush() + f = file(self.fname, 'r'); + mat = filetensor.read(f, None, debug=False) #load from filename + self.failUnless(gen.dtype == mat.dtype) + self.failUnless(gen.shape == mat.shape) + self.failUnless(numpy.all(gen == mat)) + + def test_dtype_invalid(self): + gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype + f = file(self.fname, 'w') + passed = False + try: + filetensor.write(f, gen) + except TypeError, e: + if e[0].startswith('Invalid ndarray dtype'): + passed = True + f.close() + self.failUnless(passed) + + +if 
__name__ == '__main__': + unittest.main() + + #a small test script, starts by reading sys.argv[1] + #print 'rval', rval.shape, rval.size + + if 0: + filetensor.write(f, rval) + print '' + f.close() + f = file('/tmp/some_mat', 'r'); + rval2 = filetensor.read(f) #load from file handle + print 'rval2', rval2.shape, rval2.size + + assert rval.dtype == rval2.dtype + assert rval.shape == rval2.shape + assert numpy.all(rval == rval2) + print 'ok' + + def _unused(): + f.seek(0,2) #seek to end + f_len = f.tell() + f.seek(f_data_start,0) #seek back to where we were + + if debug: print 'length:', f_len + + + f_data_bytes = (f_len - f_data_start) + + if debug: print 'data bytes according to header: ', dim_size * elsize + if debug: print 'data bytes according to file : ', f_data_bytes + + if debug: print 'reading data...' + sys.stdout.flush() + + def read_ndarray(f, dim, dtype): + return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/io/wavread.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/wavread.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,57 @@ +"""`WavRead` Op""" +__docformat__ = "restructuredtext en" + +import numpy +import theano +import wave + +class WavRead(theano.Op): + #TODO: add the samplerate as an output + """Read a wave file + + input - the path to a wave file + output - the contents of the wave file in pcm format, and the samplerate + + """ + + out_type = None + """The type for the output of this op. + + Currently only wvector (aka int16) and dvector (aka double) are supported + """ + + def __init__(self, out_type): + self.out_type = out_type + if out_type not in [theano.tensor.dvector, theano.tensor.wvector]: + raise TypeError(out_type) + def __eq__(self, other): + return (type(self) == type(other)) and (self.out_type == other.out_type) + def __hash__(self): + return hash(type(self)) ^ hash(self.out_type) + def make_node(self, path): + return theano.Apply(self, [path], [self.out_type(), theano.tensor.dscalar()]) + def perform(self, node, (path,), (out, sr)): + w = wave.open(path) + + if w.getnchannels() != 1: + raise NotImplementedError() + if w.getsampwidth() != 2: #2 bytes means 16bit samples + raise NotImplementedError() + + samples = numpy.frombuffer(w.readframes(w.getnframes()), dtype='int16') + + if self.out_type == theano.tensor.wvector: + out[0] = samples + elif self.out_type == theano.tensor.dvector: + out[0] = samples * (1.0 / 2**15) + else: + raise NotImplementedError() + + sr[0] = w.getframerate() + + def grad(self, inputs, g_output): + return [None for i in inputs] + +wav_read_int16 = WavRead(theano.tensor.wvector) +wav_read_double = WavRead(theano.tensor.dvector) + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/old_dataset/_test_dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/old_dataset/_test_dataset.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,680 @@ +#!/bin/env python +from dataset import * +from math import * +import numpy, unittest, sys +#from misc import * +from lookup_list import LookupList + +def have_raised(to_eval, **var): + have_thrown = False + try: + eval(to_eval) + except : + have_thrown = True + return have_thrown + +def have_raised2(f, *args, **kwargs): + have_thrown = False + try: + f(*args, **kwargs) + except : + have_thrown = True + return have_thrown + +def test1(): + print "test1" + global a,ds + a = numpy.random.rand(10,4) + print a + ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]}) + print "len(ds)=",len(ds) + assert(len(ds)==10) + print "example 0 
= ",ds[0] +# assert + print "x=",ds["x"] + print "x|y" + for x,y in ds("x","y"): + print x,y + minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4) + minibatch = minibatch_iterator.__iter__().next() + print "minibatch=",minibatch + for var in minibatch: + print "var=",var + print "take a slice and look at field y",ds[1:6:2]["y"] + + del a,ds,x,y,minibatch_iterator,minibatch,var + +def test_iterate_over_examples(array,ds): +#not in doc!!! + i=0 + for example in range(len(ds)): + wanted = array[example][:3] + returned = ds[example]['x'] + if (wanted != returned).all(): + print 'returned:', returned + print 'wanted:', wanted + assert (ds[example]['x']==array[example][:3]).all() + assert ds[example]['y']==array[example][3] + assert (ds[example]['z']==array[example][[0,2]]).all() + i+=1 + assert i==len(ds) + del example,i + +# - for example in dataset: + i=0 + for example in ds: + assert len(example)==3 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (example['z']==array[i][0:3:2]).all() + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i + +# - for val1,val2,... in dataset: + i=0 + for x,y,z in ds: + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (z==array[i][0:3:2]).all() + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,z,i + +# - for example in dataset(field1, field2,field3, ...): + i=0 + for example in ds('x','y','z'): + assert len(example)==3 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (example['z']==array[i][0:3:2]).all() + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i + i=0 + for example in ds('y','x'): + assert len(example)==2 + assert (example['x']==array[i][:3]).all() + assert example['y']==array[i][3] + assert (numpy.append(example['x'],example['y'])==array[i]).all() + i+=1 + assert i==len(ds) + del example,i + +# - for val1,val2,val3 in dataset(field1, field2,field3): + i=0 + for x,y,z in ds('x','y','z'): + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (z==array[i][0:3:2]).all() + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,z,i + i=0 + for y,x in ds('y','x',): + assert (x==array[i][:3]).all() + assert y==array[i][3] + assert (numpy.append(x,y)==array[i]).all() + i+=1 + assert i==len(ds) + del x,y,i + + def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished): + ##full minibatch or the last minibatch + for idx in range(nb_field): + test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished) + del idx + def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished): + assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)2: + ds[:1] + ds[1:1] + ds[1:1:1] + if len(ds)>5: + ds[[1,2,3]] + for x in ds: + pass + +#ds[:n] returns a LookupList with the n first examples. + ds2=ds[:3] + test_ds(ds,ds2,index=[0,1,2]) + del ds2 + +#ds[i:j] returns a LookupList with examples i,i+1,...,j-1. + ds2=ds[1:3] + test_ds(ds,ds2,index=[1,2]) + del ds2 + +#ds[i1:i2:s] returns a LookupList with the examples i1,i1+s,...i2-s. + ds2=ds[1:7:2] + test_ds(ds,ds2,[1,3,5]) + del ds2 + +#ds[i] returns the (i+1)-th example of the dataset. 
+ ds2=ds[5] + assert isinstance(ds2,Example) + test_ds(ds,ds2,[5]) + assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined + assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) + del ds2 + +#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. + ds2=ds[[4,7,2,8]] +# assert isinstance(ds2,DataSet) + test_ds(ds,ds2,[4,7,2,8]) + del ds2 + + #ds.# returns the value of a property associated with + #the name . The following properties should be supported: + # - 'description': a textual description or name for the ds + # - 'fieldtypes': a list of types (one per field) + + #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? + #assert hstack([ds('x','y'),ds('z')])==ds + #hstack([ds('z','y'),ds('x')])==ds + assert have_raised2(hstack,[ds('x'),ds('x')]) + assert have_raised2(hstack,[ds('y','x'),ds('x')]) + assert not have_raised2(hstack,[ds('x'),ds('y')]) + + # i=0 + # for example in hstack([ds('x'),ds('y'),ds('z')]): + # example==ds[i] + # i+=1 + # del i,example + #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? + +def test_subset(array,ds): + def test_ds(orig,ds,index): + i=0 + assert isinstance(ds2,DataSet) + assert len(ds)==len(index) + for x,z,y in ds('x','z','y'): + assert (orig[index[i]]['x']==array[index[i]][:3]).all() + assert (orig[index[i]]['x']==x).all() + assert orig[index[i]]['y']==array[index[i]][3] + assert orig[index[i]]['y']==y + assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all() + assert (orig[index[i]]['z']==z).all() + i+=1 + del i + ds[0] + if len(ds)>2: + ds[:1] + ds[1:1] + ds[1:1:1] + if len(ds)>5: + ds[[1,2,3]] + for x in ds: + pass + +#ds[:n] returns a dataset with the n first examples. + ds2=ds.subset[:3] + test_ds(ds,ds2,index=[0,1,2]) +# del ds2 + +#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s. + ds2=ds.subset[1:7:2] + test_ds(ds,ds2,[1,3,5]) +# del ds2 + +# #ds[i] +# ds2=ds.subset[5] +# assert isinstance(ds2,Example) +# assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined +# assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) +# del ds2 + +#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. + ds2=ds.subset[[4,7,2,8]] + test_ds(ds,ds2,[4,7,2,8]) +# del ds2 + +#ds.# returns the value of a property associated with + #the name . The following properties should be supported: + # - 'description': a textual description or name for the ds + # - 'fieldtypes': a list of types (one per field) + +#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? + #assert hstack([ds('x','y'),ds('z')])==ds + #hstack([ds('z','y'),ds('x')])==ds + assert have_raised2(hstack,[ds('x'),ds('x')]) + assert have_raised2(hstack,[ds('y','x'),ds('x')]) + assert not have_raised2(hstack,[ds('x'),ds('y')]) + +# i=0 +# for example in hstack([ds('x'),ds('y'),ds('z')]): +# example==ds[i] +# i+=1 +# del i,example +#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? 
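+
+# A minimal indexing sketch, kept separate from the test functions above.  It
+# assumes the same ArrayDataSet interface exercised by test_getitem and
+# test_subset; the helper name and the literal indices are illustrative only.
+def _indexing_sketch():
+    a = numpy.random.rand(10,4)
+    ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]})
+    assert len(ds)==10
+    # ds[i] -> a single Example whose fields come from row i
+    assert (ds[5]['x']==a[5][:3]).all()
+    assert ds[5]['y']==a[5][3]
+    # ds[i1:i2:s] -> a LookupList over examples i1, i1+s, ... (here 1, 3, 5)
+    some = ds[1:7:2]
+    # ds.subset[[i1,...,in]] -> a DataSet restricted to those examples
+    sub = ds.subset[[4,7,2,8]]
+    assert len(sub)==4
+    del a,ds,some,sub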
+ +def test_fields_fct(ds): + #@todo, fill correctly + assert len(ds.fields())==3 + i=0 + v=0 + for field in ds.fields(): + for field_value in field: # iterate over the values associated to that field for all the ds examples + v+=1 + i+=1 + assert i==3 + assert v==3*10 + del i,v + + i=0 + v=0 + for field in ds('x','z').fields(): + i+=1 + for val in field: + v+=1 + assert i==2 + assert v==2*10 + del i,v + + i=0 + v=0 + for field in ds.fields('x','y'): + i+=1 + for val in field: + v+=1 + assert i==2 + assert v==2*10 + del i,v + + i=0 + v=0 + for field_examples in ds.fields(): + for example_value in field_examples: + v+=1 + i+=1 + assert i==3 + assert v==3*10 + del i,v + + assert ds == ds.fields().examples() + assert len(ds('x','y').fields()) == 2 + assert len(ds('x','z').fields()) == 2 + assert len(ds('y').fields()) == 1 + + del field + +def test_overrides(ds) : + """ Test for examples that an override __getitem__ acts as the one in DataSet """ + def ndarray_list_equal(nda,l) : + """ + Compares if a ndarray is the same as the list. Do it by converting the list into + an numpy.ndarray, if possible + """ + try : + l = numpy.asmatrix(l) + except : + return False + return smart_equal(nda,l) + + def smart_equal(a1,a2) : + """ + Handles numpy.ndarray, LookupList, and basic containers + """ + if not isinstance(a1,type(a2)) and not isinstance(a2,type(a1)): + #special case: matrix vs list of arrays + if isinstance(a1,numpy.ndarray) : + return ndarray_list_equal(a1,a2) + elif isinstance(a2,numpy.ndarray) : + return ndarray_list_equal(a2,a1) + return False + # compares 2 numpy.ndarray + if isinstance(a1,numpy.ndarray): + if len(a1.shape) != len(a2.shape): + return False + for k in range(len(a1.shape)) : + if a1.shape[k] != a2.shape[k]: + return False + return (a1==a2).all() + # compares 2 lookuplists + if isinstance(a1,LookupList) : + if len(a1._names) != len(a2._names) : + return False + for k in a1._names : + if k not in a2._names : + return False + if not smart_equal(a1[k],a2[k]) : + return False + return True + # compares 2 basic containers + if hasattr(a1,'__len__'): + if len(a1) != len(a2) : + return False + for k in range(len(a1)) : + if not smart_equal(a1[k],a2[k]): + return False + return True + # try basic equals + return a1 is a2 + + def mask(ds) : + class TestOverride(type(ds)): + def __init__(self,ds) : + self.ds = ds + def __getitem__(self,key) : + res1 = self.ds[key] + res2 = DataSet.__getitem__(ds,key) + assert smart_equal(res1,res2) + return res1 + return TestOverride(ds) + # test getitem + ds2 = mask(ds) + for k in range(10): + res = ds2[k] + res = ds2[1:len(ds):3] + + + + + + +def test_all(array,ds): + assert len(ds)==10 + test_iterate_over_examples(array, ds) + test_overrides(ds) + test_getitem(array, ds) + test_subset(array, ds) + test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z')) + test_fields_fct(ds) + + +class T_DataSet(unittest.TestCase): + def test_ArrayDataSet(self): + #don't test stream + #tested only with float value + #don't always test with y + #don't test missing value + #don't test with tuple + #don't test proterties + a2 = numpy.random.rand(10,4) + ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested + ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + #assert ds==a? should this work? 
+ + test_all(a2,ds) + + del a2, ds + + def test_CachedDataSet(self): + a = numpy.random.rand(10,4) + ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + ds2 = CachedDataSet(ds1) + ds3 = CachedDataSet(ds1,cache_all_upon_construction=True) + + test_all(a,ds2) + test_all(a,ds3) + + del a,ds1,ds2,ds3 + + + def test_DataSetFields(self): + raise NotImplementedError() + + def test_ApplyFunctionDataSet(self): + a = numpy.random.rand(10,4) + a2 = a+1 + ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested + + ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False) + ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1), + ['x','y','z'], + minibatch_mode=True) + + test_all(a2,ds2) + test_all(a2,ds3) + + del a,ds1,ds2,ds3 + + def test_FieldsSubsetDataSet(self): + a = numpy.random.rand(10,4) + ds = ArrayDataSet(a,Example(['x','y','z','w'],[slice(3),3,[0,2],0])) + ds = FieldsSubsetDataSet(ds,['x','y','z']) + + test_all(a,ds) + + del a, ds + + def test_RenamedFieldsDataSet(self): + a = numpy.random.rand(10,4) + ds = ArrayDataSet(a,Example(['x1','y1','z1','w1'],[slice(3),3,[0,2],0])) + ds = RenamedFieldsDataSet(ds,['x1','y1','z1'],['x','y','z']) + + test_all(a,ds) + + del a, ds + + def test_MinibatchDataSet(self): + raise NotImplementedError() + def test_HStackedDataSet(self): + raise NotImplementedError() + def test_VStackedDataSet(self): + raise NotImplementedError() + def test_ArrayFieldsDataSet(self): + raise NotImplementedError() + + +class T_Exotic1(unittest.TestCase): + class DataSet(DataSet): + """ Dummy dataset, where one field is a ndarray of variables size. """ + def __len__(self) : + return 100 + def fieldNames(self) : + return 'input','target','name' + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class MultiLengthDataSetIterator(object): + def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): + if fieldnames is None: fieldnames = dataset.fieldNames() + self.minibatch = Example(fieldnames,range(len(fieldnames))) + self.dataset, self.minibatch_size, self.current = dataset, minibatch_size, offset + def __iter__(self): + return self + def next(self): + for k in self.minibatch._names : + self.minibatch[k] = [] + for ex in range(self.minibatch_size) : + if 'input' in self.minibatch._names: + self.minibatch['input'].append( numpy.array( range(self.current + 1) ) ) + if 'target' in self.minibatch._names: + self.minibatch['target'].append( self.current % 2 ) + if 'name' in self.minibatch._names: + self.minibatch['name'].append( str(self.current) ) + self.current += 1 + return self.minibatch + return MultiLengthDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) + + def test_ApplyFunctionDataSet(self): + ds = T_Exotic1.DataSet() + dsa = ApplyFunctionDataSet(ds,lambda x,y,z: (x[-1],y*10,int(z)),['input','target','name'],minibatch_mode=False) #broken!!!!!! 
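+        # Each example k of the dummy dataset has 'input' == [0, 1, ..., k],
+        # so the scalar produced by the applied function (x[-1]) must equal
+        # the last element of the original 'input'; the loop below checks
+        # this for every example.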
+ for k in range(len(dsa)): + res = dsa[k] + self.failUnless(ds[k]('input')[0][-1] == res('input')[0] , 'problem in first applied function') + res = dsa[33:96:3] + + def test_CachedDataSet(self): + ds = T_Exotic1.DataSet() + dsc = CachedDataSet(ds) + for k in range(len(dsc)) : + self.failUnless(numpy.all( dsc[k]('input')[0] == ds[k]('input')[0] ) , (dsc[k],ds[k]) ) + res = dsc[:] + +if __name__=='__main__': + tests = [] + debug=False + if len(sys.argv)==1: + unittest.main() + else: + assert sys.argv[1]=="--debug" + for arg in sys.argv[2:]: + tests.append(arg) + if tests: + unittest.TestSuite(map(T_DataSet, tests)).debug() + else: + module = __import__("_test_dataset") + tests = unittest.TestLoader().loadTestsFromModule(module) + tests.debug() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/old_dataset/_test_lookup_list.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/old_dataset/_test_lookup_list.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,24 @@ +from lookup_list import * +import unittest + +class T_LookUpList(unittest.TestCase): + def test_LookupList(self): + #test only the example in the doc??? + example = LookupList(['x','y','z'],[1,2,3]) + example['x'] = [1, 2, 3] # set or change a field + x, y, z = example + x = example[0] + x = example["x"] + assert example.keys()==['x','y','z'] + assert example.values()==[[1,2,3],2,3] + assert example.items()==[('x',[1,2,3]),('y',2),('z',3)] + example.append_keyval('u',0) # adds item with name 'u' and value 0 + assert len(example)==4 # number of items = 4 here + example2 = LookupList(['v','w'], ['a','b']) + example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b']) + assert example+example2==example3 + self.assertRaises(AssertionError,example.__add__,example) + del example, example2, example3, x, y ,z + +if __name__=='__main__': + unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/old_dataset/dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/old_dataset/dataset.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,1533 @@ + +from lookup_list import LookupList as Example +from common.misc import unique_elements_list_intersection +from string import join +from sys import maxint +import numpy, copy + +from exceptions import * + +class AttributesHolder(object): + def __init__(self): pass + + def attributeNames(self): + raise AbstractFunction() + + def setAttributes(self,attribute_names,attribute_values,make_copies=False): + """ + Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1. + """ + if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ): + attribute_values = [attribute_values] + if make_copies: + for name,value in zip(attribute_names,attribute_values): + self.__setattr__(name,copy.deepcopy(value)) + else: + for name,value in zip(attribute_names,attribute_values): + self.__setattr__(name,value) + + def getAttributes(self,attribute_names=None, return_copy=False): + """ + Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes. + """ + if attribute_names is None: + attribute_names = self.attributeNames() + if return_copy: + return [copy.copy(self.__getattribute__(name)) for name in attribute_names] + else: + return [self.__getattribute__(name) for name in attribute_names] + +class DataSet(AttributesHolder): + """A virtual base class for datasets. 
+ + A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction + with learning algorithms (for training and testing them): rows/records are called examples, and + columns/attributes are called fields. The field value for a particular example can be an arbitrary + python object, which depends on the particular dataset. + + We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method + should return sys.maxint). + + A DataSet is a generator of iterators; these iterators can run through the + examples or the fields in a variety of ways. A DataSet need not necessarily have a finite + or known length, so this class can be used to interface to a 'stream' which + feeds on-line learning (however, as noted below, some operations are not + feasible or not recommended on streams). + + To iterate over examples, there are several possibilities: + - for example in dataset: + - for val1,val2,... in dataset: + - for example in dataset(field1, field2,field3, ...): + - for val1,val2,val3 in dataset(field1, field2,field3): + - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): + - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): + Each of these is documented below. All of these iterators are expected + to provide, in addition to the usual 'next()' method, a 'next_index()' method + which returns a non-negative integer pointing to the position of the next + example that will be returned by 'next()' (or of the first example in the + next minibatch returned). This is important because these iterators + can wrap around the dataset in order to do multiple passes through it, + in possibly unregular ways if the minibatch size is not a divisor of the + dataset length. + + To iterate over fields, one can do + - for field in dataset.fields(): + for field_value in field: # iterate over the values associated to that field for all the dataset examples + - for field in dataset(field1,field2,...).fields() to select a subset of fields + - for field in dataset.fields(field1,field2,...) to select a subset of fields + and each of these fields is iterable over the examples: + - for field_examples in dataset.fields(): + for example_value in field_examples: + ... + but when the dataset is a stream (unbounded length), it is not recommended to do + such things because the underlying dataset may refuse to access the different fields in + an unsynchronized ways. Hence the fields() method is illegal for streams, by default. + The result of fields() is a L{DataSetFields} object, which iterates over fields, + and whose elements are iterable over examples. A DataSetFields object can + be turned back into a DataSet with its examples() method:: + dataset2 = dataset1.fields().examples() + and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). + + Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. + + Note: The content of a field can be of any type. Field values can also be 'missing' + (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array) + fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value. + What about non-numeric values? None. + + Dataset elements can be indexed and sub-datasets (with a subset + of examples) can be extracted. These operations are not supported + by default in the case of streams. + + - dataset[:n] returns an Example with the n first examples. 
+ + - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s. + + - dataset[i] returns an Example. + + - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in. + + A similar command gives you a DataSet instead of Examples : + + - dataset.subset[:n] returns a DataSet with the n first examples. + + - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s. + + - dataset.subset[i] returns a DataSet. + + - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in. + + + - dataset. returns the value of a property associated with + the name . The following properties should be supported: + - 'description': a textual description or name for the dataset + - 'fieldtypes': a list of types (one per field) + A DataSet may have other attributes that it makes visible to other objects. These are + used to store information that is not example-wise but global to the dataset. + The list of names of these attributes is given by the attribute_names() method. + + Datasets can be concatenated either vertically (increasing the length) or + horizontally (augmenting the set of fields), if they are compatible, using + the following operations (with the same basic semantics as numpy.hstack + and numpy.vstack): + + - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3]) + + creates a new dataset whose list of fields is the concatenation of the list of + fields of the argument datasets. This only works if they all have the same length. + + - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3]) + + creates a new dataset that concatenates the examples from the argument datasets + (and whose length is the sum of the length of the argument datasets). This only + works if they all have the same fields. + + According to the same logic, and viewing a DataSetFields object associated to + a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of + a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their + examples. + + A dataset can hold arbitrary key-value pairs that may be used to access meta-data + or other properties of the dataset or associated with the dataset or the result + of a computation stored in a dataset. These can be accessed through the [key] syntax + when key is a string (or more specifically, neither an integer, a slice, nor a list). + + A DataSet sub-class should always redefine the following methods: + - __len__ if it is not a stream + - fieldNames + - minibatches_nowrap (called by DataSet.minibatches()) + For efficiency of implementation, a sub-class might also want to redefine + - valuesHStack + - valuesVStack + - hasFields + - __getitem__ may not be feasible with some streams + - __iter__ + A sub-class should also append attributes to self._attribute_names + (the default value returned by attributeNames()). + By convention, attributes not in attributeNames() should have a name + starting with an underscore. + @todo enforce/test that convention! 
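+
+    A minimal usage sketch (illustrative only; it assumes a concrete
+    sub-class such as ArrayDataSet with fields 'x' and 'y')::
+
+        for x, y in dataset('x', 'y'):                 # example-wise iteration
+            pass
+        for batch in dataset.minibatches(['x','y'], minibatch_size=5):
+            xs, ys = batch['x'], batch['y']            # list-like containers of 5 values each
+        first = dataset[0]                             # a single Example
+        head = dataset[:10]                            # an Example holding the 10 first examples
+        head_ds = dataset.subset[:10]                  # the same examples, as a DataSet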
+ """ + + numpy_vstack = lambda fieldname,values: numpy.vstack(values) + numpy_hstack = lambda fieldnames,values: numpy.hstack(values) + + def __init__(self, description=None, fieldnames=None, fieldtypes=None): + """ + @type fieldnames: list of strings + @type fieldtypes: list of python types, same length as fieldnames + @type description: string + @param description: description/name for this dataset + """ + def default_desc(): + return type(self).__name__ \ + + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" + + #self.fieldnames = fieldnames + + self.fieldtypes = fieldtypes if fieldtypes is not None \ + else [None]*1 #len(fieldnames) + + self.description = default_desc() if description is None \ + else description + self._attribute_names = ["description"] + + + attributeNames = property(lambda self: copy.copy(self._attribute_names)) + + def __contains__(self, fieldname): + return (fieldname in self.fieldNames()) \ + or (fieldname in self.attributeNames()) + + def __iter__(self): + """Supports the syntax "for i in dataset: ..." + + Using this syntax, "i" will be an Example instance (or equivalent) with + all the fields of DataSet self. Every field of "i" will give access to + a field of a single example. Fields should be accessible via + i["fielname"] or i[3] (in the order defined by the elements of the + Example returned by this iterator), but the derived class is free + to accept any type of identifier, and add extra functionality to the iterator. + + The default implementation calls the minibatches iterator and extracts the first example of each field. + """ + return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) + + def __len__(self): + """ + len(dataset) returns the number of examples in the dataset. + By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). + Sub-classes which implement finite-length datasets should redefine this method. + Some methods only make sense for finite-length datasets. + """ + from sys import maxint + return maxint + + + class MinibatchToSingleExampleIterator(object): + """ + Converts the result of minibatch iterator with minibatch_size==1 into + single-example values in the result. Therefore the result of + iterating on the dataset itself gives a sequence of single examples + (whereas the result of iterating over minibatches gives in each + Example field an iterable object over the individual examples in + the minibatch). + """ + def __init__(self, minibatch_iterator): + self.minibatch_iterator = minibatch_iterator + self.minibatch = None + def __iter__(self): #makes for loop work + return self + def next(self): + size1_minibatch = self.minibatch_iterator.next() + if not self.minibatch: + names = size1_minibatch.keys() + # next lines are a hack, but there was problem when we were getting [array(327)] for instance + try: + values = [value[0] for value in size1_minibatch.values()] + except : + values = [value for value in size1_minibatch.values()] + self.minibatch = Example(names,values) + else: + self.minibatch._values = [value[0] for value in size1_minibatch.values()] + return self.minibatch + + def next_index(self): + return self.minibatch_iterator.next_index() + + class MinibatchWrapAroundIterator(object): + """ + An iterator for minibatches that handles the case where we need to wrap around the + dataset because n_batches*minibatch_size > len(dataset). It is constructed from + a dataset that provides a minibatch iterator that does not need to handle that problem. 
+ This class is a utility for dataset subclass writers, so that they do not have to handle + this issue multiple times, nor check that fieldnames are valid, nor handle the + empty fieldnames (meaning 'use all the fields'). + """ + def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): + self.dataset=dataset + self.fieldnames=fieldnames + self.minibatch_size=minibatch_size + self.n_batches=n_batches + self.n_batches_done=0 + self.next_row=offset + self.L=len(dataset) + self.offset=offset % self.L + ds_nbatches = (self.L-self.next_row)/self.minibatch_size + if n_batches is not None: + ds_nbatches = min(n_batches,ds_nbatches) + if fieldnames: + assert dataset.hasFields(*fieldnames) + else: + self.fieldnames=dataset.fieldNames() + self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row) + + def __iter__(self): + return self + + def next_index(self): + return self.next_row + + def next(self): + if self.n_batches and self.n_batches_done==self.n_batches: + raise StopIteration + elif not self.n_batches and self.next_row ==self.L: + raise StopIteration + upper = self.next_row+self.minibatch_size + if upper <=self.L: + minibatch = self.iterator.next() + else: + if not self.n_batches: + upper=min(upper, self.L) + # if their is not a fixed number of batch, we continue to the end of the dataset. + # this can create a minibatch that is smaller then the minibatch_size + assert (self.L-self.next_row)<=self.minibatch_size + minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() + else: + # we must concatenate (vstack) the bottom and top parts of our minibatch + # first get the beginning of our minibatch (top of dataset) + first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() + second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next() + minibatch = Example(self.fieldnames, + [self.dataset.valuesVStack(name,[first_part[name],second_part[name]]) + for name in self.fieldnames]) + self.next_row=upper + self.n_batches_done+=1 + if upper >= self.L and self.n_batches: + self.next_row -= self.L + ds_nbatches = (self.L-self.next_row)/self.minibatch_size + if self.n_batches is not None: + ds_nbatches = min(self.n_batches,ds_nbatches) + self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, + ds_nbatches,self.next_row) + return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack, + self.dataset.valuesHStack), + minibatch.keys()) + + + minibatches_fieldnames = None + minibatches_minibatch_size = 1 + minibatches_n_batches = None + def minibatches(self, + fieldnames = minibatches_fieldnames, + minibatch_size = minibatches_minibatch_size, + n_batches = minibatches_n_batches, + offset = 0): + """ + Return an iterator that supports three forms of syntax: + + for i in dataset.minibatches(None,**kwargs): ... + + for i in dataset.minibatches([f1, f2, f3],**kwargs): ... + + for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... + + Using the first two syntaxes, "i" will be an indexable object, such as a list, + tuple, or Example instance. In both cases, i[k] is a list-like container + of a batch of current examples. In the second case, i[0] is + list-like container of the f1 field of a batch current examples, i[1] is + a list-like container of the f2 field, etc. + + Using the first syntax, all the fields will be returned in "i". 
+ Using the third syntax, i1, i2, i3 will be list-like containers of the + f1, f2, and f3 fields of a batch of examples on each loop iteration. + + The minibatches iterator is expected to return upon each call to next() + a DataSetFields object, which is a Example (indexed by the field names) whose + elements are iterable and indexable over the minibatch examples, and which keeps a pointer to + a sub-dataset that can be used to iterate over the individual examples + in the minibatch. Hence a minibatch can be converted back to a regular + dataset or its fields can be looked at individually (and possibly iterated over). + + PARAMETERS + - fieldnames (list of any type, default None): + The loop variables i1, i2, i3 (in the example above) should contain the + f1, f2, and f3 fields of the current batch of examples. If None, the + derived class can choose a default, e.g. all fields. + + - minibatch_size (integer, default 1) + On every iteration, the variables i1, i2, i3 will have + exactly minibatch_size elements. e.g. len(i1) == minibatch_size + + @DEPRECATED n_batches : not used anywhere + - n_batches (integer, default None) + The iterator will loop exactly this many times, and then stop. If None, + the derived class can choose a default. If (-1), then the returned + iterator should support looping indefinitely. + + - offset (integer, default 0) + The iterator will start at example 'offset' in the dataset, rather than the default. + + Note: A list-like container is something like a tuple, list, numpy.ndarray or + any other object that supports integer indexing and slicing. + + @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete + batches only, raise StopIteration. + @ATTENTION: minibatches returns a LookupList, we can't iterate over examples on it. + + """ + #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset) + assert offset >= 0 + assert offset < len(self) + assert offset + minibatch_size -1 < len(self) + if fieldnames == None : + fieldnames = self.fieldNames() + return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + """ + This is the minibatches iterator generator that sub-classes must define. + It does not need to worry about wrapping around multiple times across the dataset, + as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called. + The next() method of the returned iterator does not even need to worry about + the termination condition (as StopIteration will be raised by DataSet.minibatches + before an improper call to minibatches_nowrap's next() is made). + That next() method can assert that its next row will always be within [0,len(dataset)). + The iterator returned by minibatches_nowrap does not need to implement + a next_index() method either, as this will be provided by MinibatchWrapAroundIterator. + """ + raise AbstractFunction() + + def is_unbounded(self): + """ + Tests whether a dataset is unbounded (e.g. a stream). + """ + return len(self)==maxint + + def hasFields(self,*fieldnames): + """ + Return true if the given field name (or field names, if multiple arguments are + given) is recognized by the DataSet (i.e. can be used as a field name in one + of the iterators). + + The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames() + method. 
Many datasets may store their field names in a dictionary, which would allow more efficiency. + """ + return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0 + + def fieldNames(self): + """ + Return the list of field names that are supported by the iterators, + and for which hasFields(fieldname) would return True. + """ + raise AbstractFunction() + + def __call__(self,*fieldnames): + """ + Return a dataset that sees only the fields whose name are specified. + """ + assert self.hasFields(*fieldnames) + #return self.fields(*fieldnames).examples() + fieldnames_list = list(fieldnames) + return FieldsSubsetDataSet(self,fieldnames_list) + + def cached_fields_subset(self,*fieldnames) : + """ + Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached. + @see : dataset.__call__ + """ + assert self.hasFields(*fieldnames) + return self.fields(*fieldnames).examples() + + def fields(self,*fieldnames): + """ + Return a DataSetFields object associated with this dataset. + """ + return DataSetFields(self,fieldnames) + + def getitem_key(self, fieldname): + """A not-so-well thought-out place to put code that used to be in + getitem. + """ + #removing as per discussion June 4. --JSB + + i = fieldname + # else check for a fieldname + if self.hasFields(i): + return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0] + # else we are trying to access a property of the dataset + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] + + def __getitem__(self,i): + """ + @rtype: Example + @returns: single or multiple examples + + @type i: integer or slice or of integers + @param i: + dataset[i] returns the (i+1)-th example of the dataset. + dataset[i:j] returns a LookupList with examples i,i+1,...,j-1. + dataset[i:j:s] returns a LookupList with examples i,i+2,i+4...,j-2. + dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in. + + @note: + Some stream datasets may be unable to implement random access, i.e. + arbitrary slicing/indexing because they can only iterate through + examples one or a minibatch at a time and do not actually store or keep + past (or future) examples. + + The default implementation of getitem uses the minibatches iterator + to obtain one example, one slice, or a list of examples. It may not + always be the most efficient way to obtain the result, especially if + the data are actually stored in a memory array. 
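+
+        For instance (illustrative; the indices are arbitrary)::
+
+            one  = dataset[3]              # a single Example
+            some = dataset[[0, 2, 4]]      # one Example stacking examples 0, 2 and 4
+
+        Note that the default implementation only accepts non-negative indices.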
+ """ + + if type(i) is int: + assert i >= 0 # TBM: see if someone complains and want negative i + if i >= len(self) : + raise IndexError + i_batch = self.minibatches_nowrap(self.fieldNames(), + minibatch_size=1, n_batches=1, offset=i) + return DataSet.MinibatchToSingleExampleIterator(i_batch).next() + + #if i is a contiguous slice + if type(i) is slice and (i.step in (None, 1)): + offset = 0 if i.start is None else i.start + upper_bound = len(self) if i.stop is None else i.stop + upper_bound = min(len(self) , upper_bound) + #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(), + # minibatch_size=upper_bound - offset, + # n_batches=1, + # offset=offset).next()) + # now returns a LookupList + return self.minibatches_nowrap(self.fieldNames(), + minibatch_size=upper_bound - offset, + n_batches=1, + offset=offset).next() + + # if slice has a step param, convert it to list and handle it with the + # list code + if type(i) is slice: + offset = 0 if i.start is None else i.start + upper_bound = len(self) if i.stop is None else i.stop + upper_bound = min(len(self) , upper_bound) + i = list(range(offset, upper_bound, i.step)) + + # handle tuples, arrays, lists + if hasattr(i, '__getitem__'): + for idx in i: + #dis-allow nested slices + if not isinstance(idx, int): + raise TypeError(idx) + if idx >= len(self) : + raise IndexError + # call back into self.__getitem__ + examples = [self.minibatches_nowrap(self.fieldNames(), + minibatch_size=1, n_batches=1, offset=ii).next() + for ii in i] + # re-index the fields in each example by field instead of by example + field_values = [[] for blah in self.fieldNames()] + for e in examples: + for f,v in zip(field_values, e): + f.append(v) + #build them into a LookupList (a.ka. Example) + zz = zip(self.fieldNames(),field_values) + vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz] + example = Example(self.fieldNames(), vst) + #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack) + # now returns a LookupList + return example + + # what in the world is i? + raise TypeError(i, type(i)) + + + """ + Enables the call dataset.subset[a:b:c] that will return a DataSet + around the examples returned by __getitem__(slice(a,b,c)) + + @SEE DataSet.__getsubset(self) + """ + subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet") + + + def __getsubset(self) : + """ + Enables the call data.subset[a:b:c], returns a DataSet. + Default implementation is a simple wrap around __getitem__() using MinibatchDataSet. + + @RETURN DataSet + @SEE DataSet.subset = property(lambda s : s.__getsubset()) + """ + _self = self + class GetSliceReturnsDataSet(object) : + def __getitem__(self,slice) : + return MinibatchDataSet(_self.__getitem__(slice)) + return GetSliceReturnsDataSet() + + + + def valuesHStack(self,fieldnames,fieldvalues): + """ + Return a value that corresponds to concatenating (horizontally) several field values. + This can be useful to merge some fields. The implementation of this operation is likely + to involve a copy of the original values. When the values are numpy arrays, the + result should be numpy.hstack(values). If it makes sense, this operation should + work as well when each value corresponds to multiple examples in a minibatch + e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix, + then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values). 
+ The default is to use numpy.hstack for numpy.ndarray values, and a list + pointing to the original values for other data types. + """ + all_numpy=True + for value in fieldvalues: + if not type(value) is numpy.ndarray: + all_numpy=False + if all_numpy: + return numpy.hstack(fieldvalues) + # the default implementation of horizontal stacking is to put values in a list + return fieldvalues + + def valuesVStack(self,fieldname,values): + """ + @param fieldname: the name of the field from which the values were taken + @type fieldname: any type + + @param values: bits near the beginning or end of the dataset + @type values: list of minibatches (returned by minibatches_nowrap) + + @return: the concatenation (stacking) of the values + @rtype: something suitable as a minibatch field + """ + rval = [] + for v in values: + rval.extend(v) + return rval + + def __or__(self,other): + """ + dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of + fields of the argument datasets. This only works if they all have the same length. + """ + return HStackedDataSet([self,other]) + + def __and__(self,other): + """ + dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets + (and whose length is the sum of the length of the argument datasets). This only + works if they all have the same fields. + """ + return VStackedDataSet([self,other]) + +def hstack(datasets): + """ + hstack(dataset1,dataset2,...) returns dataset1 | datataset2 | ... + which is a dataset whose fields list is the concatenation of the fields + of the individual datasets. + """ + assert len(datasets)>0 + if len(datasets)==1: + return datasets[0] + return HStackedDataSet(datasets) + +def vstack(datasets): + """ + vstack(dataset1,dataset2,...) returns dataset1 & datataset2 & ... + which is a dataset which iterates first over the examples of dataset1, then + over those of dataset2, etc. + """ + assert len(datasets)>0 + if len(datasets)==1: + return datasets[0] + return VStackedDataSet(datasets) + +class FieldsSubsetDataSet(DataSet): + """ + A sub-class of L{DataSet} that selects a subset of the fields. + """ + def __init__(self,src,fieldnames): + self.src=src + self.fieldnames=fieldnames + assert src.hasFields(*fieldnames) + self.valuesHStack = src.valuesHStack + self.valuesVStack = src.valuesVStack + + def __len__(self): return len(self.src) + + def fieldNames(self): + return self.fieldnames + + def __iter__(self): + class FieldsSubsetIterator(object): + def __init__(self,ds): + self.ds=ds + self.src_iter=ds.src.__iter__() + self.example=None + def __iter__(self): return self + def next(self): + complete_example = self.src_iter.next() + if self.example: + self.example._values=[complete_example[field] + for field in self.ds.fieldnames] + else: + self.example=Example(self.ds.fieldnames, + [complete_example[field] for field in self.ds.fieldnames]) + return self.example + return FieldsSubsetIterator(self) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + assert self.hasFields(*fieldnames) + return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) + def dontuse__getitem__(self,i): + return FieldsSubsetDataSet(self.src[i],self.fieldnames) + +class RenamedFieldsDataSet(DataSet): + """ + A sub-class of L{DataSet} that selects and renames a subset of the fields. 
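# Sketch of the stacking operators defined above (illustrative; ds1 and ds2 are
# hypothetical DataSets): '|' concatenates the fields of equal-length datasets
# with distinct field names, '&' concatenates the examples of datasets sharing
# the same fields, and hstack()/vstack() are the n-ary forms of the same operations.
wider = ds1 | ds2              # equivalent to hstack([ds1, ds2])
longer = ds1 & ds2             # equivalent to vstack([ds1, ds2])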
+ """ + def __init__(self,src,src_fieldnames,new_fieldnames): + self.src=src + self.src_fieldnames=src_fieldnames + self.new_fieldnames=new_fieldnames + assert src.hasFields(*src_fieldnames) + assert len(src_fieldnames)==len(new_fieldnames) + self.valuesHStack = src.valuesHStack + self.valuesVStack = src.valuesVStack + self.lookup_fields = Example(new_fieldnames,src_fieldnames) + + def __len__(self): return len(self.src) + + def fieldNames(self): + return self.new_fieldnames + + def __iter__(self): + class FieldsSubsetIterator(object): + def __init__(self,ds): + self.ds=ds + self.src_iter=ds.src.__iter__() + self.example=None + def __iter__(self): return self + def next(self): + complete_example = self.src_iter.next() + if self.example: + self.example._values=[complete_example[field] + for field in self.ds.src_fieldnames] + else: + self.example=Example(self.ds.new_fieldnames, + [complete_example[field] + for field in self.ds.src_fieldnames]) + return self.example + return FieldsSubsetIterator(self) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + assert self.hasFields(*fieldnames) + cursor = Example(fieldnames,[0]*len(fieldnames)) + for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset): + cursor._values=batch._values + yield cursor + + def __getitem__(self,i): +# return FieldsSubsetDataSet(self.src[i],self.new_fieldnames) + complete_example = self.src[i] + return Example(self.new_fieldnames, + [complete_example[field] + for field in self.src_fieldnames]) + + + +class DataSetFields(Example): + """ + Although a L{DataSet} iterates over examples (like rows of a matrix), an associated + DataSetFields iterates over fields (like columns of a matrix), and can be understood + as a transpose of the associated dataset. + + To iterate over fields, one can do + * for fields in dataset.fields() + * for fields in dataset(field1,field2,...).fields() to select a subset of fields + * for fields in dataset.fields(field1,field2,...) to select a subset of fields + and each of these fields is iterable over the examples: + * for field_examples in dataset.fields(): + for example_value in field_examples: + ... + but when the dataset is a stream (unbounded length), it is not recommended to do + such things because the underlying dataset may refuse to access the different fields in + an unsynchronized ways. Hence the fields() method is illegal for streams, by default. + The result of fields() is a DataSetFields object, which iterates over fields, + and whose elements are iterable over examples. A DataSetFields object can + be turned back into a DataSet with its examples() method: + dataset2 = dataset1.fields().examples() + and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). + + DataSetFields can be concatenated vertically or horizontally. To be consistent with + the syntax used for DataSets, the | concatenates the fields and the & concatenates + the examples. 
+ """ + def __init__(self,dataset,fieldnames): + original_dataset=dataset + if not fieldnames: + fieldnames=dataset.fieldNames() + elif not list(fieldnames)==list(dataset.fieldNames()): + #we must cast to list, othersize('x','y')!=['x','y'] + dataset = FieldsSubsetDataSet(dataset,fieldnames) + assert dataset.hasFields(*fieldnames) + self.dataset=dataset + + if isinstance(dataset,MinibatchDataSet): + Example.__init__(self,fieldnames,list(dataset._fields)) + elif isinstance(original_dataset,MinibatchDataSet): + Example.__init__(self,fieldnames, + [original_dataset._fields[field] + for field in fieldnames]) + else: + minibatch_iterator = dataset.minibatches(fieldnames, + minibatch_size=len(dataset), + n_batches=1) + minibatch=minibatch_iterator.next() + Example.__init__(self,fieldnames,minibatch) + + def examples(self): + return self.dataset + + def __or__(self,other): + """ + fields1 | fields2 is a DataSetFields that whose list of examples is the concatenation + of the list of examples of DataSetFields fields1 and fields2. + """ + return (self.examples() + other.examples()).fields() + + def __and__(self,other): + """ + fields1 + fields2 is a DataSetFields that whose list of fields is the concatenation + of the fields of DataSetFields fields1 and fields2. + """ + return (self.examples() | other.examples()).fields() + + +class MinibatchDataSet(DataSet): + """ + Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset. + Each element of the lookup-list should be an iterable and sliceable, all of the same length. + """ + def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, + values_hstack=DataSet().valuesHStack): + """ + The user can (and generally should) also provide values_vstack(fieldname,fieldvalues) + and a values_hstack(fieldnames,fieldvalues) functions behaving with the same + semantics as the DataSet methods of the same name (but without the self argument). 
+ """ + + self._fields=fields_lookuplist + assert len(fields_lookuplist)>0 + self.length=len(fields_lookuplist[0]) + for field in fields_lookuplist[1:]: + if self.length != len(field) : + print 'self.length = ',self.length + print 'len(field) = ', len(field) + print 'self._fields.keys() = ', self._fields.keys() + print 'field=',field + print 'fields_lookuplist=', fields_lookuplist + assert self.length==len(field) + self.valuesVStack=values_vstack + self.valuesHStack=values_hstack + + def __len__(self): + return self.length + + def dontuse__getitem__(self,i): + if type(i) in (slice,list): + return DataSetFields(MinibatchDataSet( + Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) + if type(i) is int: + return Example(self._fields.keys(),[field[i] for field in self._fields]) + if self.hasFields(i): + return self._fields[i] + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] + + def fieldNames(self): + return self._fields.keys() + + def hasFields(self,*fieldnames): + for fieldname in fieldnames: + if fieldname not in self._fields.keys(): + return False + return True + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + #@TODO bug somewhere here, fieldnames doesnt seem to be well handled + class Iterator(object): + def __init__(self,ds,fieldnames): + # tbm: added two next lines to handle fieldnames + if fieldnames is None: fieldnames = ds._fields.keys() + self.fieldnames = fieldnames + + self.ds=ds + self.next_example=offset + assert minibatch_size >= 0 + if offset+minibatch_size > ds.length: + raise NotImplementedError() + def __iter__(self): + return self + def next(self): + upper = self.next_example+minibatch_size + if upper > len(self.ds) : + raise StopIteration() + assert upper<=len(self.ds) # instead of self.ds.length + #minibatch = Example(self.ds._fields.keys(), + # [field[self.next_example:upper] + # for field in self.ds._fields]) + # tbm: modif to use fieldnames + values = [] + for f in self.fieldnames : + #print 'we have field',f,'in fieldnames' + values.append( self.ds._fields[f][self.next_example:upper] ) + minibatch = Example(self.fieldnames,values) + #print minibatch + self.next_example+=minibatch_size + return minibatch + + # tbm: added fieldnames to handle subset of fieldnames + return Iterator(self,fieldnames) + +class HStackedDataSet(DataSet): + """ + A L{DataSet} that wraps several datasets and shows a view that includes all their fields, + i.e. whose list of fields is the concatenation of their lists of fields. + + If a field name is found in more than one of the datasets, then either an error is + raised or the fields are renamed (either by prefixing the __name__ attribute + of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). + + @todo: automatically detect a chain of stacked datasets due to A | B | C | D ... + """ + def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): + DataSet.__init__(self,description,field_types) + self.datasets=datasets + self.accept_nonunique_names=accept_nonunique_names + self.fieldname2dataset={} + + def rename_field(fieldname,dataset,i): + if hasattr(dataset,"__name__"): + return dataset.__name__ + "." 
+ fieldname + return fieldname+"."+str(i) + + # make sure all datasets have the same length and unique field names + self.length=None + names_to_change=[] + for i in xrange(len(datasets)): + dataset = datasets[i] + length=len(dataset) + if self.length: + assert self.length==length + else: + self.length=length + for fieldname in dataset.fieldNames(): + if fieldname in self.fieldname2dataset: # name conflict! + if accept_nonunique_names: + fieldname=rename_field(fieldname,dataset,i) + names2change.append((fieldname,i)) + else: + raise ValueError("Incompatible datasets: non-unique field name = "+fieldname) + self.fieldname2dataset[fieldname]=i + for fieldname,i in names_to_change: + del self.fieldname2dataset[fieldname] + self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i + + def __len__(self): + return len(self.datasets[0]) + + def hasFields(self,*fieldnames): + for fieldname in fieldnames: + if not fieldname in self.fieldname2dataset: + return False + return True + + def fieldNames(self): + return self.fieldname2dataset.keys() + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + + class HStackedIterator(object): + def __init__(self,hsds,iterators): + self.hsds=hsds + self.iterators=iterators + def __iter__(self): + return self + def next(self): + # concatenate all the fields of the minibatches + l=Example() + for iter in self.iterators: + l.append_lookuplist(iter.next()) + return l + + assert self.hasFields(*fieldnames) + # find out which underlying datasets are necessary to service the required fields + # and construct corresponding minibatch iterators + if fieldnames and fieldnames!=self.fieldNames(): + datasets=set([]) + fields_in_dataset=dict([(dataset,[]) for dataset in datasets]) + for fieldname in fieldnames: + dataset=self.datasets[self.fieldname2dataset[fieldname]] + datasets.add(dataset) + fields_in_dataset[dataset].append(fieldname) + datasets=list(datasets) + iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset) + for dataset in datasets] + else: + datasets=self.datasets + iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets] + return HStackedIterator(self,iterators) + + + def untested_valuesVStack(self,fieldname,fieldvalues): + return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) + + def untested_valuesHStack(self,fieldnames,fieldvalues): + """ + We will use the sub-dataset associated with the first fieldname in the fieldnames list + to do the work, hoping that it can cope with the other values (i.e. won't care + about the incompatible fieldnames). Hence this heuristic will always work if + all the fieldnames are of the same sub-dataset. + """ + return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) + +class VStackedDataSet(DataSet): + """ + A L{DataSet} that wraps several datasets and shows a view that includes all their examples, + in the order provided. This clearly assumes that they all have the same field names + and all (except possibly the last one) are of finite length. + + @todo: automatically detect a chain of stacked datasets due to A + B + C + D ... 
+ """ + def __init__(self,datasets): + self.datasets=datasets + self.length=0 + self.index2dataset={} + assert len(datasets)>0 + fieldnames = datasets[-1].fieldNames() + self.datasets_start_row=[] + # We use this map from row index to dataset index for constant-time random access of examples, + # to avoid having to search for the appropriate dataset each time and slice is asked for. + for dataset,k in enumerate(datasets[0:-1]): + assert dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length). + L=len(dataset) + for i in xrange(L): + self.index2dataset[self.length+i]=k + self.datasets_start_row.append(self.length) + self.length+=L + assert dataset.fieldNames()==fieldnames + self.datasets_start_row.append(self.length) + self.length+=len(datasets[-1]) + # If length is very large, we should use a more memory-efficient mechanism + # that does not store all indices + if self.length>1000000: + # 1 million entries would require about 60 meg for the index2dataset map + # TODO + print "A more efficient mechanism for index2dataset should be implemented" + + def __len__(self): + return self.length + + def fieldNames(self): + return self.datasets[0].fieldNames() + + def hasFields(self,*fieldnames): + return self.datasets[0].hasFields(*fieldnames) + + def locate_row(self,row): + """Return (dataset_index, row_within_dataset) for global row number""" + dataset_index = self.index2dataset[row] + row_within_dataset = self.datasets_start_row[dataset_index] + return dataset_index, row_within_dataset + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + + class VStackedIterator(object): + def __init__(self,vsds): + self.vsds=vsds + self.next_row=offset + self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset) + self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[0],offset,n_batches) + + def next_iterator(self,dataset,starting_offset,batches_left): + L=len(dataset) + ds_nbatches = (L-starting_offset)/minibatch_size + if batches_left is not None: + ds_nbatches = max(batches_left,ds_nbatches) + if minibatch_size>L: + ds_minibatch_size=L + n_left_in_mb=minibatch_size-L + ds_nbatches=1 + else: + n_left_in_mb=0 + return dataset.minibatches(fieldnames,minibatch_size,ds_nbatches,starting_offset), \ + L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb + + def move_to_next_dataset(self): + if self.n_left_at_the_end_of_ds>0: + self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[self.next_dataset_index], + self.n_left_at_the_end_of_ds,1) + else: + self.next_dataset_index +=1 + if self.next_dataset_index==len(self.vsds.datasets): + self.next_dataset_index = 0 + self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[self.next_dataset_index],starting_offset,n_batches) + + def __iter__(self): + return self + + def next(self): + dataset=self.vsds.datasets[self.next_dataset_index] + mb = self.next_iterator.next() + if self.n_left_in_mb: + extra_mb = [] + while self.n_left_in_mb>0: + self.move_to_next_dataset() + extra_mb.append(self.next_iterator.next()) + mb = Example(fieldnames, + [dataset.valuesVStack(name, + [mb[name]]+[b[name] for b in extra_mb]) + for name in fieldnames]) + + self.next_row+=minibatch_size + self.next_dataset_row+=minibatch_size + if self.next_row+minibatch_size>len(dataset): + self.move_to_next_dataset() + return examples + return 
VStackedIterator(self) + +class ArrayFieldsDataSet(DataSet): + """ + Virtual super-class of datasets whose field values are numpy array, + thus defining valuesHStack and valuesVStack for sub-classes. + """ + def __init__(self,description=None,field_types=None): + DataSet.__init__(self,description,field_types) + def untested_valuesHStack(self,fieldnames,fieldvalues): + """Concatenate field values horizontally, e.g. two vectors + become a longer vector, two matrices become a wider matrix, etc.""" + return numpy.hstack(fieldvalues) + def untested_valuesVStack(self,fieldname,values): + """Concatenate field values vertically, e.g. two vectors + become a two-row matrix, two matrices become a longer matrix, etc.""" + return numpy.vstack(values) + + + +class NArraysDataSet(ArrayFieldsDataSet) : + """ + An NArraysDataSet stores fields that are numpy tensor, whose first axis + iterates over examples. It's a generalization of ArrayDataSet. + """ + #@TODO not completely implemented yet + def __init__(self, data_arrays, fieldnames, **kwargs) : + """ + Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list + of fieldnames. The number of arrays must be the same as the number of + fieldnames. Each set of numpy tensor must have the same first dimension (first + axis) corresponding to the number of examples. + + Every tensor is treated as a numpy array (using numpy.asarray) + """ + ArrayFieldsDataSet.__init__(self,**kwargs) + assert len(data_arrays) == len(fieldnames) + assert len(fieldnames) > 0 + ndarrays = [numpy.asarray(a) for a in data_arrays] + lens = [a.shape[0] for a in ndarrays] + num_examples = lens[0] #they must all be equal anyway + self._fieldnames = fieldnames + for k in ndarrays : + assert k.shape[0] == num_examples + self._datas = ndarrays + # create dict + self.map_field_idx = dict() + for k in range(len(fieldnames)): + self.map_field_idx[fieldnames[k]] = k + + + def __len__(self) : + """ + Length of the dataset is based on the first array = data_arrays[0], using its shape + """ + return self._datas[0].shape[0] + + def fieldNames(self) : + """ + Returns the fieldnames as set in self.__init__ + """ + return self._fieldnames + + def field_pos(self,fieldname) : + """ + Returns the index of a given fieldname. Fieldname must exists! see fieldNames(). + """ + return self.map_field_idx[fieldname] + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + cursor = Example(fieldnames,[0]*len(fieldnames)) + fieldnames = self.fieldNames() if fieldnames is None else fieldnames + for n in xrange(n_batches): + if offset == len(self): + break + for f in range(len(cursor._names)) : + idx = self.field_pos(cursor._names[f]) + sub_data = self._datas[idx][offset : offset+minibatch_size] + cursor._values[f] = sub_data + offset += len(sub_data) #can be less than minibatch_size at end + yield cursor + + #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) + + + + +class ArrayDataSet(ArrayFieldsDataSet): + """ + An ArrayDataSet stores the fields as groups of columns in a numpy tensor, + whose first axis iterates over examples, second axis determines fields. + If the underlying array is N-dimensional (has N axes), then the field + values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). + """ + + def __init__(self, data_array, fields_columns, **kwargs): + """ + Construct an ArrayDataSet from the underlying numpy array (data) and + a map (fields_columns) from fieldnames to field columns. 
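# Construction sketch for the NArraysDataSet class above (illustrative): one
# numpy tensor per field, all sharing the same first dimension (the example axis).
import numpy
x = numpy.random.rand(5, 3)
y = numpy.arange(5)
nds = NArraysDataSet([x, y], ['x', 'y'])
assert len(nds) == 5
assert nds.fieldNames() == ['x', 'y']
assert nds.field_pos('y') == 1
# An ArrayDataSet (see the constructor that follows) packs all fields as column
# groups of a single 2-D array instead, e.g.
# ArrayDataSet(numpy.random.rand(5, 4), {'x': slice(0, 3), 'y': [3]}).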
The columns of a field are specified + using the standard arguments for indexing/slicing: integer for a column index, + slice for an interval of columns (with possible stride), or iterable of column indices. + """ + ArrayFieldsDataSet.__init__(self, **kwargs) + self.data=data_array + self.fields_columns=fields_columns + + # check consistency and complete slices definitions + for fieldname, fieldcolumns in self.fields_columns.items(): + if type(fieldcolumns) is int: + assert fieldcolumns>=0 and fieldcolumns=0 and i=self.l: + raise StopIteration + sub_data = self.dataset.data[self.current] + self.minibatch._values = [sub_data[c] for c in self.columns] + + self.current+=1 + return self.minibatch + + return ArrayDataSetIteratorIter(self,self.fieldNames()) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + cursor = Example(fieldnames,[0]*len(fieldnames)) + fieldnames = self.fieldNames() if fieldnames is None else fieldnames + if n_batches == None: + n_batches = (len(self) - offset) / minibatch_size + for n in xrange(n_batches): + if offset == len(self): + break + sub_data = self.data[offset : offset+minibatch_size] + offset += len(sub_data) #can be less than minibatch_size at end + cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names] + yield cursor + + #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) + + +class CachedDataSet(DataSet): + """ + Wrap a L{DataSet} whose values are computationally expensive to obtain + (e.g. because they involve some computation, or disk access), + so that repeated accesses to the same example are done cheaply, + by caching every example value that has been accessed at least once. + + Optionally, for finite-length dataset, all the values can be computed + (and cached) upon construction of the CachedDataSet, rather at the + first access. + + @todo: when cache_all_upon_construction create mini-batches that are as + large as possible but not so large as to fill up memory. + + @todo: add disk-buffering capability, so that when the cache becomes too + big for memory, we cache things on disk, trying to keep in memory only + the record most likely to be accessed next. + """ + def __init__(self,source_dataset,cache_all_upon_construction=False): + self.source_dataset=source_dataset + self.cache_all_upon_construction=cache_all_upon_construction + self.cached_examples = [] + if cache_all_upon_construction: + # this potentially brings all the source examples + # into memory at once, which may be too much + # the work could possibly be done by minibatches + # that are as large as possible but no more than what memory allows. 
+ # + # field_values is supposed to be an DataSetFields, that inherits from LookupList + #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next() + fields_values = DataSetFields(source_dataset,None) + assert all([len(self)==len(field_values) for field_values in fields_values]) + for example in fields_values.examples(): + self.cached_examples.append(copy.copy(example)) + + self.fieldNames = source_dataset.fieldNames + self.hasFields = source_dataset.hasFields + self.valuesHStack = source_dataset.valuesHStack + self.valuesVStack = source_dataset.valuesVStack + + def __len__(self): + return len(self.source_dataset) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class CacheIterator(object): + def __init__(self,dataset): + self.dataset=dataset + self.current=offset + self.all_fields = self.dataset.fieldNames()==fieldnames + self.n_batches = n_batches + self.batch_counter = 0 + def __iter__(self): return self + def next(self): + self.batch_counter += 1 + if self.n_batches and self.batch_counter > self.n_batches : + raise StopIteration() + upper = self.current+minibatch_size + if upper > len(self.dataset.source_dataset): + raise StopIteration() + cache_len = len(self.dataset.cached_examples) + if upper>cache_len: # whole minibatch is not already in cache + # cache everything from current length to upper + #for example in self.dataset.source_dataset[cache_len:upper]: + for example in self.dataset.source_dataset.subset[cache_len:upper]: + self.dataset.cached_examples.append(example) + all_fields_minibatch = Example(self.dataset.fieldNames(), + zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size])) + + self.current+=minibatch_size + if self.all_fields: + return all_fields_minibatch + return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames]) + return CacheIterator(self) + + def dontuse__getitem__(self,i): + if type(i)==int and len(self.cached_examples)>i: + return self.cached_examples[i] + else: + return self.source_dataset[i] + + def __iter__(self): + class CacheIteratorIter(object): + def __init__(self,dataset): + self.dataset=dataset + self.l = len(dataset) + self.current = 0 + self.fieldnames = self.dataset.fieldNames() + self.example = Example(self.fieldnames,[0]*len(self.fieldnames)) + def __iter__(self): return self + def next(self): + if self.current>=self.l: + raise StopIteration + cache_len = len(self.dataset.cached_examples) + if self.current>=cache_len: # whole minibatch is not already in cache + # cache everything from current length to upper + self.dataset.cached_examples.append( + self.dataset.source_dataset[self.current]) + self.example._values = self.dataset.cached_examples[self.current] + self.current+=1 + return self.example + + return CacheIteratorIter(self) + +class ApplyFunctionDataSet(DataSet): + """ + A L{DataSet} that contains as fields the results of applying a + given function example-wise or minibatch-wise to all the fields of + an input dataset. The output of the function should be an iterable + (e.g. a list or a LookupList) over the resulting values. + + The function take as input the fields of the dataset, not the examples. + + In minibatch mode, the function is expected to work on minibatches + (takes a minibatch in input and returns a minibatch in output). More + precisely, it means that each element of the input or output list + should be iterable and indexable over the individual example values + (typically these elements will be numpy arrays). 
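# Sketch of minibatch-mode application described above (illustrative): `ds` is
# assumed to be a DataSet with a single field 'x' whose minibatch values are
# numpy arrays; the function receives one array per input field and returns
# one array-like per output field, of the same minibatch length.
import numpy
def center(x_batch):
    return (x_batch - x_batch.mean(axis=0),)      # one output field
centered = ApplyFunctionDataSet(ds, center, ['x_centered'])
# Wrapping in a CachedDataSet avoids recomputing the function on repeated access.
cached = CachedDataSet(centered)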
All of the elements + in the input and output lists should have the same length, which is + the length of the minibatch. + + The function is applied each time an example or a minibatch is accessed. + To avoid re-doing computation, wrap this dataset inside a CachedDataSet. + + If the values_{h,v}stack functions are not provided, then + the input_dataset.values{H,V}Stack functions are used by default. + + """ + + def __init__(self,input_dataset,function,output_names,minibatch_mode=True, + values_hstack=None,values_vstack=None, + description=None,fieldtypes=None): + """ + Constructor takes an input dataset that has as many fields as the function + expects as inputs. The resulting dataset has as many fields as the function + produces as outputs, and that should correspond to the number of output names + (provided in a list). + + Note that the expected semantics of the function differs in minibatch mode + (it takes minibatches of inputs and produces minibatches of outputs, as + documented in the class comment). + + TBM: are fieldtypes the old field types (from input_dataset) or the new ones + (for the new dataset created)? + """ + self.input_dataset=input_dataset + self.function=function + self.output_names=output_names + #print 'self.output_names in afds:', self.output_names + #print 'length in afds:', len(self.output_names) + self.minibatch_mode=minibatch_mode + DataSet.__init__(self,description,fieldtypes) + self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack + self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack + + def __len__(self): + return len(self.input_dataset) + + def fieldNames(self): + return self.output_names + + def minibatches_nowrap(self, fieldnames, *args, **kwargs): + all_input_fieldNames = self.input_dataset.fieldNames() + mbnw = self.input_dataset.minibatches_nowrap + + for input_fields in mbnw(all_input_fieldNames, *args, **kwargs): + if self.minibatch_mode: + all_output_fields = self.function(*input_fields) + else: + input_examples = zip(*input_fields) #makes so that [i] means example i + output_examples = [self.function(*input_example) + for input_example in input_examples] + all_output_fields = zip(*output_examples) + + #print 'output_names=', self.output_names + #print 'all_output_fields', all_output_fields + #print 'len(all_output_fields)=', len(all_output_fields) + all_outputs = Example(self.output_names, all_output_fields) + if fieldnames==self.output_names: + rval = all_outputs + else: + rval = Example(fieldnames,[all_outputs[name] for name in fieldnames]) + #print 'rval', rval + #print '--------' + yield rval + + def untested__iter__(self): # only implemented for increased efficiency + class ApplyFunctionSingleExampleIterator(object): + def __init__(self,output_dataset): + self.current=0 + self.output_dataset=output_dataset + self.input_iterator=output_dataset.input_dataset.__iter__() + def __iter__(self): return self + def next(self): + if self.output_dataset.minibatch_mode: + function_inputs = [[input] for input in self.input_iterator.next()] + outputs = self.output_dataset.function(*function_inputs) + assert all([hasattr(output,'__iter__') for output in outputs]) + function_outputs = [output[0] for output in outputs] + else: + function_inputs = self.input_iterator.next() + function_outputs = self.output_dataset.function(*function_inputs) + return Example(self.output_dataset.output_names,function_outputs) + return ApplyFunctionSingleExampleIterator(self) + +def 
supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): + """ + Wraps an arbitrary L{DataSet} into one for supervised learning tasks + by forcing the user to define a set of fields as the 'input' field + and a set of fields as the 'target' field. Optionally, a single + weight_field can also be defined. + """ + args = ((input_fields,'input'),(output_fields,'target')) + if weight_field: args+=(([weight_field],'weight')) + return src_dataset.merge_fields(*args) + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/old_dataset/learner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/old_dataset/learner.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,135 @@ + + +from exceptions import * +from dataset import AttributesHolder + +class OfflineLearningAlgorithm(object): + """ + Base class for offline learning algorithms, provides an interface + that allows various algorithms to be applicable to generic learning + algorithms. It is only given here to define the expected semantics. + + An offline learning algorithm can be seen as a function that when + applied to training data returns a learned function (which is an object that + can be applied to other data and return some output data). + + The offline learning scenario is the standard and most common one + in machine learning: an offline learning algorithm is applied + to a training dataset, + + model = learning_algorithm(training_set) + + resulting in a fully trained model that can be applied to another dataset + in order to perform some desired computation: + + output_dataset = model(input_dataset) + + Note that the application of a dataset has no side-effect on the model. + In that example, the training set may for example have 'input' and 'target' + fields while the input dataset may have only 'input' (or both 'input' and + 'target') and the output dataset would contain some default output fields defined + by the learning algorithm (e.g. 'output' and 'error'). The user may specifiy + what the output dataset should contain either by setting options in the + model, by the presence of particular fields in the input dataset, or with + keyword options of the __call__ method of the model (see LearnedModel.__call__). + + """ + + def __init__(self): pass + + def __call__(self, training_dataset): + """ + Return a fully trained TrainedModel. + """ + raise AbstractFunction() + +class TrainedModel(AttributesHolder): + """ + TrainedModel is a base class for models returned by instances of an + OfflineLearningAlgorithm subclass. It is only given here to define the expected semantics. + """ + def __init__(self): + pass + + def __call__(self,input_dataset,output_fieldnames=None, + test_stats_collector=None,copy_inputs=False, + put_stats_in_output_dataset=True, + output_attributes=[]): + """ + A L{TrainedModel} can be used with + with one or more calls to it. The main argument is an input L{DataSet} (possibly + containing a single example) and the result is an output L{DataSet} of the same length. + If output_fieldnames is specified, it may be use to indicate which fields should + be constructed in the output L{DataSet} (for example ['output','classification_error']). + Otherwise, some default output fields are produced (possibly depending on the input + fields available in the input_dataset). + Optionally, if copy_inputs, the input fields (of the input_dataset) can be made + visible in the output L{DataSet} returned by this method. 
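# Usage sketch of the train/apply protocol described above (illustrative;
# `algo` is an instance of some OfflineLearningAlgorithm subclass, and
# training_set / test_set are hypothetical DataSets with the fields it expects).
model = algo(training_set)                                    # offline training
predictions = model(test_set, output_fieldnames=['output'])   # apply the trained model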
+ Optionally, attributes of the learner can be copied in the output dataset, + and statistics computed by the stats collector also put in the output dataset. + Note the distinction between fields (which are example-wise quantities, e.g. 'input') + and attributes (which are not, e.g. 'regularization_term'). + """ + raise AbstractFunction() + + +class OnlineLearningAlgorithm(object): + """ + Base class for online learning algorithms, provides an interface + that allows various algorithms to be applicable to generic online learning + algorithms. It is only given here to define the expected semantics. + + The basic setting is that the training data are only revealed in pieces + (maybe one example or a batch of example at a time): + + model = learning_algorithm() + + results in a fresh model. The model can be adapted by presenting + it with some training data, + + model.update(some_training_data) + ... + model.update(some_more_training_data) + ... + model.update(yet_more_training_data) + + and at any point one can use the model to perform some computation: + + output_dataset = model(input_dataset) + + The model should be a LearnerModel subclass instance, and LearnerModel + is a subclass of LearnedModel. + + """ + + def __init__(self): pass + + def __call__(self, training_dataset=None): + """ + Return a LearnerModel, either fresh (if training_dataset is None) or fully trained (otherwise). + """ + raise AbstractFunction() + +class LearnerModel(TrainedModel): + """ + LearnerModel is a base class for models returned by instances of a LearningAlgorithm subclass. + It is only given here to define the expected semantics. + """ + def __init__(self): + pass + + def update(self,training_set,train_stats_collector=None): + """ + Continue training a learner model, with the evidence provided by the given training set. + Hence update can be called multiple times. This is the main method used for training in the + on-line setting or the sequential (Bayesian or not) settings. + + This function has as side effect that self(data) will behave differently, + according to the adaptation achieved by update(). + + The user may optionally provide a training L{StatsCollector} that is used to record + some statistics of the outputs computed during training. It is update(d) during + training. + """ + raise AbstractFunction() + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/old_dataset/lookup_list.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/old_dataset/lookup_list.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,134 @@ + +from copy import deepcopy + +class LookupList(object): + """ + A LookupList is a sequence whose elements can be named (and unlike + a dictionary the order of the elements depends not on their key but + on the order given by the user through construction) so that + following syntactic constructions work as one would expect:: + >>> example = LookupList(['x','y','z'],[1,2,3]) + >>> example['x'] = [1, 2, 3] # set or change a field + >>> print example('z','y') # prints [3,2] + >>> x, y, z = example + >>> x = example[0] + >>> x = example["x"] + >>> print example.keys() # prints ['x','y','z'] + >>> print example.values() # prints [[1,2,3],2,3] + >>> print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)] + >>> example.append_keyval('u',0) # adds item with name 'u' and value 0 + >>> print len(example) # number of items = 4 here + >>> example2 = LookupList(['v', 'w'], ['a','b']) + >>> print example+example2 # addition is like for lists, a concatenation of the items. 
+ >>> example + example # throw an error as we can't have duplicate name. + + @note: The element names should be unique. + + @todo: Convert this documentation into doctest + format, and actually perform doctest'ing: + U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks} + """ + def __init__(self,names=[],values=[]): + #print 'values=', values + #print 'length=', len(values) + #print 'names=', names + #print 'length=',len(names) + assert len(values)==len(names) + self.__dict__['_values']=values + self.__dict__['_name2index']={} + self.__dict__['_names']=names + for i in xrange(len(values)): + assert names[i] not in self._name2index + self._name2index[names[i]]=i + + def keys(self): + return self._names + + def values(self): + return self._values + + def items(self): + """ + Return a list of (name,value) pairs of all the items in the look-up list. + """ + return zip(self._names,self._values) + + def __getitem__(self,key): + """ + The key in example[key] can either be an integer to index the fields + or the name of the field. + """ + if isinstance(key,int) or isinstance(key,slice) or (isinstance(key,list) and all([isinstance(i,int) for i in key])): + return self._values[key] + else: # if not an int, key must be a name + # expecting key to be a valid field name + assert isinstance(key,str) + return self._values[self._name2index[key]] + + def __setitem__(self,key,value): + if isinstance(key,int): + self._values[key]=value + else: # if not an int, key must be a name + if key in self._name2index: + self._values[self._name2index[key]]=value + else: + self.append_keyval(key,value) + + def append_keyval(self, key, value): + assert key not in self._name2index + self._name2index[key]=len(self) + self._values.append(value) + self._names.append(key) + + def append_lookuplist(self, *list): + for l in list: + for key in l.keys(): + self.append_keyval(key,l[key]) + del l + + def __len__(self): + return len(self._values) + + def __repr__(self): + return "{%s}" % ", ".join([str(k) + "=" + repr(v) for k,v in self.items()]) + + def __add__(self,rhs): + new_example = deepcopy(self) + for item in rhs.items(): + new_example.append_keyval(item[0],item[1]) + return new_example + + def __radd__(self,lhs): + new_example = deepcopy(lhs) + for item in self.items(): + new_example.append_keyval(item[0],item[1]) + return new_example + + def __eq__(self, other): + return self._values==other._values and self._name2index==other._name2index and self._names==other._names + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + raise NotImplementedError() + + def __call__(self,*names): + """ + Return a list of values associated with the given names (which must all be keys of the lookup list). + """ + if names == self._names: + return self._values + return [self[name] for name in names] + + +if __name__ == '__main__': + + a=LookupList(['a'],[1]) + print a + b=LookupList(['b'],[2]) + print b + a.append_lookuplist(b) + print a + a.append_lookuplist(b) + print a diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/README.txt Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,1 @@ +Stuff in the sandbox may be very broken and/or in flux. 
diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/_test_random_transformation.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/_test_random_transformation.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,84 @@ +from random_transformation import row_random_transformation + +import unittest +from theano import compile +from theano import gradient + +from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result +from theano.sparse import _mtypes, _mtype_to_str +from theano.sparse import as_sparse + +from theano.tensor import as_tensor +from theano.scalar import as_scalar + +import random +import numpy.random + +class T_RowRandomTransformation(unittest.TestCase): + def setUp(self): + random.seed(44) + numpy.random.seed(44) + + def test_basic(self): + rows = 4 + cols = 20 + fakeseed = 0 + length = 3 + md = numpy.random.rand(rows, cols) + for mtype in _mtypes: + m = as_sparse(mtype(md)) + o = row_random_transformation(m, length, initial_seed=fakeseed) + y = compile.eval_outputs([o]) + expected = "[[ 0.88239119 1.03244463 -1.29297503]\n [ 0.02644961 1.50119695 -0.025081 ]\n [-0.60741013 1.25424625 0.30119422]\n [-1.08659967 -0.35531544 -1.38915467]]" + self.failUnless(str(y) == expected) + + def test_length(self): + """ Test that if length is increased, we obtain the same results + (except longer). """ + + for i in range(10): + mtype = random.choice(_mtypes) + rows = random.randint(1, 20) + cols = random.randint(1, 20) + fakeseed = random.randint(0, 100) + length = random.randint(1, 10) + extralength = random.randint(1, 10) + + m = as_sparse(mtype(numpy.random.rand(rows, cols))) + o1 = row_random_transformation(m, length, initial_seed=fakeseed) + o2 = row_random_transformation(m, length + extralength, initial_seed=fakeseed) + + y1 = compile.eval_outputs([o1]) + y2 = compile.eval_outputs([o2]) + + self.failUnless((y1 == y2[:,:length]).all()) + + def test_permute(self): + """ Test that if the order of the rows is permuted, we obtain the same results. """ + for i in range(10): + mtype = random.choice(_mtypes) + rows = random.randint(2, 20) + cols = random.randint(1, 20) + fakeseed = random.randint(0, 100) + length = random.randint(1, 10) + + permute = numpy.random.permutation(rows) + + + m1 = numpy.random.rand(rows, cols) + m2 = m1[permute] + for r in range(rows): + self.failUnless((m2[r] == m1[permute[r]]).all()) + s1 = as_sparse(mtype(m1)) + s2 = as_sparse(mtype(m2)) + o1 = row_random_transformation(s1, length, initial_seed=fakeseed) + o2 = row_random_transformation(s2, length, initial_seed=fakeseed) + y1 = compile.eval_outputs([o1]) + y2 = compile.eval_outputs([o2]) + + self.failUnless(y1.shape == y2.shape) + for r in range(rows): + self.failUnless((y2[r] == y1[permute[r]]).all()) + +if __name__ == '__main__': + unittest.main() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/denoising_aa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/denoising_aa.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,224 @@ +""" +A denoising auto-encoder + +@warning: You should use this interface. It is not complete and is not functional. 
+Instead, use:: + ssh://projects@lgcm.iro.umontreal.ca/repos/denoising_aa +""" + +import theano +from theano.formula import * +from learner import * +from theano import tensor as t +from nnet_ops import * +import math +from misc import * +from misc_theano import * +from theano.tensor_random import binomial + +def hiding_corruption_formula(seed,average_fraction_hidden): + """ + Return a formula for the corruption process, in which a random + subset of the input numbers are hidden (mapped to 0). + + @param seed: seed of the random generator + @type seed: anything that numpy.random.RandomState accepts + + @param average_fraction_hidden: the probability with which each + input number is hidden (set to 0). + @type average_fraction_hidden: 0 <= real number <= 1 + """ + class HidingCorruptionFormula(Formulas): + x = t.matrix() + corrupted_x = x * binomial(seed,x,1,fraction_sampled) + + return HidingCorruptionFormula() + +def squash_affine_formula(squash_function=sigmoid): + """ + Simply does: squash_function(b + xW) + By convention prefix the parameters by _ + """ + class SquashAffineFormula(Formulas): + x = t.matrix() # of dimensions minibatch_size x n_inputs + _b = t.row() # of dimensions 1 x n_outputs + _W = t.matrix() # of dimensions n_inputs x n_outputs + a = _b + t.dot(x,_W) # of dimensions minibatch_size x n_outputs + y = squash_function(a) + return SquashAffineFormula() + +def gradient_descent_update_formula(): + class GradientDescentUpdateFormula(Formula): + param = t.matrix() + learning_rate = t.scalar() + cost = t.column() # cost of each example in a minibatch + param_update = t.add_inplace(param, -learning_rate*t.sgrad(cost)) + return gradient_descent_update_formula() + +def probabilistic_classifier_loss_formula(): + class ProbabilisticClassifierLossFormula(Formulas): + a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output + target_class = t.ivector() # dimension (minibatch_size) + nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py + return ProbabilisticClassifierLossFormula() + +def binomial_cross_entropy_formula(): + class BinomialCrossEntropyFormula(Formulas): + a = t.matrix() # pre-sigmoid activations, minibatch_size x dim + p = sigmoid(a) # model prediction + q = t.matrix() # target binomial probabilities, minibatch_size x dim + # using the identity softplus(a) - softplus(-a) = a, + # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a) + nll = -t.sum(q*a - softplus(-a)) + # next line was missing... hope it's all correct above + return BinomialCrossEntropyFormula() + +def squash_affine_autoencoder_formula(hidden_squash=t.tanh, + reconstruction_squash=sigmoid, + share_weights=True, + reconstruction_nll_formula=binomial_cross_entropy_formula(), + update_formula=gradient_descent_update_formula): + if share_weights: + autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a') + \ + squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c') + \ + reconstruction_nll_formula + else: + autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a',_W='_W1') + \ + squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c',_W='_W2') + \ + reconstruction_nll_formula + autoencoder = autoencoder + [update_formula().rename(cost = 'nll', + param = p) + for p in autoencoder.get_all('_.*')] + return autoencoder + + +# @todo: try other corruption formulae. The above is the default one. +# not quite used in the ICML paper... 
(had a fixed number of 0s). + +class DenoisingAutoEncoder(LearningAlgorithm): + + def __init__(self,n_inputs,n_hidden_per_layer, + learning_rate=0.1, + max_n_epochs=100, + L1_regularizer=0, + init_range=1., + corruption_formula = hiding_corruption_formula(), + autoencoder = squash_affine_autoencoder_formula(), + minibatch_size=None,linker = "c|py"): + for name,val in locals().items(): + if val is not self: self.__setattribute__(name,val) + self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x') + + def __call__(self, training_set=None): + """ Allocate and optionnaly train a model + + @TODO enables passing in training and valid sets, instead of cutting one set in 80/20 + """ + model = DenoisingAutoEncoderModel(self) + if training_set: + print 'DenoisingAutoEncoder(): what do I do if training_set????' + # copied from old mlp_factory_approach: + if len(trainset) == sys.maxint: + raise NotImplementedError('Learning from infinite streams is not supported') + nval = int(self.validation_portion * len(trainset)) + nmin = len(trainset) - nval + assert nmin >= 0 + minset = trainset[:nmin] #real training set for minimizing loss + valset = trainset[nmin:] #validation set for early stopping + best = model + for stp in self.early_stopper(): + model.update( + minset.minibatches([input, target], minibatch_size=min(32, + len(trainset)))) + #print 'mlp.__call__(), we did an update' + if stp.set_score: + stp.score = model(valset, ['loss_01']) + if (stp.score < stp.best_score): + best = copy.copy(model) + model = best + # end of the copy from mlp_factory_approach + + return model + + + def compile(self, inputs, outputs): + return theano.function(inputs,outputs,unpack_single=False,linker=self.linker) + +class DenoisingAutoEncoderModel(LearnerModel): + def __init__(self,learning_algorithm,params): + self.learning_algorithm=learning_algorithm + self.params=params + v = learning_algorithm.v + self.update_fn = learning_algorithm.compile(learning_algorithm.denoising_autoencoder_formula.inputs, + learning_algorithm.denoising_autoencoder_formula.outputs) + + def update(self, training_set, train_stats_collector=None): + + print 'dont update you crazy frog!' 
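# Plain-numpy sketch of the hiding corruption described by
# hiding_corruption_formula above (illustrative, outside of Theano): each input
# value is zeroed independently with probability average_fraction_hidden.
import numpy
def hide_inputs(x, average_fraction_hidden, rng=numpy.random):
    keep = rng.binomial(1, 1.0 - average_fraction_hidden, size=x.shape)
    return x * keep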
+ +# old stuff + +# self._learning_rate = t.scalar('learning_rate') # this is the symbol +# self.L1_regularizer = L1_regularizer +# self._L1_regularizer = t.scalar('L1_regularizer') +# self._input = t.matrix('input') # n_examples x n_inputs +# self._W = t.matrix('W') +# self._b = t.row('b') +# self._c = t.row('b') +# self._regularization_term = self._L1_regularizer * t.sum(t.abs(self._W)) +# self._corrupted_input = corruption_process(self._input) +# self._hidden = t.tanh(self._b + t.dot(self._input, self._W.T)) +# self._reconstruction_activations =self._c+t.dot(self._hidden,self._W) +# self._nll,self._output = crossentropy_softmax_1hot(Print("output_activations")(self._output_activations),self._target_vector) +# self._output_class = t.argmax(self._output,1) +# self._class_error = t.neq(self._output_class,self._target_vector) +# self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0] +# OnlineGradientTLearner.__init__(self) + +# def attributeNames(self): +# return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"] + +# def parameterAttributes(self): +# return ["b1","W1", "b2", "W2"] + +# def updateMinibatchInputFields(self): +# return ["input","target"] + +# def updateEndOutputAttributes(self): +# return ["regularization_term"] + +# def lossAttribute(self): +# return "minibatch_criterion" + +# def defaultOutputFields(self, input_fields): +# output_fields = ["output", "output_class",] +# if "target" in input_fields: +# output_fields += ["class_error", "nll"] +# return output_fields + +# def allocate(self,minibatch): +# minibatch_n_inputs = minibatch["input"].shape[1] +# if not self._n_inputs: +# self._n_inputs = minibatch_n_inputs +# self.b1 = numpy.zeros((1,self._n_hidden)) +# self.b2 = numpy.zeros((1,self._n_outputs)) +# self.forget() +# elif self._n_inputs!=minibatch_n_inputs: +# # if the input changes dimension on the fly, we resize and forget everything +# self.forget() + +# def forget(self): +# if self._n_inputs: +# r = self._init_range/math.sqrt(self._n_inputs) +# self.W1 = numpy.random.uniform(low=-r,high=r, +# size=(self._n_hidden,self._n_inputs)) +# r = self._init_range/math.sqrt(self._n_hidden) +# self.W2 = numpy.random.uniform(low=-r,high=r, +# size=(self._n_outputs,self._n_hidden)) +# self.b1[:]=0 +# self.b2[:]=0 +# self._n_epochs=0 + +# def isLastEpoch(self): +# self._n_epochs +=1 +# return self._n_epochs>=self._max_n_epochs diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/gradient_learner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/gradient_learner.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,71 @@ + +from learner import * +from tensor import * +import gradient +from compile import Function + +class GradientLearner(Learner): + """ + Base class for gradient-based optimization of a training criterion + that can consist in two parts, an additive part over examples, and + an example-independent part (usually called the regularizer). + The user provides a Theano formula that maps the fields of a minibatch (each being a tensor with the + same number of rows = minibatch size) and parameters to output fields (for the use function), one of which + must be a cost that is the training criterion to be minimized. Subclasses implement + a training strategy that uses the Theano formula to compute gradients and + to compute outputs in the update method. + The inputs, parameters, and outputs are lists of Theano tensors, + while the example_wise_cost and regularization_term are Theano tensors. 
+ The user can specify a regularization coefficient that multiplies the regularization term. + The training algorithm looks for parameters that minimize + regularization_coefficient * regularization_term(parameters) + + sum_{inputs in training_set} example_wise_cost(inputs,parameters) + i.e. the regularization_term should not depend on the inputs, only on the parameters. + The learned function can map a subset of inputs to a subset of outputs (as long as the inputs subset + includes all the inputs required in the Theano expression for the selected outputs). + It is assumed that all the inputs are provided in the training set (as dataset fields + with the corresponding name), but not necessarily when using the learned function. + """ + def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), + regularization_coefficient = astensor(1.0)): + self.inputs = inputs + self.outputs = outputs + self.parameters = parameters + self.example_wise_cost = example_wise_cost + self.regularization_term = regularization_term + self.regularization_coefficient = regularization_coefficient + self.parameters_example_wise_gradient = gradient.grad(example_wise_cost, parameters) + self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization_term, parameters) + if example_wise_cost not in outputs: + outputs.append(example_wise_cost) + if regularization_term not in outputs: + outputs.append(regularization_term) + self.example_wise_gradient_fn = Function(inputs + parameters, + [self.parameters_example_wise_gradient + self.parameters_regularization_gradient]) + self.use_functions = {frozenset([input.name for input in inputs]+[output.name for output in outputs]) + : Function(inputs, outputs)} + + def use(self,input_dataset,output_fields=None,copy_inputs=True): + # obtain the function that maps the desired inputs to desired outputs + input_fields = input_dataset.fieldNames() + # map names of input fields to Theano tensors in self.inputs + input_variables = ??? + if output_fields is None: output_fields = [output.name for output in outputs] + # handle special case of inputs that are directly copied into outputs + # map names of output fields to Theano tensors in self.outputs + output_variables = ??? + use_function_key = input_fields+output_fields + if not self.use_functions.has_key(use_function_key): + self.use_function[use_function_key]=Function(input_variables,output_variables) + use_function = self.use_functions[use_function_key] + # return a dataset that computes the outputs + return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,compute_now=True) + + +class StochasticGradientDescent(object): + def update_parameters(self): + +class StochasticGradientLearner(GradientLearner,StochasticGradientDescent): + def __init__(self,inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), + regularization_coefficient = astensor(1.0),) + def update() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/image_tools.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/image_tools.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,39 @@ + +import numpy + + +def make_weights_image(mat, xres, yres, i, j, nrow, ncol): + """ + Displays the filters implemented by a weight matrix. + + Each filter corresponds to a row of mat and will be represented + by a xres*yres image. + + Units from i to j will be included in the picture. 
+ + The picture will have nrow rows of filters and ncol columns + of filters. Unused spots for filters will be filled with zeros. + + The return value is a matrix suitable for display with + matplotlib's imshow. + """ + + assert j > i + n = j - i + result = numpy.zeros((ncol * xres, nrow * yres)) + submat = mat[i:j] + for k, row in enumerate(submat): + x = (k % ncol)*xres + y = (k / ncol)*yres + entry = row.reshape((xres, yres)) + lmin, lmax = numpy.min(entry), numpy.max(entry) + ldiff = lmax - lmin + #entry = (entry - lmin) / ldiff + result[x:x + xres, y:y + yres] = entry + return result.T + + + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/random_transformation.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/random_transformation.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,132 @@ +""" +New L{Op}s that aren't in core theano +""" + +from theano import sparse +from theano import tensor +from theano import scalar +from theano.gof import op + +from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result + +import scipy.sparse + +import numpy + +class RowRandomTransformation(op.Op): + """ + Given C{x}, a (sparse) matrix with shape (exmpls, dimensions), we + multiply it by a deterministic random matrix of shape (dimensions, + length) to obtain random transformation output of shape (exmpls, + length). + + Each element of the deterministic random matrix is selected uniformly + from [-1, +1). + @todo: Use another random distribution? + + @note: This function should be written such that if length is + increased, we obtain the same results (except longer). Similarly, + the rows should be able to be permuted and get the same result in + the same fashion. + + @todo: This may be slow? + @todo: Rewrite for dense matrices too? + @todo: Is there any way to verify the convention that each row is + an example? Should I rename the variables in the code to make the + semantics more explicit? + @todo: AUTOTEST: Autotest that dense and spare versions of this are identical. + @todo: Rename? Is Row the correct name? Maybe column-wise? + + @type x: L{scipy.sparse.spmatrix} + @param x: Sparse matrix to be randomly transformed with shape (exmpls, dimensions) + @type length: int + @param length: The number of transformations of C{x} to be performed. + @param initial_seed: Initial seed for the RNG. + @rtype: L{numpy.ndarray} + @return: Array with C{length} random transformations, with shape (exmpls, length) + """ + + import random + """ + RNG used for random transformations. + Does not share state with rest of program. + @todo: Make STATIC and private. Ask James or Olivier how to make this more Pythonic. + """ + _trng = random.Random() + + def __init__(self, x, length, initial_seed=0, **kwargs): + """ + @todo: Which broadcastable values should I use? + """ + assert 0 # Needs to be updated to Olivier's new Op creation approach + op.Op.__init__(self, **kwargs) + x = sparse.as_sparse(x) + self.initial_seed = initial_seed + self.length = length + self.inputs = [x] + self.outputs = [tensor.Tensor(x.dtype, broadcastable=[False, False])] +# self.outputs = [tensor.Tensor(x.dtype, broadcastable=[True, True])] + + def _random_matrix_value(self, row, col, rows): + """ + From a deterministic random matrix, find one element. + @param row: The row of the element to be read. + @param col: The column of the element to be read. + @param row: The number of rows in the matrix. 
+ @type row: int + @type col: int + @type rows: int + @note: This function is designed such that if we extend + the number of columns in the random matrix, the values of + the earlier entries is unchanged. + @todo: Make this static + """ + # Choose the random entry at (l, c) + rngidx = col * rows + row + # Set the random number state for this random entry + # Note: This may be slow + self._trng.seed(rngidx + self.initial_seed) + + # Determine the value for this entry + val = self._trng.uniform(-1, +1) +# print "Exmpl #%d, dimension #%d => Random projection #%d has idx %d (+ seed %d) and value %f" % (r, c, j, rngidx, self.initial_seed, val) + return val + + def impl(self, xorig): + assert _is_sparse(xorig) + assert len(xorig.shape) == 2 + # Since conversions to and from the COO format are quite fast, you + # can use this approach to efficiently implement lots computations + # on sparse matrices. + x = xorig.tocoo() + (rows, cols) = x.shape + tot = rows * cols + out = numpy.zeros((rows, self.length)) +# print "l = %d" % self.length +# print "x.getnnz() = %d" % x.getnnz() + all = zip(x.col, x.row, x.data) + all.sort() # TODO: Maybe this is very slow? + lastc = None + lastl = None + lastval = None + for l in range(self.length): + for (c, r, data) in all: + assert c < cols + assert r < rows + if not c == lastc or not l == lastl: + lastc = c + lastl = l + lastval = self._random_matrix_value(c, l, cols) + val = lastval +# val = self._random_matrix_value(c, l, cols) +# val = self._trng.uniform(-1, +1) +# val = 1.0 + out[r][l] += val * data + return out + def __copy__(self): + return self.__class__(self.inputs[0], self.length, self.initial_seed) + def clone_with_new_inputs(self, *new_inputs): + return self.__class__(new_inputs[0], self.length, self.initial_seed) + def desc(self, *new_inputs): + return (self.__class__, self.length, self.initial_seed) +row_random_transformation = RowRandomTransformation() diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/rbm/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/rbm/README.txt Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,4 @@ +An RBM with binomial units trained with CD-1. +by Joseph Turian + +This seems to work fine. diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/rbm/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/rbm/main.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/rbm/main.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,26 @@ +#!/usr/bin/python +""" +Simple SGD RBM training. +(An example of how to use the model.) +""" + + +import numpy + +nonzero_instances = [] +#nonzero_instances.append({0: 1, 1: 1}) +#nonzero_instances.append({0: 1, 2: 1}) + +nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) +nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) +nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) + +import model +model = model.Model(input_dimension=10, hidden_dimension=6) + +for i in xrange(100000): + # Select an instance + instance = nonzero_instances[i % len(nonzero_instances)] + + # SGD update over instance + model.update([instance]) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/rbm/model.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/rbm/model.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,139 @@ +""" +The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason +Weston's sampling trick (2008). 
+""" + +import parameters + +import numpy +from numpy import dot +import random + +import pylearn.nnet_ops +import pylearn.sparse_instance + +def sigmoid(v): + """ + @todo: Move to pylearn.more_numpy + @todo: Fix to avoid floating point overflow. + """ +# if x < -30.0: return 0.0 +# if x > 30.0: return 1.0 + return 1.0 / (1.0 + numpy.exp(-v)) + +def sample(v): + """ + @todo: Move to pylearn.more_numpy + """ + assert len(v.shape) == 2 + x = numpy.zeros(v.shape) + for j in range(v.shape[0]): + for i in range(v.shape[1]): + assert v[j][i] >= 0 and v[j][i] <= 1 + if random.random() < v[j][i]: x[j][i] = 1 + else: x[j][i] = 0 + return x + +def crossentropy(output, target): + """ + Compute the crossentropy of binary output wrt binary target. + @note: We do not sum, crossentropy is computed by component. + @todo: Rewrite as a scalar, and then broadcast to tensor. + @todo: Move to pylearn.more_numpy + @todo: Fix to avoid floating point overflow. + """ + return -(target * numpy.log(output) + (1 - target) * numpy.log(1 - output)) + + +class Model: + """ + @todo: input dimensions should be stored here! not as a global. + """ + def __init__(self, input_dimension, hidden_dimension, learning_rate = 0.1, momentum = 0.9, weight_decay = 0.0002, random_seed = 666): + self.input_dimension = input_dimension + self.hidden_dimension = hidden_dimension + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.random_seed = random_seed + + random.seed(random_seed) + + self.parameters = parameters.Parameters(input_dimension=self.input_dimension, hidden_dimension=self.hidden_dimension, randomly_initialize=True, random_seed=self.random_seed) + self.prev_dw = 0 + self.prev_db = 0 + self.prev_dc = 0 + + def deterministic_reconstruction(self, v0): + """ + One up-down cycle, but a mean-field approximation (no sampling). + """ + q = sigmoid(self.parameters.b + dot(v0, self.parameters.w)) + p = sigmoid(self.parameters.c + dot(q, self.parameters.w.T)) + return p + + def deterministic_reconstruction_error(self, v0): + """ + @note: According to Yoshua, -log P(V1 = v0 | tilde(h)(v0)). + """ + return crossentropy(self.deterministic_reconstruction(v0), v0) + + def update(self, instances): + """ + Update the L{Model} using one training instance. + @param instance: A dict from feature index to (non-zero) value. + @todo: Should assert that nonzero_indices and zero_indices + are correct (i.e. are truly nonzero/zero). + @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as done in Semantic Hashing? + @todo: Decay the biases too? 
+ """ + minibatch = len(instances) + v0 = pylearn.sparse_instance.to_vector(instances, self.input_dimension) + print "old XENT per instance:", numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch + q0 = sigmoid(self.parameters.b + dot(v0, self.parameters.w)) + h0 = sample(q0) + p0 = sigmoid(self.parameters.c + dot(h0, self.parameters.w.T)) + v1 = sample(p0) + q1 = sigmoid(self.parameters.b + dot(v1, self.parameters.w)) + + dw = self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch + self.momentum * self.prev_dw + db = self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch + self.momentum * self.prev_db + dc = self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch + self.momentum * self.prev_dc + + self.parameters.w *= (1 - self.weight_decay) + + self.parameters.w += dw + self.parameters.b += db + self.parameters.c += dc + + self.last_dw = dw + self.last_db = db + self.last_dc = dc + + print "new XENT per instance:", numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch + +# print +# print "v[0]:", v0 +# print "Q(h[0][i] = 1 | v[0]):", q0 +# print "h[0]:", h0 +# print "P(v[1][j] = 1 | h[0]):", p0 +# print "XENT(P(v[1][j] = 1 | h[0]) | v0):", numpy.sum(crossentropy(p0, v0)) +# print "v[1]:", v1 +# print "Q(h[1][i] = 1 | v[1]):", q1 +# +# print +# print v0.T.shape +# print h0.shape +# print dot(v0.T, h0).shape +# print self.parameters.w.shape +# self.parameters.w += self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch +# print +# print h0.shape +# print q1.shape +# print self.parameters.b.shape +# self.parameters.b += self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch +# print v0.shape, v1.shape +# print +# print self.parameters.c.shape +# self.parameters.c += self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch +# print self.parameters diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/rbm/parameters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/rbm/parameters.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,32 @@ +""" +Parameters (weights) used by the L{Model}. +""" + +import numpy + +class Parameters: + """ + Parameters used by the L{Model}. + """ + def __init__(self, input_dimension, hidden_dimension, randomly_initialize, random_seed): + """ + Initialize L{Model} parameters. + @param randomly_initialize: If True, then randomly initialize + according to the given random_seed. If False, then just use zeroes. + """ + if randomly_initialize: + numpy.random.random_seed(random_seed) + self.w = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension + self.b = numpy.zeros((1, hidden_dimension)) + self.c = numpy.zeros((1, input_dimension)) + else: + self.w = numpy.zeros((input_dimension, hidden_dimension)) + self.b = numpy.zeros((1, hidden_dimension)) + self.c = numpy.zeros((1, input_dimension)) + + def __str__(self): + s = "" + s += "w: %s\n" % self.w + s += "b: %s\n" % self.b + s += "c: %s\n" % self.c + return s diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/simple_autoassociator/README.txt Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,5 @@ +This seems to work. + +@todo: + * Add momentum. + * Add learning rate decay schedule. 
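# --- Editor's sketch (not part of the patch) ---------------------------------
# Two small notes on rbm/model.py and rbm/parameters.py above: update() reads
# its momentum terms from self.prev_dw/prev_db/prev_dc but stores the new
# deltas in self.last_dw/last_db/last_dc, so momentum never takes effect after
# the first step; and numpy.random.random_seed(random_seed) in parameters.py is
# not a numpy function -- numpy.random.seed(random_seed) is presumably intended.
# The update itself is CD-1: positive statistics from (v0, h0), negative
# statistics from the one-step reconstruction (v1, q1).  A condensed numpy
# restatement of that rule with the momentum bookkeeping fixed; the helper
# names are local to this sketch:

import numpy

def _sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

def _sample(p):
    return (numpy.random.rand(*p.shape) < p).astype(p.dtype)

def cd1_step(w, b, c, v0, lr=0.1, momentum=0.9, weight_decay=0.0002,
             prev=(0.0, 0.0, 0.0)):
    n = v0.shape[0]
    q0 = _sigmoid(b + numpy.dot(v0, w))        # P(h = 1 | v0)
    h0 = _sample(q0)
    p0 = _sigmoid(c + numpy.dot(h0, w.T))      # P(v = 1 | h0)
    v1 = _sample(p0)                           # one-step reconstruction
    q1 = _sigmoid(b + numpy.dot(v1, w))        # P(h = 1 | v1)
    dw = lr * (numpy.dot(v0.T, h0) - numpy.dot(v1.T, q1)) / n + momentum * prev[0]
    db = lr * numpy.sum(h0 - q1, axis=0) / n + momentum * prev[1]
    dc = lr * numpy.sum(v0 - v1, axis=0) / n + momentum * prev[2]
    w *= (1.0 - weight_decay)
    w += dw; b += db; c += dc
    return dw, db, dc                          # pass back in as `prev` next call
# ------------------------------------------------------------------------------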
diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/graph.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/simple_autoassociator/graph.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,26 @@ +""" +Theano graph for a simple autoassociator. +@todo: Make nearly everything private. +""" + +from pylearn.nnet_ops import sigmoid, binary_crossentropy +from theano import tensor as t +from theano.tensor import dot +x = t.dmatrix() +w1 = t.dmatrix() +b1 = t.dvector() +w2 = t.dmatrix() +b2 = t.dvector() +h = sigmoid(dot(x, w1) + b1) +y = sigmoid(dot(h, w2) + b2) + +loss_unsummed = binary_crossentropy(y, x) +loss = t.sum(loss_unsummed) + +(gw1, gb1, gw2, gb2) = t.grad(loss, [w1, b1, w2, b2]) + +import theano.compile + +inputs = [x, w1, b1, w2, b2] +outputs = [y, h, loss, gw1, gb1, gw2, gb2] +trainfn = theano.compile.function(inputs, outputs) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/main.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/simple_autoassociator/main.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/python +""" + A simple autoassociator. + + The learned model is:: + h = sigmoid(dot(x, w1) + b1) + y = sigmoid(dot(h, w2) + b2) + + Binary xent loss. +""" + + +import numpy + +nonzero_instances = [] +nonzero_instances.append({0: 1, 1: 1}) +nonzero_instances.append({0: 1, 2: 1}) + +#nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) +#nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) +##nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) + +import model +model = model.Model(input_dimension=10, hidden_dimension=4) + +for i in xrange(100000): +# # Select an instance +# instance = nonzero_instances[i % len(nonzero_instances)] + + # Update over instance + model.update(nonzero_instances) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/model.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/simple_autoassociator/model.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,71 @@ +""" +The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason +Weston's sampling trick (2008). +""" + +from graph import trainfn +import parameters + +import numpy +import random + +import pylearn.sparse_instance + +class Model: + """ + @todo: Add momentum. + @todo: Add learning rate decay schedule. + """ + def __init__(self, input_dimension, hidden_dimension, learning_rate = 0.1, weight_decay = 0.0002, random_seed = 666): + self.input_dimension = input_dimension + self.hidden_dimension = hidden_dimension + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.random_seed = random_seed + + random.seed(random_seed) + + self.parameters = parameters.Parameters(input_dimension=self.input_dimension, hidden_dimension=self.hidden_dimension, randomly_initialize=True, random_seed=self.random_seed) + + def deterministic_reconstruction(self, x): + (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) + return y + + def update(self, instances): + """ + Update the L{Model} using one training instance. + @param instances: A list of dict from feature index to (non-zero) value. + @todo: Should assert that nonzero_indices and zero_indices + are correct (i.e. are truly nonzero/zero). + @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as done in Semantic Hashing? 
+ @todo: Decay the biases too? + """ + minibatch = len(instances) + x = pylearn.sparse_instance.to_vector(instances, self.input_dimension) + + (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) +# print +# print "instance:", instance +# print "x:", x +# print "OLD y:", y + print "OLD total loss:", loss +# print "gw1:", gw1 +# print "gb1:", gb1 +# print "gw2:", gw2 +# print "gb2:", gb2 + + self.parameters.w1 *= (1 - self.weight_decay) + self.parameters.w2 *= (1 - self.weight_decay) + + # SGD update + self.parameters.w1 -= self.learning_rate * gw1 / minibatch + self.parameters.b1 -= self.learning_rate * gb1 / minibatch + self.parameters.w2 -= self.learning_rate * gw2 / minibatch + self.parameters.b2 -= self.learning_rate * gb2 / minibatch + +# # Recompute the loss, to make sure it's descreasing +# (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) +## print "NEW y:", y +# print "NEW total loss:", loss +## print "h:", h +## print self.parameters diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/simple_autoassociator/parameters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/simple_autoassociator/parameters.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,36 @@ +""" +Parameters (weights) used by the L{Model}. +""" + +import numpy + +class Parameters: + """ + Parameters used by the L{Model}. + """ + def __init__(self, input_dimension, hidden_dimension, randomly_initialize, random_seed): + """ + Initialize L{Model} parameters. + @param randomly_initialize: If True, then randomly initialize + according to the given seed. If False, then just use zeroes. + """ + if randomly_initialize: + numpy.random.seed(random_seed) + self.w1 = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension + self.w2 = (numpy.random.rand(hidden_dimension, input_dimension)-0.5)/hidden_dimension + self.b1 = numpy.zeros(hidden_dimension) + self.b2 = numpy.zeros(input_dimension) + #self.b2 = numpy.array([10, 0, 0, -10]) + else: + self.w1 = numpy.zeros((input_dimension, hidden_dimension)) + self.w2 = numpy.zeros((hidden_dimension, input_dimension)) + self.b1 = numpy.zeros(hidden_dimension) + self.b2 = numpy.zeros(input_dimension) + + def __str__(self): + s = "" + s += "w1: %s\n" % self.w1 + s += "b1: %s\n" % self.b1 + s += "w2: %s\n" % self.w2 + s += "b2: %s\n" % self.b2 + return s diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_instance.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_instance.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,22 @@ +""" +Sparse instances. +Each instance is represented as dict with key dimension. +Dimensions not present in the dict have value 0. +""" + +from numpy import zeros + +def to_vector(instances, dimensions): + """ + Convert sparse instances to vectors. + @type instances: list of sparse instances + @param dimensions: The number of dimensions in each instance. + @rtype: numpy matrix (instances x dimensions) + @todo: Allow this function to convert SINGLE instances (not lists). 
+ """ + v = zeros((len(instances), dimensions)) + l = len(instances) + for i in range(l): + for idx in instances[i].keys(): + v[i][idx] = instances[i][idx] + return v diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/README.txt Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,1 @@ +Since simple_aa doesn't work, this probably doesn't either. diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/globals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/globals.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,13 @@ +""" +Global variables. +""" + +INPUT_DIMENSION = 1000 +HIDDEN_DIMENSION = 20 +LEARNING_RATE = 0.1 +LR = LEARNING_RATE +SEED = 666 +ZERO_SAMPLE_SIZE = 50 +#ZERO_SAMPLE_SIZE = 250 +MARGIN = 0.25 +#MARGIN = 0.0 diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/graph.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/graph.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,42 @@ +""" +Theano graph for an autoassociator for sparse inputs, which will be trained +using Ronan Collobert + Jason Weston's sampling trick (2008). +@todo: Make nearly everything private. +""" + +from globals import MARGIN + +from pylearn.nnet_ops import sigmoid, binary_crossentropy +from theano import tensor as t +from theano.tensor import dot +xnonzero = t.dvector() +w1nonzero = t.dmatrix() +b1 = t.dvector() +w2nonzero = t.dmatrix() +w2zero = t.dmatrix() +b2nonzero = t.dvector() +b2zero = t.dvector() +h = sigmoid(dot(xnonzero, w1nonzero) + b1) +ynonzero = sigmoid(dot(h, w2nonzero) + b2nonzero) +yzero = sigmoid(dot(h, w2zero) + b2zero) + +# May want to weight loss wrt nonzero value? e.g. MARGIN violation for +# 0.1 nonzero is not as bad as MARGIN violation for 0.2 nonzero. +def hingeloss(MARGIN): + return -MARGIN * (MARGIN < 0) +nonzeroloss = hingeloss(ynonzero - t.max(yzero) - MARGIN) +zeroloss = hingeloss(-t.max(-(ynonzero)) - yzero - MARGIN) +# xnonzero sensitive loss: +#nonzeroloss = hingeloss(ynonzero - t.max(yzero) - MARGIN - xnonzero) +#zeroloss = hingeloss(-t.max(-(ynonzero - xnonzero)) - yzero - MARGIN) +loss = t.sum(nonzeroloss) + t.sum(zeroloss) + +#loss = t.sum(binary_crossentropy(ynonzero, xnonzero)) + t.sum(binary_crossentropy(yzero, t.constant(0))) + +(gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = t.grad(loss, [w1nonzero, b1, w2nonzero, w2zero, b2nonzero, b2zero]) + +import theano.compile + +inputs = [xnonzero, w1nonzero, b1, w2nonzero, w2zero, b2nonzero, b2zero] +outputs = [ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero] +trainfn = theano.compile.function(inputs, outputs) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/main.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/main.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,48 @@ +#!/usr/bin/python +""" + An autoassociator for sparse inputs, using Ronan Collobert + Jason + Weston's sampling trick (2008). + + The learned model is:: + h = sigmoid(dot(x, w1) + b1) + y = sigmoid(dot(h, w2) + b2) + + We assume that most of the inputs are zero, and hence that + we can separate x into xnonzero, x's nonzero components, and + xzero, a sample of the zeros. 
We sample---randomly without + replacement---ZERO_SAMPLE_SIZE zero columns from x. + + The desideratum is that every nonzero entry is separated from every + zero entry by margin at least MARGIN. + For each ynonzero, we want it to exceed max(yzero) by at least MARGIN. + For each yzero, we want it to be exceed by min(ynonzero) by at least MARGIN. + The loss is a hinge loss (linear). The loss is irrespective of the + xnonzero magnitude (this may be a limitation). Hence, all nonzeroes + are equally important to exceed the maximum yzero. + + (Alternately, there is a commented out binary xent loss.) + + LIMITATIONS: + - Only does pure stochastic gradient (batchsize = 1). + - Loss is irrespective of the xnonzero magnitude. + - We will always use all nonzero entries, even if the training + instance is very non-sparse. +""" + + +import numpy + +nonzero_instances = [] +nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) +nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) +nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) + +import model +model = model.Model() + +for i in xrange(100000): + # Select an instance + instance = nonzero_instances[i % len(nonzero_instances)] + + # SGD update over instance + model.update(instance) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/model.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/model.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,76 @@ +""" +The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason +Weston's sampling trick (2008). +""" + +from graph import trainfn +import parameters + +import globals +from globals import LR + +import numpy +import random +random.seed(globals.SEED) + +def _select_indices(instance): + """ + Choose nonzero and zero indices (feature columns) of the instance. + We select B{all} nonzero indices. + We select L{globals.ZERO_SAMPLE_SIZE} zero indices randomly, + without replacement. + @bug: If there are not ZERO_SAMPLE_SIZE zeroes, we will enter + an endless loop. + @return: (nonzero_indices, zero_indices) + """ + # Get the nonzero indices + nonzero_indices = instance.keys() + nonzero_indices.sort() + + # Get the zero indices + # @bug: If there are not ZERO_SAMPLE_SIZE zeroes, we will enter an endless loop. + zero_indices = [] + while len(zero_indices) < globals.ZERO_SAMPLE_SIZE: + idx = random.randint(0, globals.INPUT_DIMENSION - 1) + if idx in nonzero_indices or idx in zero_indices: continue + zero_indices.append(idx) + zero_indices.sort() + + return (nonzero_indices, zero_indices) + +class Model: + def __init__(self): + self.parameters = parameters.Parameters(randomly_initialize=True) + + def update(self, instance): + """ + Update the L{Model} using one training instance. + @param instance: A dict from feature index to (non-zero) value. + @todo: Should assert that nonzero_indices and zero_indices + are correct (i.e. are truly nonzero/zero). + """ + (nonzero_indices, zero_indices) = _select_indices(instance) + # No update if there aren't any non-zeros. 
+ if len(nonzero_indices) == 0: return + xnonzero = numpy.asarray([instance[idx] for idx in nonzero_indices]) + print + print "xnonzero:", xnonzero + + (ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = trainfn(xnonzero, self.parameters.w1[nonzero_indices, :], self.parameters.b1, self.parameters.w2[:, nonzero_indices], self.parameters.w2[:, zero_indices], self.parameters.b2[nonzero_indices], self.parameters.b2[zero_indices]) + print "OLD ynonzero:", ynonzero + print "OLD yzero:", yzero + print "OLD total loss:", loss + + # SGD update + self.parameters.w1[nonzero_indices, :] -= LR * gw1nonzero + self.parameters.b1 -= LR * gb1 + self.parameters.w2[:, nonzero_indices] -= LR * gw2nonzero + self.parameters.w2[:, zero_indices] -= LR * gw2zero + self.parameters.b2[nonzero_indices] -= LR * gb2nonzero + self.parameters.b2[zero_indices] -= LR * gb2zero + + # Recompute the loss, to make sure it's descreasing + (ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = trainfn(xnonzero, self.parameters.w1[nonzero_indices, :], self.parameters.b1, self.parameters.w2[:, nonzero_indices], self.parameters.w2[:, zero_indices], self.parameters.b2[nonzero_indices], self.parameters.b2[zero_indices]) + print "NEW ynonzero:", ynonzero + print "NEW yzero:", yzero + print "NEW total loss:", loss diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/sparse_random_autoassociator/parameters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/sparse_random_autoassociator/parameters.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,28 @@ +""" +Parameters (weights) used by the L{Model}. +""" + +import numpy +import globals + +class Parameters: + """ + Parameters used by the L{Model}. + """ + def __init__(self, input_dimension=globals.INPUT_DIMENSION, hidden_dimension=globals.HIDDEN_DIMENSION, randomly_initialize=False, seed=globals.SEED): + """ + Initialize L{Model} parameters. + @param randomly_initialize: If True, then randomly initialize + according to the given seed. If False, then just use zeroes. 
+ """ + if randomly_initialize: + numpy.random.seed(seed) + self.w1 = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension + self.w2 = (numpy.random.rand(hidden_dimension, input_dimension)-0.5)/hidden_dimension + self.b1 = numpy.zeros(hidden_dimension) + self.b2 = numpy.zeros(input_dimension) + else: + self.w1 = numpy.zeros((input_dimension, hidden_dimension)) + self.w2 = numpy.zeros((hidden_dimension, input_dimension)) + self.b1 = numpy.zeros(hidden_dimension) + self.b2 = numpy.zeros(input_dimension) diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/statscollector.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/statscollector.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,127 @@ + +# Here is how I see stats collectors: + +def my_stats(graph): + graph.mse=examplewise_mean(square_norm(graph.residue)) + graph.training_loss=graph.regularizer+examplewise_sum(graph.nll) + return [graph.mse,graph.training_loss] + + +# def my_stats(residue,nll,regularizer): +# mse=examplewise_mean(square_norm(residue)) +# training_loss=regularizer+examplewise_sum(nll) +# set_names(locals()) +# return ((residue,nll),(regularizer),(),(mse,training_loss)) +# my_stats_collector = make_stats_collector(my_stats) +# +# where make_stats_collector calls my_stats(examplewise_fields, attributes) to +# construct its update function, and figure out what are the input fields (here "residue" +# and "nll") and input attributes (here "regularizer") it needs, and the output +# attributes that it computes (here "mse" and "training_loss"). Remember that +# fields are examplewise quantities, but attributes are not, in my jargon. +# In the above example, I am highlighting that some operations done in my_stats +# are examplewise and some are not. I am hoping that theano Ops can do these +# kinds of internal side-effect operations (and proper initialization of these hidden +# variables). I expect that a StatsCollector (returned by make_stats_collector) +# knows the following methods: +# stats_collector.input_fieldnames +# stats_collector.input_attribute_names +# stats_collector.output_attribute_names +# stats_collector.update(mini_dataset) +# stats_collector['mse'] +# where mini_dataset has the input_fieldnames() as fields and the input_attribute_names() +# as attributes, and in the resulting dataset the output_attribute_names() are set to the +# proper numeric values. 
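# --- Editor's sketch (not part of the patch) ---------------------------------
# The comment block above sketches the intended StatsCollector interface:
# declare which example-wise fields and which dataset-level attributes are
# consumed, call update() on a (mini)dataset, then read the computed attributes
# by name.  A toy, Theano-free object with that shape, using the my_stats
# example from the top of the file (mse and training_loss); the dict-based
# minibatch format is an illustrative assumption:

class ToyStatsCollector(object):
    input_field_names = ('residue', 'nll')        # example-wise quantities
    input_attribute_names = ('regularizer',)      # dataset-level quantities
    output_attribute_names = ('mse', 'training_loss')

    def update(self, minibatch):
        residue = minibatch['residue']            # list of per-example vectors
        nll = minibatch['nll']                    # list of per-example scalars
        sq_norms = [sum(r_i * r_i for r_i in r) for r in residue]
        self._out = {
            'mse': sum(sq_norms) / float(len(sq_norms)),
            'training_loss': minibatch['regularizer'] + sum(nll),
        }

    def __getitem__(self, name):
        return self._out[name]

# sc = ToyStatsCollector()
# sc.update({'residue': [[0.1, -0.2], [0.3, 0.0]], 'nll': [0.7, 0.4], 'regularizer': 0.01})
# sc['mse'], sc['training_loss']
# ------------------------------------------------------------------------------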
+ + + +import theano +from theano import tensor as t +from Learner import Learner +from lookup_list import LookupList + +class StatsCollectorModel(AttributesHolder): + def __init__(self,stats_collector): + self.stats_collector = stats_collector + self.outputs = LookupList(stats_collector.output_names,[None for name in stats_collector.output_names]) + # the statistics get initialized here + self.update_function = theano.function(input_attributes+input_fields,output_attributes+output_fields,linker="c|py") + for name,value in self.outputs.items(): + self.__setattribute__(name,value) + def update(self,dataset): + input_fields = dataset.fields()(self.stats_collector.input_field_names) + input_attributes = dataset.getAttributes(self.stats_collector.input_attribute_names) + self.outputs._values = self.update_function(input_attributes+input_fields) + for name,value in self.outputs.items(): + self.__setattribute__(name,value) + def __call__(self): + return self.outputs + def attributeNames(self): + return self.outputs.keys() + +class StatsCollector(AttributesHolder): + + def __init__(self,input_attributes, input_fields, outputs): + self.input_attributes = input_attributes + self.input_fields = input_fields + self.outputs = outputs + self.input_attribute_names = [v.name for v in input_attributes] + self.input_field_names = [v.name for v in input_fields] + self.output_names = [v.name for v in output_attributes] + + def __call__(self,dataset=None): + model = StatsCollectorModel(self) + if dataset: + self.update(dataset) + return model + +if __name__ == '__main__': + def my_statscollector(): + regularizer = t.scalar() + nll = t.matrix() + class_error = t.matrix() + total_loss = regularizer+t.examplewise_sum(nll) + avg_nll = t.examplewise_mean(nll) + avg_class_error = t.examplewise_mean(class_error) + for name,val in locals().items(): val.name = name + return StatsCollector([regularizer],[nll,class_error],[total_loss,avg_nll,avg_class_error]) + + + + +# OLD DESIGN: +# +# class StatsCollector(object): +# """A StatsCollector object is used to record performance statistics during training +# or testing of a learner. It can be configured to measure different things and +# accumulate the appropriate statistics. From these statistics it can be interrogated +# to obtain performance measures of interest (such as maxima, minima, mean, standard +# deviation, standard error, etc.). Optionally, the observations can be weighted +# (yielded weighted mean, weighted variance, etc., where applicable). The statistics +# that are desired can be specified among a list supported by the StatsCollector +# class or subclass. When some statistics are requested, others become automatically +# available (e.g., sum or mean).""" +# +# default_statistics = [mean,standard_deviation,min,max] +# +# __init__(self,n_quantities_observed, statistics=default_statistics): +# self.n_quantities_observed=n_quantities_observed +# +# clear(self): +# raise NotImplementedError +# +# update(self,observations): +# """The observations is a numpy vector of length n_quantities_observed. Some +# entries can be 'missing' (with a NaN entry) and will not be counted in the +# statistics.""" +# raise NotImplementedError +# +# __getattr__(self, statistic) +# """Return a particular statistic, which may be inferred from the collected statistics. 
+# The argument is a string naming that statistic.""" + + + + + + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/sandbox/test_speed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/test_speed.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,79 @@ +import numpy +from pylearn.datasets import * +from misc import * +def test_speed(array, ds): + print "test_speed", ds.__class__ + + mat = numpy.random.rand(400,100) + + @print_timing + def f_array_full(a): + a+1 + @print_timing + def f_array_index(a): + for id in range(a.shape[0]): +# pass + a[id]+1 +# a[id]*mat + @print_timing + def f_array_iter(a): + for r in a: +# pass + r+1 +# r*mat + @print_timing + def f_ds_index(ds): + for id in range(len(ds)): +# pass + ds[id][0]+1 +# ds[id][0]*mat + @print_timing + def f_ds_iter(ds): + for ex in ds: +# pass + ex[0]+1 +# a[0]*mat + @print_timing + def f_ds_mb1(ds,mb_size): + for exs in ds.minibatches(minibatch_size = mb_size): + for ex in exs: +# pass + ex[0]+1 +# ex[0]*mat + @print_timing + def f_ds_mb2(ds,mb_size): + for exs in ds.minibatches(minibatch_size = mb_size): +# pass + exs[0]+1 +# ex[0]*mat + + f_array_full(array) + f_array_index(array) + f_array_iter(array) + + f_ds_index(ds) + f_ds_iter(ds) + + f_ds_mb1(ds,10) + f_ds_mb1(ds,100) + f_ds_mb1(ds,1000) + f_ds_mb1(ds,10000) + f_ds_mb2(ds,10) + f_ds_mb2(ds,100) + f_ds_mb2(ds,1000) + f_ds_mb2(ds,10000) + +if __name__=='__main__': + a2 = numpy.random.rand(100000,400) + ds1 = ArrayDataSet(a2,{'all':slice(0,a2.shape[1],1)}) + test_speed(a2,ds1) + a1 = numpy.random.rand(100000,40) + ds4 = ArrayDataSet(a1,LookupList(["f"+str(x)for x in range(a1.shape[1])], + range(a1.shape[1]))) + test_speed(a2,ds4) + ds2=CachedDataSet(ds1,cache_all_upon_construction=False) + test_speed(a2,ds2) + ds3=CachedDataSet(ds1,cache_all_upon_construction=True) + test_speed(a2,ds3) + del a2,ds1,ds2,ds3 + diff -r 27b1344a57b1 -r 8fff4bc26f4c pylearn/version.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/version.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,292 @@ +import subprocess as _subprocess +import imp as _imp +import sys +import os + + +_cache = dict() + +def src_version(module_name): + """Return compact identifier of module code. + + @return: compact identifier of module code. + @rtype: string + + @note: This function tries to establish that the source files and the repo + are syncronized. It raises an Exception if there are un-tracked '.py' + files, or if there are un-committed modifications. This implementation uses + "hg id" to establish this. The code returned by "hg id" is not affected by + hg pull, but pulling might remove the " tip" string which might have + appeared. This implementation ignores the " tip" information, and only + uses the code. + + @note: This implementation is assumes that the import directory is under + version control by mercurial. + + """ + + if module_name not in _cache: + + try : + location = _imp.find_module(module_name)[1] + except ImportError: + _cache[module_name] = None + return None + #print 'location:', location + isdir = False + if os.path.isdir(location) : + isdir = True + elif os.path.isfile(location) : + isdir = False + else : + # SEEMS THIS CASE EXIST, FOR WEIRD BUILTIN FUNCTIONS + #print location,": it's 'not a dir, it's not a file, it's superman!" + #raise Exception('Unknown location or file type') + _cache[module_name] = None + return None + + + # we're dealing with a dir + if isdir : + + # under hg? 
+ if not os.path.exists( os.path.join( location , '.hg') ) : + _cache[module_name] = None + return None + + status = _subprocess.Popen(('hg','st'),cwd=location,stdout=_subprocess.PIPE).communicate()[0] + #print 'status =', status + #TODO: check that the process return code is 0 (ticket #45) + + #status_codes = [line[0] for line in if line and line[0] != '?'] + for line in status.split('\n'): + if not line: continue + if line[0] != '?': + raise Exception('Uncommitted modification to "%s" in %s (%s)' + %(line[2:], __name__,location)) + if line[0] == '?' and line[-3:] == '.py': + raise Exception('Untracked file "%s" in %s (%s)' + %(line[2:], __name__, location)) + + hg_id = _subprocess.Popen(('hg','id'),cwd=location,stdout=_subprocess.PIPE).communicate()[0] + + # This asserts my understanding of hg id return values + # There is mention in the doc that it might return two parent hash codes + # but I've never seen it, and I dont' know what it means or how it is + # formatted. + tokens = hg_id.split(' ') + assert len(tokens) <= 2 + assert len(tokens) >= 1 + assert tokens[0][-1] != '+' # the trailing + indicates uncommitted changes + if len(tokens) == 2: + assert tokens[1] == 'tip\n' + + _cache[module_name] = tokens[0] + + # we're dealing with a file + if not isdir : + + folder = os.path.split( os.path.abspath(location) )[0] + # under hg? + if not os.path.exists( os.path.join( folder , '.hg') ) : + _cache[module_name] = None + return None + + status = _subprocess.Popen(('hg','st',location),cwd=folder,stdout=_subprocess.PIPE).communicate()[0] + #print 'status =', status + + #status_codes = [line[0] for line in if line and line[0] != '?'] + for line in status.split('\n'): + if not line: continue + if line[0] != '?': + raise Exception('Uncommitted modification to "%s" in %s (%s)' + %(line[2:], location,folder)) + if line[0] == '?' and line[-3:] == '.py': + raise Exception('Untracked file "%s" in %s (%s)' + %(line[2:], location, folder)) + + hg_id = _subprocess.Popen(('hg','id'),cwd=folder,stdout=_subprocess.PIPE).communicate()[0] + + # This asserts my understanding of hg id return values + # There is mention in the doc that it might return two parent hash codes + # but I've never seen it, and I dont' know what it means or how it is + # formatted. + tokens = hg_id.split(' ') + assert len(tokens) <= 2 + assert len(tokens) >= 1 + if tokens[0][-1] == '+' : + tokens[0] = tokens[0][:-1] # the change was not on this file + if len(tokens) == 2: + assert tokens[1] == 'tip\n' + + _cache[module_name] = tokens[0] + + + return _cache[module_name] + +_unknown_version = 'unknown version' + +def hg_version(dirname, filenames=None): + """Return current changeset of directory I{dirname}. + + @type filename: list of str (or default: None) + @param filename: if specified, we ignore modifications to other files. 
+ + @rtype: tuple (last changeset, modified) + + """ + if type(filenames) not in (list, tuple, type(None)): + raise TypeError(filenames) + + #may raise exception, for example if hg is not visible via PATH + status_proc = _subprocess.Popen(('hg','st'), cwd=dirname, + stdout=_subprocess.PIPE, stderr=_subprocess.PIPE) + status = status_proc.communicate()[0] #read stdout into buffer + if status_proc.returncode != 0: + raise OSError('hg returned %i, maybe %s is not under hg control?', + (status_proc.returncode, dirname)) + + #may raise exception, for example if hg is not visible via PATH + id_proc = _subprocess.Popen(('hg','id', '-i'), cwd=dirname, + stdout=_subprocess.PIPE, stderr=_subprocess.PIPE) + id_stdout = id_proc.communicate()[0] + if id_proc.returncode != 0: + raise OSError('hg returned %i, maybe %s is not under hg control?', + (id_proc.returncode, dirname)) + + care_about = (lambda some_file : True) if filenames is None \ + else (lambda some_file : some_file in filenames) + + # parse status codes for what we care about + care_about_mod = False + for line in status.split('\n'): + if not line: #empty lines happen + continue + line_file = line[2:] + if line[0] != '?' and care_about(line_file): + care_about_mod = True + #raise Exception('Uncommitted modification', + #os.path.join(dirname, line_file)) + if line[0] == '?' and line[-3:] == '.py': + print >> sys.stderr, 'WARNING: untracked file', os.path.join(dirname, line_file) + + # id_stdout is 12 hex digits followed by '+\n' or '\n' + # return the trailing '+' character only if there were changes to files that + # the caller cares about (named in filenames) + modified = (id_stdout[12] == '+') + assert len(id_stdout) in (13, 14) #sanity check + if modified and care_about_mod : + return id_stdout[:13] + else: + return id_stdout[:12] + +def _import_id_py_source(location): + try: + dirname = os.path.dirname(location[1]) + basename = os.path.basename(location[1]) + return hg_version(dirname, [basename]) + except OSError, e: + print >> sys.stderr, 'IGNORNING', e + return _unknown_version + ' PY_SOURCE' + +def _import_id_py_compiled(location): + #a .pyc file was found, but no corresponding .py + return _unknown_version + ' PYC_COMPILED' + +def _import_id_pkg_directory(location): + try: + return hg_version(location[1]) + except OSError, e: + print >> sys.stderr, 'IGNORNING', e + return _unknown_version + ' PKG_DIRECTORY' + +def _import_id(tag): + try : + location = _imp.find_module(tag) + except ImportError, e: #raise when tag is not found + return e #put this in the cache, import_id will raise it + + #the find_module was successful, location is valid + resource_type = location[2][2] + + if resource_type == _imp.PY_SOURCE: + return _import_id_py_source(location) + if resource_type == _imp.PY_COMPILED: + return _import_id_py_compiled(location) + if resource_type == _imp.C_EXTENSION: + raise NoteImplementedError + if resource_type == _imp.PY_RESOURCE: + raise NoteImplementedError + if resource_type == _imp.PKG_DIRECTORY: + return _import_id_pkg_directory(location) + if resource_type == _imp.C_BUILTIN: + raise NoteImplementedError + if resource_type == _imp.PY_FROZEN: + raise NoteImplementedError + + assert False #the list of resource types above should be exhaustive + +def import_id(tag): + """Return an identifier of the code imported by 'import '. + + @param tag: a module or file name + @type tag: string + + @rtype: string + @return: identifier of the code imported by 'import '. 
+ + This high-level function might do different things depending on, for + example, whether I{tag} identifies a file or a directory, or whether the + named entity is under some sort of version/revision control. + + Versions are sought in the following order: + 0. If I{tag} is 'python' then sys.version will be returned + 1. If I{tag} names a file or folder under revision control, this function + will attempt to guess which one, and return a string that identifies the + running code (a revision id, not the whole file!) + 2. If I{tag} names a module with a __version__ attribute, then that + attribute will be returned as a string. + 3. The string starting with 'unknown version' will be returned for other valid modules. + 4. An exception will be raise for non-existent modules. + + @note: This function may import the named entity in order to return a + __version__ module attribute. + + """ + if tag not in import_id.cache: + import_id.cache[tag] = _import_id(tag) + + #in the case of bad module names, we cached the ImportError exception + rval = import_id.cache[tag] + if isinstance(rval, Exception): + raise rval + return rval +import_id.cache = {'python':sys.version} + +def get_all_src_versions() : + """ + Get the version of all loaded module. + Calls src_version on all loaded modules. These modules are found + using sys.modules. + + Returns a dictionnary: name->version. + + @RETURN dict Dictionnary (module's name) -> (version) + @SEE src_version + """ + allmodules = sys.modules + d = dict() + for m in allmodules : + try: + d[m] = import_id(m) + except: + pass + return d + + +if __name__ == "__main__" : + + if len(sys.argv) == 2 : + print 'testing on', sys.argv[1] + print import_id(sys.argv[1]) + diff -r 27b1344a57b1 -r 8fff4bc26f4c random_transformation.py --- a/random_transformation.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -""" -New L{Op}s that aren't in core theano -""" - -from theano import sparse -from theano import tensor -from theano import scalar -from theano.gof import op - -from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result - -import scipy.sparse - -import numpy - -class RowRandomTransformation(op.Op): - """ - Given C{x}, a (sparse) matrix with shape (exmpls, dimensions), we - multiply it by a deterministic random matrix of shape (dimensions, - length) to obtain random transformation output of shape (exmpls, - length). - - Each element of the deterministic random matrix is selected uniformly - from [-1, +1). - @todo: Use another random distribution? - - @note: This function should be written such that if length is - increased, we obtain the same results (except longer). Similarly, - the rows should be able to be permuted and get the same result in - the same fashion. - - @todo: This may be slow? - @todo: Rewrite for dense matrices too? - @todo: Is there any way to verify the convention that each row is - an example? Should I rename the variables in the code to make the - semantics more explicit? - @todo: AUTOTEST: Autotest that dense and spare versions of this are identical. - @todo: Rename? Is Row the correct name? Maybe column-wise? - - @type x: L{scipy.sparse.spmatrix} - @param x: Sparse matrix to be randomly transformed with shape (exmpls, dimensions) - @type length: int - @param length: The number of transformations of C{x} to be performed. - @param initial_seed: Initial seed for the RNG. 
- @rtype: L{numpy.ndarray} - @return: Array with C{length} random transformations, with shape (exmpls, length) - """ - - import random - """ - RNG used for random transformations. - Does not share state with rest of program. - @todo: Make STATIC and private. Ask James or Olivier how to make this more Pythonic. - """ - _trng = random.Random() - - def __init__(self, x, length, initial_seed=0, **kwargs): - """ - @todo: Which broadcastable values should I use? - """ - assert 0 # Needs to be updated to Olivier's new Op creation approach - op.Op.__init__(self, **kwargs) - x = sparse.as_sparse(x) - self.initial_seed = initial_seed - self.length = length - self.inputs = [x] - self.outputs = [tensor.Tensor(x.dtype, broadcastable=[False, False])] -# self.outputs = [tensor.Tensor(x.dtype, broadcastable=[True, True])] - - def _random_matrix_value(self, row, col, rows): - """ - From a deterministic random matrix, find one element. - @param row: The row of the element to be read. - @param col: The column of the element to be read. - @param row: The number of rows in the matrix. - @type row: int - @type col: int - @type rows: int - @note: This function is designed such that if we extend - the number of columns in the random matrix, the values of - the earlier entries is unchanged. - @todo: Make this static - """ - # Choose the random entry at (l, c) - rngidx = col * rows + row - # Set the random number state for this random entry - # Note: This may be slow - self._trng.seed(rngidx + self.initial_seed) - - # Determine the value for this entry - val = self._trng.uniform(-1, +1) -# print "Exmpl #%d, dimension #%d => Random projection #%d has idx %d (+ seed %d) and value %f" % (r, c, j, rngidx, self.initial_seed, val) - return val - - def impl(self, xorig): - assert _is_sparse(xorig) - assert len(xorig.shape) == 2 - # Since conversions to and from the COO format are quite fast, you - # can use this approach to efficiently implement lots computations - # on sparse matrices. - x = xorig.tocoo() - (rows, cols) = x.shape - tot = rows * cols - out = numpy.zeros((rows, self.length)) -# print "l = %d" % self.length -# print "x.getnnz() = %d" % x.getnnz() - all = zip(x.col, x.row, x.data) - all.sort() # TODO: Maybe this is very slow? - lastc = None - lastl = None - lastval = None - for l in range(self.length): - for (c, r, data) in all: - assert c < cols - assert r < rows - if not c == lastc or not l == lastl: - lastc = c - lastl = l - lastval = self._random_matrix_value(c, l, cols) - val = lastval -# val = self._random_matrix_value(c, l, cols) -# val = self._trng.uniform(-1, +1) -# val = 1.0 - out[r][l] += val * data - return out - def __copy__(self): - return self.__class__(self.inputs[0], self.length, self.initial_seed) - def clone_with_new_inputs(self, *new_inputs): - return self.__class__(new_inputs[0], self.length, self.initial_seed) - def desc(self, *new_inputs): - return (self.__class__, self.length, self.initial_seed) -row_random_transformation = RowRandomTransformation() diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/README.txt --- a/sandbox/README.txt Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -Stuff in the sandbox may be very broken and/or in flux. 
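# --- Editor's sketch (not part of the patch) ---------------------------------
# RowRandomTransformation (deleted above, re-added under
# pylearn/sandbox/random_transformation.py earlier in this changeset) never
# materializes its projection matrix: entry (dim, proj) is drawn by re-seeding
# a private RNG with proj * n_dims + dim + initial_seed and taking one
# uniform(-1, +1) sample, so extending the output length leaves earlier columns
# unchanged.  The trick in isolation, for a dense input (pure Python, no
# Theano; function names are local to this sketch):

import random

def random_matrix_entry(dim, proj, n_dims, initial_seed=0):
    rng = random.Random()
    rng.seed(proj * n_dims + dim + initial_seed)  # deterministic per (dim, proj)
    return rng.uniform(-1, +1)

def project_dense(x_rows, n_dims, length, initial_seed=0):
    # Multiply a list of dense rows (each of length n_dims) by the implicit
    # (n_dims x length) random matrix.
    out = []
    for row in x_rows:
        out.append([sum(row[d] * random_matrix_entry(d, l, n_dims, initial_seed)
                        for d in range(n_dims))
                    for l in range(length)])
    return out
# ------------------------------------------------------------------------------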
diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/denoising_aa.py --- a/sandbox/denoising_aa.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,224 +0,0 @@ -""" -A denoising auto-encoder - -@warning: You should use this interface. It is not complete and is not functional. -Instead, use:: - ssh://projects@lgcm.iro.umontreal.ca/repos/denoising_aa -""" - -import theano -from theano.formula import * -from learner import * -from theano import tensor as t -from nnet_ops import * -import math -from misc import * -from misc_theano import * -from theano.tensor_random import binomial - -def hiding_corruption_formula(seed,average_fraction_hidden): - """ - Return a formula for the corruption process, in which a random - subset of the input numbers are hidden (mapped to 0). - - @param seed: seed of the random generator - @type seed: anything that numpy.random.RandomState accepts - - @param average_fraction_hidden: the probability with which each - input number is hidden (set to 0). - @type average_fraction_hidden: 0 <= real number <= 1 - """ - class HidingCorruptionFormula(Formulas): - x = t.matrix() - corrupted_x = x * binomial(seed,x,1,fraction_sampled) - - return HidingCorruptionFormula() - -def squash_affine_formula(squash_function=sigmoid): - """ - Simply does: squash_function(b + xW) - By convention prefix the parameters by _ - """ - class SquashAffineFormula(Formulas): - x = t.matrix() # of dimensions minibatch_size x n_inputs - _b = t.row() # of dimensions 1 x n_outputs - _W = t.matrix() # of dimensions n_inputs x n_outputs - a = _b + t.dot(x,_W) # of dimensions minibatch_size x n_outputs - y = squash_function(a) - return SquashAffineFormula() - -def gradient_descent_update_formula(): - class GradientDescentUpdateFormula(Formula): - param = t.matrix() - learning_rate = t.scalar() - cost = t.column() # cost of each example in a minibatch - param_update = t.add_inplace(param, -learning_rate*t.sgrad(cost)) - return gradient_descent_update_formula() - -def probabilistic_classifier_loss_formula(): - class ProbabilisticClassifierLossFormula(Formulas): - a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output - target_class = t.ivector() # dimension (minibatch_size) - nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py - return ProbabilisticClassifierLossFormula() - -def binomial_cross_entropy_formula(): - class BinomialCrossEntropyFormula(Formulas): - a = t.matrix() # pre-sigmoid activations, minibatch_size x dim - p = sigmoid(a) # model prediction - q = t.matrix() # target binomial probabilities, minibatch_size x dim - # using the identity softplus(a) - softplus(-a) = a, - # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a) - nll = -t.sum(q*a - softplus(-a)) - # next line was missing... 
hope it's all correct above - return BinomialCrossEntropyFormula() - -def squash_affine_autoencoder_formula(hidden_squash=t.tanh, - reconstruction_squash=sigmoid, - share_weights=True, - reconstruction_nll_formula=binomial_cross_entropy_formula(), - update_formula=gradient_descent_update_formula): - if share_weights: - autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a') + \ - squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c') + \ - reconstruction_nll_formula - else: - autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a',_W='_W1') + \ - squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c',_W='_W2') + \ - reconstruction_nll_formula - autoencoder = autoencoder + [update_formula().rename(cost = 'nll', - param = p) - for p in autoencoder.get_all('_.*')] - return autoencoder - - -# @todo: try other corruption formulae. The above is the default one. -# not quite used in the ICML paper... (had a fixed number of 0s). - -class DenoisingAutoEncoder(LearningAlgorithm): - - def __init__(self,n_inputs,n_hidden_per_layer, - learning_rate=0.1, - max_n_epochs=100, - L1_regularizer=0, - init_range=1., - corruption_formula = hiding_corruption_formula(), - autoencoder = squash_affine_autoencoder_formula(), - minibatch_size=None,linker = "c|py"): - for name,val in locals().items(): - if val is not self: self.__setattribute__(name,val) - self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x') - - def __call__(self, training_set=None): - """ Allocate and optionnaly train a model - - @TODO enables passing in training and valid sets, instead of cutting one set in 80/20 - """ - model = DenoisingAutoEncoderModel(self) - if training_set: - print 'DenoisingAutoEncoder(): what do I do if training_set????' - # copied from old mlp_factory_approach: - if len(trainset) == sys.maxint: - raise NotImplementedError('Learning from infinite streams is not supported') - nval = int(self.validation_portion * len(trainset)) - nmin = len(trainset) - nval - assert nmin >= 0 - minset = trainset[:nmin] #real training set for minimizing loss - valset = trainset[nmin:] #validation set for early stopping - best = model - for stp in self.early_stopper(): - model.update( - minset.minibatches([input, target], minibatch_size=min(32, - len(trainset)))) - #print 'mlp.__call__(), we did an update' - if stp.set_score: - stp.score = model(valset, ['loss_01']) - if (stp.score < stp.best_score): - best = copy.copy(model) - model = best - # end of the copy from mlp_factory_approach - - return model - - - def compile(self, inputs, outputs): - return theano.function(inputs,outputs,unpack_single=False,linker=self.linker) - -class DenoisingAutoEncoderModel(LearnerModel): - def __init__(self,learning_algorithm,params): - self.learning_algorithm=learning_algorithm - self.params=params - v = learning_algorithm.v - self.update_fn = learning_algorithm.compile(learning_algorithm.denoising_autoencoder_formula.inputs, - learning_algorithm.denoising_autoencoder_formula.outputs) - - def update(self, training_set, train_stats_collector=None): - - print 'dont update you crazy frog!' 
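# --- Editor's note (not part of the patch) ------------------------------------
# In binomial_cross_entropy_formula of the denoising_aa.py copy deleted just
# above, the comment uses the identity softplus(a) - softplus(-a) = a to state
# q*log(p) + (1-q)*log(1-p) = q*a - softplus(a) for p = sigmoid(a), yet the nll
# line sums q*a - softplus(-a); the comment's version is the one the algebra
# supports, so the code looks like a sign slip.  A quick numerical check of the
# identity (helper names are local to this sketch):

import numpy

def _softplus(a):
    return numpy.log1p(numpy.exp(a))

def _sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

a = numpy.array([-2.0, -0.3, 0.0, 0.7, 3.0])
q = numpy.array([0.1, 0.9, 0.5, 0.3, 0.8])
p = _sigmoid(a)
direct = q * numpy.log(p) + (1.0 - q) * numpy.log(1.0 - p)
rewritten = q * a - _softplus(a)
assert numpy.allclose(direct, rewritten)
# ------------------------------------------------------------------------------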
- -# old stuff - -# self._learning_rate = t.scalar('learning_rate') # this is the symbol -# self.L1_regularizer = L1_regularizer -# self._L1_regularizer = t.scalar('L1_regularizer') -# self._input = t.matrix('input') # n_examples x n_inputs -# self._W = t.matrix('W') -# self._b = t.row('b') -# self._c = t.row('b') -# self._regularization_term = self._L1_regularizer * t.sum(t.abs(self._W)) -# self._corrupted_input = corruption_process(self._input) -# self._hidden = t.tanh(self._b + t.dot(self._input, self._W.T)) -# self._reconstruction_activations =self._c+t.dot(self._hidden,self._W) -# self._nll,self._output = crossentropy_softmax_1hot(Print("output_activations")(self._output_activations),self._target_vector) -# self._output_class = t.argmax(self._output,1) -# self._class_error = t.neq(self._output_class,self._target_vector) -# self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0] -# OnlineGradientTLearner.__init__(self) - -# def attributeNames(self): -# return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"] - -# def parameterAttributes(self): -# return ["b1","W1", "b2", "W2"] - -# def updateMinibatchInputFields(self): -# return ["input","target"] - -# def updateEndOutputAttributes(self): -# return ["regularization_term"] - -# def lossAttribute(self): -# return "minibatch_criterion" - -# def defaultOutputFields(self, input_fields): -# output_fields = ["output", "output_class",] -# if "target" in input_fields: -# output_fields += ["class_error", "nll"] -# return output_fields - -# def allocate(self,minibatch): -# minibatch_n_inputs = minibatch["input"].shape[1] -# if not self._n_inputs: -# self._n_inputs = minibatch_n_inputs -# self.b1 = numpy.zeros((1,self._n_hidden)) -# self.b2 = numpy.zeros((1,self._n_outputs)) -# self.forget() -# elif self._n_inputs!=minibatch_n_inputs: -# # if the input changes dimension on the fly, we resize and forget everything -# self.forget() - -# def forget(self): -# if self._n_inputs: -# r = self._init_range/math.sqrt(self._n_inputs) -# self.W1 = numpy.random.uniform(low=-r,high=r, -# size=(self._n_hidden,self._n_inputs)) -# r = self._init_range/math.sqrt(self._n_hidden) -# self.W2 = numpy.random.uniform(low=-r,high=r, -# size=(self._n_outputs,self._n_hidden)) -# self.b1[:]=0 -# self.b2[:]=0 -# self._n_epochs=0 - -# def isLastEpoch(self): -# self._n_epochs +=1 -# return self._n_epochs>=self._max_n_epochs diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/gradient_learner.py --- a/sandbox/gradient_learner.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - -from learner import * -from tensor import * -import gradient -from compile import Function - -class GradientLearner(Learner): - """ - Base class for gradient-based optimization of a training criterion - that can consist in two parts, an additive part over examples, and - an example-independent part (usually called the regularizer). - The user provides a Theano formula that maps the fields of a minibatch (each being a tensor with the - same number of rows = minibatch size) and parameters to output fields (for the use function), one of which - must be a cost that is the training criterion to be minimized. Subclasses implement - a training strategy that uses the Theano formula to compute gradients and - to compute outputs in the update method. - The inputs, parameters, and outputs are lists of Theano tensors, - while the example_wise_cost and regularization_term are Theano tensors. 
- The user can specify a regularization coefficient that multiplies the regularization term. - The training algorithm looks for parameters that minimize - regularization_coefficient * regularization_term(parameters) + - sum_{inputs in training_set} example_wise_cost(inputs,parameters) - i.e. the regularization_term should not depend on the inputs, only on the parameters. - The learned function can map a subset of inputs to a subset of outputs (as long as the inputs subset - includes all the inputs required in the Theano expression for the selected outputs). - It is assumed that all the inputs are provided in the training set (as dataset fields - with the corresponding name), but not necessarily when using the learned function. - """ - def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), - regularization_coefficient = astensor(1.0)): - self.inputs = inputs - self.outputs = outputs - self.parameters = parameters - self.example_wise_cost = example_wise_cost - self.regularization_term = regularization_term - self.regularization_coefficient = regularization_coefficient - self.parameters_example_wise_gradient = gradient.grad(example_wise_cost, parameters) - self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization_term, parameters) - if example_wise_cost not in outputs: - outputs.append(example_wise_cost) - if regularization_term not in outputs: - outputs.append(regularization_term) - self.example_wise_gradient_fn = Function(inputs + parameters, - [self.parameters_example_wise_gradient + self.parameters_regularization_gradient]) - self.use_functions = {frozenset([input.name for input in inputs]+[output.name for output in outputs]) - : Function(inputs, outputs)} - - def use(self,input_dataset,output_fields=None,copy_inputs=True): - # obtain the function that maps the desired inputs to desired outputs - input_fields = input_dataset.fieldNames() - # map names of input fields to Theano tensors in self.inputs - input_variables = ??? - if output_fields is None: output_fields = [output.name for output in outputs] - # handle special case of inputs that are directly copied into outputs - # map names of output fields to Theano tensors in self.outputs - output_variables = ??? - use_function_key = input_fields+output_fields - if not self.use_functions.has_key(use_function_key): - self.use_function[use_function_key]=Function(input_variables,output_variables) - use_function = self.use_functions[use_function_key] - # return a dataset that computes the outputs - return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,compute_now=True) - - -class StochasticGradientDescent(object): - def update_parameters(self): - -class StochasticGradientLearner(GradientLearner,StochasticGradientDescent): - def __init__(self,inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), - regularization_coefficient = astensor(1.0),) - def update() diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/rbm/README.txt --- a/sandbox/rbm/README.txt Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -An RBM with binomial units trained with CD-1. -by Joseph Turian - -This seems to work fine. 
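The README above only states that the RBM has binomial units trained with CD-1; as an illustration, here is a minimal numpy sketch of a single CD-1 step (invented shapes and learning rate, independent of the deleted model.py, which additionally applies momentum and weight decay):

import numpy
rng = numpy.random.RandomState(0)

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

def sample(p):
    return (rng.rand(*p.shape) < p).astype(float)

n_visible, n_hidden, lr = 10, 6, 0.1
w = 0.01 * rng.randn(n_visible, n_hidden)
b = numpy.zeros(n_hidden)                        # hidden biases
c = numpy.zeros(n_visible)                       # visible biases

v0 = sample(rng.rand(5, n_visible))              # a toy minibatch of binary visibles
q0 = sigmoid(b + numpy.dot(v0, w))               # P(h=1 | v0): positive phase
h0 = sample(q0)
p1 = sigmoid(c + numpy.dot(h0, w.T))             # P(v=1 | h0)
v1 = sample(p1)                                  # one Gibbs step: the "1" in CD-1
q1 = sigmoid(b + numpy.dot(v1, w))               # negative phase

w += lr * (numpy.dot(v0.T, h0) - numpy.dot(v1.T, q1)) / len(v0)
b += lr * (h0 - q1).sum(axis=0) / len(v0)
c += lr * (v0 - v1).sum(axis=0) / len(v0)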
diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/rbm/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/rbm/main.py --- a/sandbox/rbm/main.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#!/usr/bin/python -""" -Simple SGD RBM training. -(An example of how to use the model.) -""" - - -import numpy - -nonzero_instances = [] -#nonzero_instances.append({0: 1, 1: 1}) -#nonzero_instances.append({0: 1, 2: 1}) - -nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) -nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) -nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) - -import model -model = model.Model(input_dimension=10, hidden_dimension=6) - -for i in xrange(100000): - # Select an instance - instance = nonzero_instances[i % len(nonzero_instances)] - - # SGD update over instance - model.update([instance]) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/rbm/model.py --- a/sandbox/rbm/model.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -""" -The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason -Weston's sampling trick (2008). -""" - -import parameters - -import numpy -from numpy import dot -import random - -import pylearn.nnet_ops -import pylearn.sparse_instance - -def sigmoid(v): - """ - @todo: Move to pylearn.more_numpy - @todo: Fix to avoid floating point overflow. - """ -# if x < -30.0: return 0.0 -# if x > 30.0: return 1.0 - return 1.0 / (1.0 + numpy.exp(-v)) - -def sample(v): - """ - @todo: Move to pylearn.more_numpy - """ - assert len(v.shape) == 2 - x = numpy.zeros(v.shape) - for j in range(v.shape[0]): - for i in range(v.shape[1]): - assert v[j][i] >= 0 and v[j][i] <= 1 - if random.random() < v[j][i]: x[j][i] = 1 - else: x[j][i] = 0 - return x - -def crossentropy(output, target): - """ - Compute the crossentropy of binary output wrt binary target. - @note: We do not sum, crossentropy is computed by component. - @todo: Rewrite as a scalar, and then broadcast to tensor. - @todo: Move to pylearn.more_numpy - @todo: Fix to avoid floating point overflow. - """ - return -(target * numpy.log(output) + (1 - target) * numpy.log(1 - output)) - - -class Model: - """ - @todo: input dimensions should be stored here! not as a global. - """ - def __init__(self, input_dimension, hidden_dimension, learning_rate = 0.1, momentum = 0.9, weight_decay = 0.0002, random_seed = 666): - self.input_dimension = input_dimension - self.hidden_dimension = hidden_dimension - self.learning_rate = learning_rate - self.momentum = momentum - self.weight_decay = weight_decay - self.random_seed = random_seed - - random.seed(random_seed) - - self.parameters = parameters.Parameters(input_dimension=self.input_dimension, hidden_dimension=self.hidden_dimension, randomly_initialize=True, random_seed=self.random_seed) - self.prev_dw = 0 - self.prev_db = 0 - self.prev_dc = 0 - - def deterministic_reconstruction(self, v0): - """ - One up-down cycle, but a mean-field approximation (no sampling). - """ - q = sigmoid(self.parameters.b + dot(v0, self.parameters.w)) - p = sigmoid(self.parameters.c + dot(q, self.parameters.w.T)) - return p - - def deterministic_reconstruction_error(self, v0): - """ - @note: According to Yoshua, -log P(V1 = v0 | tilde(h)(v0)). - """ - return crossentropy(self.deterministic_reconstruction(v0), v0) - - def update(self, instances): - """ - Update the L{Model} using one training instance. - @param instance: A dict from feature index to (non-zero) value. 
- @todo: Should assert that nonzero_indices and zero_indices - are correct (i.e. are truly nonzero/zero). - @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as done in Semantic Hashing? - @todo: Decay the biases too? - """ - minibatch = len(instances) - v0 = pylearn.sparse_instance.to_vector(instances, self.input_dimension) - print "old XENT per instance:", numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch - q0 = sigmoid(self.parameters.b + dot(v0, self.parameters.w)) - h0 = sample(q0) - p0 = sigmoid(self.parameters.c + dot(h0, self.parameters.w.T)) - v1 = sample(p0) - q1 = sigmoid(self.parameters.b + dot(v1, self.parameters.w)) - - dw = self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch + self.momentum * self.prev_dw - db = self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch + self.momentum * self.prev_db - dc = self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch + self.momentum * self.prev_dc - - self.parameters.w *= (1 - self.weight_decay) - - self.parameters.w += dw - self.parameters.b += db - self.parameters.c += dc - - self.last_dw = dw - self.last_db = db - self.last_dc = dc - - print "new XENT per instance:", numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch - -# print -# print "v[0]:", v0 -# print "Q(h[0][i] = 1 | v[0]):", q0 -# print "h[0]:", h0 -# print "P(v[1][j] = 1 | h[0]):", p0 -# print "XENT(P(v[1][j] = 1 | h[0]) | v0):", numpy.sum(crossentropy(p0, v0)) -# print "v[1]:", v1 -# print "Q(h[1][i] = 1 | v[1]):", q1 -# -# print -# print v0.T.shape -# print h0.shape -# print dot(v0.T, h0).shape -# print self.parameters.w.shape -# self.parameters.w += self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch -# print -# print h0.shape -# print q1.shape -# print self.parameters.b.shape -# self.parameters.b += self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch -# print v0.shape, v1.shape -# print -# print self.parameters.c.shape -# self.parameters.c += self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch -# print self.parameters diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/rbm/parameters.py --- a/sandbox/rbm/parameters.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -""" -Parameters (weights) used by the L{Model}. -""" - -import numpy - -class Parameters: - """ - Parameters used by the L{Model}. - """ - def __init__(self, input_dimension, hidden_dimension, randomly_initialize, random_seed): - """ - Initialize L{Model} parameters. - @param randomly_initialize: If True, then randomly initialize - according to the given random_seed. If False, then just use zeroes. - """ - if randomly_initialize: - numpy.random.random_seed(random_seed) - self.w = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension - self.b = numpy.zeros((1, hidden_dimension)) - self.c = numpy.zeros((1, input_dimension)) - else: - self.w = numpy.zeros((input_dimension, hidden_dimension)) - self.b = numpy.zeros((1, hidden_dimension)) - self.c = numpy.zeros((1, input_dimension)) - - def __str__(self): - s = "" - s += "w: %s\n" % self.w - s += "b: %s\n" % self.b - s += "c: %s\n" % self.c - return s diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/README.txt --- a/sandbox/simple_autoassociator/README.txt Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -This seems to work. - -@todo: - * Add momentum. - * Add learning rate decay schedule. 
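The RBM update earlier in this hunk folds momentum and weight decay into the raw CD-1 gradient (note that it stores the step in self.last_dw while reading self.prev_dw, so the momentum term apparently never changes after initialization), and the autoassociator README just above lists momentum as a remaining @todo. A generic, hypothetical sketch of that update pattern:

import numpy

learning_rate, momentum, weight_decay = 0.1, 0.9, 0.0002

def sgd_step(w, grad, prev_dw):
    dw = learning_rate * grad + momentum * prev_dw   # velocity: step plus a fraction of the last step
    w = w * (1.0 - weight_decay)                     # shrink the weights before applying the step
    return w + dw, dw                                # dw must be fed back in as prev_dw next call

w = numpy.zeros((3, 2))
prev_dw = numpy.zeros_like(w)
for _ in range(5):
    grad = numpy.ones_like(w)                        # stand-in gradient
    w, prev_dw = sgd_step(w, grad, prev_dw)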
diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/graph.py --- a/sandbox/simple_autoassociator/graph.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -""" -Theano graph for a simple autoassociator. -@todo: Make nearly everything private. -""" - -from pylearn.nnet_ops import sigmoid, binary_crossentropy -from theano import tensor as t -from theano.tensor import dot -x = t.dmatrix() -w1 = t.dmatrix() -b1 = t.dvector() -w2 = t.dmatrix() -b2 = t.dvector() -h = sigmoid(dot(x, w1) + b1) -y = sigmoid(dot(h, w2) + b2) - -loss_unsummed = binary_crossentropy(y, x) -loss = t.sum(loss_unsummed) - -(gw1, gb1, gw2, gb2) = t.grad(loss, [w1, b1, w2, b2]) - -import theano.compile - -inputs = [x, w1, b1, w2, b2] -outputs = [y, h, loss, gw1, gb1, gw2, gb2] -trainfn = theano.compile.function(inputs, outputs) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/main.py --- a/sandbox/simple_autoassociator/main.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -#!/usr/bin/python -""" - A simple autoassociator. - - The learned model is:: - h = sigmoid(dot(x, w1) + b1) - y = sigmoid(dot(h, w2) + b2) - - Binary xent loss. -""" - - -import numpy - -nonzero_instances = [] -nonzero_instances.append({0: 1, 1: 1}) -nonzero_instances.append({0: 1, 2: 1}) - -#nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) -#nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) -##nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) - -import model -model = model.Model(input_dimension=10, hidden_dimension=4) - -for i in xrange(100000): -# # Select an instance -# instance = nonzero_instances[i % len(nonzero_instances)] - - # Update over instance - model.update(nonzero_instances) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/model.py --- a/sandbox/simple_autoassociator/model.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -""" -The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason -Weston's sampling trick (2008). -""" - -from graph import trainfn -import parameters - -import numpy -import random - -import pylearn.sparse_instance - -class Model: - """ - @todo: Add momentum. - @todo: Add learning rate decay schedule. - """ - def __init__(self, input_dimension, hidden_dimension, learning_rate = 0.1, weight_decay = 0.0002, random_seed = 666): - self.input_dimension = input_dimension - self.hidden_dimension = hidden_dimension - self.learning_rate = learning_rate - self.weight_decay = weight_decay - self.random_seed = random_seed - - random.seed(random_seed) - - self.parameters = parameters.Parameters(input_dimension=self.input_dimension, hidden_dimension=self.hidden_dimension, randomly_initialize=True, random_seed=self.random_seed) - - def deterministic_reconstruction(self, x): - (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) - return y - - def update(self, instances): - """ - Update the L{Model} using one training instance. - @param instances: A list of dict from feature index to (non-zero) value. - @todo: Should assert that nonzero_indices and zero_indices - are correct (i.e. are truly nonzero/zero). - @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as done in Semantic Hashing? - @todo: Decay the biases too? 
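graph.py above builds this model symbolically with Theano; for reference, the same forward pass and summed binary cross-entropy written in plain numpy (toy shapes, illustrative only):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(0)
x = rng.rand(5, 10)                              # minibatch x input_dimension
w1 = 0.1 * rng.randn(10, 4)
b1 = numpy.zeros(4)
w2 = 0.1 * rng.randn(4, 10)
b2 = numpy.zeros(10)

h = sigmoid(numpy.dot(x, w1) + b1)               # hidden code
y = sigmoid(numpy.dot(h, w2) + b2)               # reconstruction
loss = -numpy.sum(x * numpy.log(y) + (1 - x) * numpy.log(1 - y))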
- """ - minibatch = len(instances) - x = pylearn.sparse_instance.to_vector(instances, self.input_dimension) - - (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) -# print -# print "instance:", instance -# print "x:", x -# print "OLD y:", y - print "OLD total loss:", loss -# print "gw1:", gw1 -# print "gb1:", gb1 -# print "gw2:", gw2 -# print "gb2:", gb2 - - self.parameters.w1 *= (1 - self.weight_decay) - self.parameters.w2 *= (1 - self.weight_decay) - - # SGD update - self.parameters.w1 -= self.learning_rate * gw1 / minibatch - self.parameters.b1 -= self.learning_rate * gb1 / minibatch - self.parameters.w2 -= self.learning_rate * gw2 / minibatch - self.parameters.b2 -= self.learning_rate * gb2 / minibatch - -# # Recompute the loss, to make sure it's descreasing -# (y, h, loss, gw1, gb1, gw2, gb2) = trainfn(x, self.parameters.w1, self.parameters.b1, self.parameters.w2, self.parameters.b2) -## print "NEW y:", y -# print "NEW total loss:", loss -## print "h:", h -## print self.parameters diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/simple_autoassociator/parameters.py --- a/sandbox/simple_autoassociator/parameters.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -""" -Parameters (weights) used by the L{Model}. -""" - -import numpy - -class Parameters: - """ - Parameters used by the L{Model}. - """ - def __init__(self, input_dimension, hidden_dimension, randomly_initialize, random_seed): - """ - Initialize L{Model} parameters. - @param randomly_initialize: If True, then randomly initialize - according to the given seed. If False, then just use zeroes. - """ - if randomly_initialize: - numpy.random.seed(random_seed) - self.w1 = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension - self.w2 = (numpy.random.rand(hidden_dimension, input_dimension)-0.5)/hidden_dimension - self.b1 = numpy.zeros(hidden_dimension) - self.b2 = numpy.zeros(input_dimension) - #self.b2 = numpy.array([10, 0, 0, -10]) - else: - self.w1 = numpy.zeros((input_dimension, hidden_dimension)) - self.w2 = numpy.zeros((hidden_dimension, input_dimension)) - self.b1 = numpy.zeros(hidden_dimension) - self.b2 = numpy.zeros(input_dimension) - - def __str__(self): - s = "" - s += "w1: %s\n" % self.w1 - s += "b1: %s\n" % self.b1 - s += "w2: %s\n" % self.w2 - s += "b2: %s\n" % self.b2 - return s diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/README.txt --- a/sandbox/sparse_random_autoassociator/README.txt Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -Since simple_aa doesn't work, this probably doesn't either. diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/__init__.py diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/globals.py --- a/sandbox/sparse_random_autoassociator/globals.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -""" -Global variables. 
-""" - -INPUT_DIMENSION = 1000 -HIDDEN_DIMENSION = 20 -LEARNING_RATE = 0.1 -LR = LEARNING_RATE -SEED = 666 -ZERO_SAMPLE_SIZE = 50 -#ZERO_SAMPLE_SIZE = 250 -MARGIN = 0.25 -#MARGIN = 0.0 diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/graph.py --- a/sandbox/sparse_random_autoassociator/graph.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -""" -Theano graph for an autoassociator for sparse inputs, which will be trained -using Ronan Collobert + Jason Weston's sampling trick (2008). -@todo: Make nearly everything private. -""" - -from globals import MARGIN - -from pylearn.nnet_ops import sigmoid, binary_crossentropy -from theano import tensor as t -from theano.tensor import dot -xnonzero = t.dvector() -w1nonzero = t.dmatrix() -b1 = t.dvector() -w2nonzero = t.dmatrix() -w2zero = t.dmatrix() -b2nonzero = t.dvector() -b2zero = t.dvector() -h = sigmoid(dot(xnonzero, w1nonzero) + b1) -ynonzero = sigmoid(dot(h, w2nonzero) + b2nonzero) -yzero = sigmoid(dot(h, w2zero) + b2zero) - -# May want to weight loss wrt nonzero value? e.g. MARGIN violation for -# 0.1 nonzero is not as bad as MARGIN violation for 0.2 nonzero. -def hingeloss(MARGIN): - return -MARGIN * (MARGIN < 0) -nonzeroloss = hingeloss(ynonzero - t.max(yzero) - MARGIN) -zeroloss = hingeloss(-t.max(-(ynonzero)) - yzero - MARGIN) -# xnonzero sensitive loss: -#nonzeroloss = hingeloss(ynonzero - t.max(yzero) - MARGIN - xnonzero) -#zeroloss = hingeloss(-t.max(-(ynonzero - xnonzero)) - yzero - MARGIN) -loss = t.sum(nonzeroloss) + t.sum(zeroloss) - -#loss = t.sum(binary_crossentropy(ynonzero, xnonzero)) + t.sum(binary_crossentropy(yzero, t.constant(0))) - -(gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = t.grad(loss, [w1nonzero, b1, w2nonzero, w2zero, b2nonzero, b2zero]) - -import theano.compile - -inputs = [xnonzero, w1nonzero, b1, w2nonzero, w2zero, b2nonzero, b2zero] -outputs = [ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero] -trainfn = theano.compile.function(inputs, outputs) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/main.py --- a/sandbox/sparse_random_autoassociator/main.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -#!/usr/bin/python -""" - An autoassociator for sparse inputs, using Ronan Collobert + Jason - Weston's sampling trick (2008). - - The learned model is:: - h = sigmoid(dot(x, w1) + b1) - y = sigmoid(dot(h, w2) + b2) - - We assume that most of the inputs are zero, and hence that - we can separate x into xnonzero, x's nonzero components, and - xzero, a sample of the zeros. We sample---randomly without - replacement---ZERO_SAMPLE_SIZE zero columns from x. - - The desideratum is that every nonzero entry is separated from every - zero entry by margin at least MARGIN. - For each ynonzero, we want it to exceed max(yzero) by at least MARGIN. - For each yzero, we want it to be exceed by min(ynonzero) by at least MARGIN. - The loss is a hinge loss (linear). The loss is irrespective of the - xnonzero magnitude (this may be a limitation). Hence, all nonzeroes - are equally important to exceed the maximum yzero. - - (Alternately, there is a commented out binary xent loss.) - - LIMITATIONS: - - Only does pure stochastic gradient (batchsize = 1). - - Loss is irrespective of the xnonzero magnitude. - - We will always use all nonzero entries, even if the training - instance is very non-sparse. 
-""" - - -import numpy - -nonzero_instances = [] -nonzero_instances.append({1: 0.1, 5: 0.5, 9: 1}) -nonzero_instances.append({2: 0.3, 5: 0.5, 8: 0.8}) -nonzero_instances.append({1: 0.2, 2: 0.3, 5: 0.5}) - -import model -model = model.Model() - -for i in xrange(100000): - # Select an instance - instance = nonzero_instances[i % len(nonzero_instances)] - - # SGD update over instance - model.update(instance) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/model.py --- a/sandbox/sparse_random_autoassociator/model.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -""" -The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason -Weston's sampling trick (2008). -""" - -from graph import trainfn -import parameters - -import globals -from globals import LR - -import numpy -import random -random.seed(globals.SEED) - -def _select_indices(instance): - """ - Choose nonzero and zero indices (feature columns) of the instance. - We select B{all} nonzero indices. - We select L{globals.ZERO_SAMPLE_SIZE} zero indices randomly, - without replacement. - @bug: If there are not ZERO_SAMPLE_SIZE zeroes, we will enter - an endless loop. - @return: (nonzero_indices, zero_indices) - """ - # Get the nonzero indices - nonzero_indices = instance.keys() - nonzero_indices.sort() - - # Get the zero indices - # @bug: If there are not ZERO_SAMPLE_SIZE zeroes, we will enter an endless loop. - zero_indices = [] - while len(zero_indices) < globals.ZERO_SAMPLE_SIZE: - idx = random.randint(0, globals.INPUT_DIMENSION - 1) - if idx in nonzero_indices or idx in zero_indices: continue - zero_indices.append(idx) - zero_indices.sort() - - return (nonzero_indices, zero_indices) - -class Model: - def __init__(self): - self.parameters = parameters.Parameters(randomly_initialize=True) - - def update(self, instance): - """ - Update the L{Model} using one training instance. - @param instance: A dict from feature index to (non-zero) value. - @todo: Should assert that nonzero_indices and zero_indices - are correct (i.e. are truly nonzero/zero). - """ - (nonzero_indices, zero_indices) = _select_indices(instance) - # No update if there aren't any non-zeros. 
- if len(nonzero_indices) == 0: return - xnonzero = numpy.asarray([instance[idx] for idx in nonzero_indices]) - print - print "xnonzero:", xnonzero - - (ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = trainfn(xnonzero, self.parameters.w1[nonzero_indices, :], self.parameters.b1, self.parameters.w2[:, nonzero_indices], self.parameters.w2[:, zero_indices], self.parameters.b2[nonzero_indices], self.parameters.b2[zero_indices]) - print "OLD ynonzero:", ynonzero - print "OLD yzero:", yzero - print "OLD total loss:", loss - - # SGD update - self.parameters.w1[nonzero_indices, :] -= LR * gw1nonzero - self.parameters.b1 -= LR * gb1 - self.parameters.w2[:, nonzero_indices] -= LR * gw2nonzero - self.parameters.w2[:, zero_indices] -= LR * gw2zero - self.parameters.b2[nonzero_indices] -= LR * gb2nonzero - self.parameters.b2[zero_indices] -= LR * gb2zero - - # Recompute the loss, to make sure it's descreasing - (ynonzero, yzero, loss, gw1nonzero, gb1, gw2nonzero, gw2zero, gb2nonzero, gb2zero) = trainfn(xnonzero, self.parameters.w1[nonzero_indices, :], self.parameters.b1, self.parameters.w2[:, nonzero_indices], self.parameters.w2[:, zero_indices], self.parameters.b2[nonzero_indices], self.parameters.b2[zero_indices]) - print "NEW ynonzero:", ynonzero - print "NEW yzero:", yzero - print "NEW total loss:", loss diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/sparse_random_autoassociator/parameters.py --- a/sandbox/sparse_random_autoassociator/parameters.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -""" -Parameters (weights) used by the L{Model}. -""" - -import numpy -import globals - -class Parameters: - """ - Parameters used by the L{Model}. - """ - def __init__(self, input_dimension=globals.INPUT_DIMENSION, hidden_dimension=globals.HIDDEN_DIMENSION, randomly_initialize=False, seed=globals.SEED): - """ - Initialize L{Model} parameters. - @param randomly_initialize: If True, then randomly initialize - according to the given seed. If False, then just use zeroes. 
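The update above only writes into the rows and columns of w1, w2 and b2 selected by nonzero_indices and the sampled zero_indices; a stripped-down numpy sketch of that fancy-indexing pattern, with hypothetical gradients standing in for trainfn's outputs:

import numpy

rng = numpy.random.RandomState(0)
INPUT_DIMENSION, HIDDEN_DIMENSION, LR = 1000, 20, 0.1
w1 = numpy.zeros((INPUT_DIMENSION, HIDDEN_DIMENSION))
w2 = numpy.zeros((HIDDEN_DIMENSION, INPUT_DIMENSION))

nonzero_indices = [1, 5, 9]                      # every nonzero column of the instance
zero_indices = sorted(rng.permutation(INPUT_DIMENSION)[:50].tolist())
                                                 # random zero columns (the real code also skips nonzero ones)

gw1nonzero = rng.randn(len(nonzero_indices), HIDDEN_DIMENSION)   # stand-in gradients
gw2nonzero = rng.randn(HIDDEN_DIMENSION, len(nonzero_indices))
gw2zero    = rng.randn(HIDDEN_DIMENSION, len(zero_indices))

w1[nonzero_indices, :] -= LR * gw1nonzero        # only the selected slices are touched
w2[:, nonzero_indices] -= LR * gw2nonzero
w2[:, zero_indices]    -= LR * gw2zero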
- """ - if randomly_initialize: - numpy.random.seed(seed) - self.w1 = (numpy.random.rand(input_dimension, hidden_dimension)-0.5)/input_dimension - self.w2 = (numpy.random.rand(hidden_dimension, input_dimension)-0.5)/hidden_dimension - self.b1 = numpy.zeros(hidden_dimension) - self.b2 = numpy.zeros(input_dimension) - else: - self.w1 = numpy.zeros((input_dimension, hidden_dimension)) - self.w2 = numpy.zeros((hidden_dimension, input_dimension)) - self.b1 = numpy.zeros(hidden_dimension) - self.b2 = numpy.zeros(input_dimension) diff -r 27b1344a57b1 -r 8fff4bc26f4c sandbox/statscollector.py --- a/sandbox/statscollector.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - -# Here is how I see stats collectors: - -def my_stats(graph): - graph.mse=examplewise_mean(square_norm(graph.residue)) - graph.training_loss=graph.regularizer+examplewise_sum(graph.nll) - return [graph.mse,graph.training_loss] - - -# def my_stats(residue,nll,regularizer): -# mse=examplewise_mean(square_norm(residue)) -# training_loss=regularizer+examplewise_sum(nll) -# set_names(locals()) -# return ((residue,nll),(regularizer),(),(mse,training_loss)) -# my_stats_collector = make_stats_collector(my_stats) -# -# where make_stats_collector calls my_stats(examplewise_fields, attributes) to -# construct its update function, and figure out what are the input fields (here "residue" -# and "nll") and input attributes (here "regularizer") it needs, and the output -# attributes that it computes (here "mse" and "training_loss"). Remember that -# fields are examplewise quantities, but attributes are not, in my jargon. -# In the above example, I am highlighting that some operations done in my_stats -# are examplewise and some are not. I am hoping that theano Ops can do these -# kinds of internal side-effect operations (and proper initialization of these hidden -# variables). I expect that a StatsCollector (returned by make_stats_collector) -# knows the following methods: -# stats_collector.input_fieldnames -# stats_collector.input_attribute_names -# stats_collector.output_attribute_names -# stats_collector.update(mini_dataset) -# stats_collector['mse'] -# where mini_dataset has the input_fieldnames() as fields and the input_attribute_names() -# as attributes, and in the resulting dataset the output_attribute_names() are set to the -# proper numeric values. 
- - - -import theano -from theano import tensor as t -from Learner import Learner -from lookup_list import LookupList - -class StatsCollectorModel(AttributesHolder): - def __init__(self,stats_collector): - self.stats_collector = stats_collector - self.outputs = LookupList(stats_collector.output_names,[None for name in stats_collector.output_names]) - # the statistics get initialized here - self.update_function = theano.function(input_attributes+input_fields,output_attributes+output_fields,linker="c|py") - for name,value in self.outputs.items(): - self.__setattribute__(name,value) - def update(self,dataset): - input_fields = dataset.fields()(self.stats_collector.input_field_names) - input_attributes = dataset.getAttributes(self.stats_collector.input_attribute_names) - self.outputs._values = self.update_function(input_attributes+input_fields) - for name,value in self.outputs.items(): - self.__setattribute__(name,value) - def __call__(self): - return self.outputs - def attributeNames(self): - return self.outputs.keys() - -class StatsCollector(AttributesHolder): - - def __init__(self,input_attributes, input_fields, outputs): - self.input_attributes = input_attributes - self.input_fields = input_fields - self.outputs = outputs - self.input_attribute_names = [v.name for v in input_attributes] - self.input_field_names = [v.name for v in input_fields] - self.output_names = [v.name for v in output_attributes] - - def __call__(self,dataset=None): - model = StatsCollectorModel(self) - if dataset: - self.update(dataset) - return model - -if __name__ == '__main__': - def my_statscollector(): - regularizer = t.scalar() - nll = t.matrix() - class_error = t.matrix() - total_loss = regularizer+t.examplewise_sum(nll) - avg_nll = t.examplewise_mean(nll) - avg_class_error = t.examplewise_mean(class_error) - for name,val in locals().items(): val.name = name - return StatsCollector([regularizer],[nll,class_error],[total_loss,avg_nll,avg_class_error]) - - - - -# OLD DESIGN: -# -# class StatsCollector(object): -# """A StatsCollector object is used to record performance statistics during training -# or testing of a learner. It can be configured to measure different things and -# accumulate the appropriate statistics. From these statistics it can be interrogated -# to obtain performance measures of interest (such as maxima, minima, mean, standard -# deviation, standard error, etc.). Optionally, the observations can be weighted -# (yielded weighted mean, weighted variance, etc., where applicable). The statistics -# that are desired can be specified among a list supported by the StatsCollector -# class or subclass. When some statistics are requested, others become automatically -# available (e.g., sum or mean).""" -# -# default_statistics = [mean,standard_deviation,min,max] -# -# __init__(self,n_quantities_observed, statistics=default_statistics): -# self.n_quantities_observed=n_quantities_observed -# -# clear(self): -# raise NotImplementedError -# -# update(self,observations): -# """The observations is a numpy vector of length n_quantities_observed. Some -# entries can be 'missing' (with a NaN entry) and will not be counted in the -# statistics.""" -# raise NotImplementedError -# -# __getattr__(self, statistic) -# """Return a particular statistic, which may be inferred from the collected statistics. 
-# The argument is a string naming that statistic.""" - - - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c setup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/setup.py Mon Mar 30 20:48:04 2009 -0400 @@ -0,0 +1,14 @@ +#!/bin/env python + +from ez_setup import use_setuptools +use_setuptools() +from setuptools import setup, find_packages, Extension, Library +setup(name="Pylearn", + version="0.1", + description="Pylearn", + long_description="""Machine learning toolkit""", + author="LISA", + author_email="pylearn-dev@googlegroups.com", + packages=find_packages(exclude='tests'), +) + diff -r 27b1344a57b1 -r 8fff4bc26f4c sparse_instance.py --- a/sparse_instance.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -""" -Sparse instances. -Each instance is represented as dict with key dimension. -Dimensions not present in the dict have value 0. -""" - -from numpy import zeros - -def to_vector(instances, dimensions): - """ - Convert sparse instances to vectors. - @type instances: list of sparse instances - @param dimensions: The number of dimensions in each instance. - @rtype: numpy matrix (instances x dimensions) - @todo: Allow this function to convert SINGLE instances (not lists). - """ - v = zeros((len(instances), dimensions)) - l = len(instances) - for i in range(l): - for idx in instances[i].keys(): - v[i][idx] = instances[i][idx] - return v diff -r 27b1344a57b1 -r 8fff4bc26f4c squashfn.py --- a/squashfn.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ - -def squashfn(str): - if str == "sigmoid": - import theano.tensor.nnet as nnet - return nnet.sigmoid - elif str == "tanh": - import theano.tensor as t - return t.tanh - elif str == "softsign": - from theano.sandbox.softsign import softsign - return softsign - else: assert 0 - - diff -r 27b1344a57b1 -r 8fff4bc26f4c stat_ops.py --- a/stat_ops.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - -import theano -from theano import gof -from theano import tensor -import numpy - - -class ExampleWiseMean(gof.Op): - - def __init__(self): - self.destroy_map = {0: [1, 2]} - - def make_node(self, x): - return gof.Apply(self, - [x, tensor.value(float('nan')), tensor.value(0)], - [tensor.Tensor(dtype = 'float64', - broadcastable = x.type.broadcastable)()]) - - def perform(self, node, (x, sum, n), (out,)): - if numpy.isnan(sum).any(): - sum.resize(x.shape, refcheck=0) - sum[:] = x - else: - sum += x - n += 1 - out[0] = sum / n - - def c_code(self, name, node, (x, sum, n), (out, ), sub): - return """ - PyObject* multi; - int nelems; - if (isnan(((double*)(%(sum)s->data))[0])) { - PyArray_Dims dims; - dims.len = %(x)s->nd; - dims.ptr = %(x)s->dimensions; - PyArray_Resize(%(sum)s, &dims, 0, PyArray_CORDER); - multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s); - nelems = PyArray_SIZE(%(sum)s); - while (nelems--) { - // Copy %(x)s in %(sum)s - *(double*)PyArray_MultiIter_DATA(multi, 0) = *(double*)PyArray_MultiIter_DATA(multi, 1); - PyArray_MultiIter_NEXT(multi); - } - } - else { - // Add some error checking on the size of x - multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s); - nelems = PyArray_SIZE(%(sum)s); - while (nelems--) { - // Add %(x)s to %(sum)s - *(double*)PyArray_MultiIter_DATA(multi, 0) += *(double*)PyArray_MultiIter_DATA(multi, 1); - PyArray_MultiIter_NEXT(multi); - } - } - ((npy_int64*)(%(n)s->data))[0]++; - int n = ((npy_int64*)(%(n)s->data))[0]; - if (%(out)s == NULL) { - %(out)s = 
(PyArrayObject*)PyArray_EMPTY(%(sum)s->nd, %(sum)s->dimensions, NPY_FLOAT64, 0); - } - multi = PyArray_MultiIterNew(2, %(sum)s, %(out)s); - nelems = PyArray_SIZE(%(sum)s); - while (nelems--) { - // %(out)s <- %(sum)s / %(n)s - *(double*)PyArray_MultiIter_DATA(multi, 1) = *(double*)PyArray_MultiIter_DATA(multi, 0) / n; - PyArray_MultiIter_NEXT(multi); - } - """ % dict(locals(), **sub) - - - -if __name__ == '__main__': - - vectors = numpy.random.RandomState(666).rand(10, 2) - - x = tensor.dvector() - e = ExampleWiseMean()(x) - - # f = theano.function([x], [e], linker = 'py') - - # for i, v in enumerate(vectors): - # print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0) - - # print - - f = theano.function([x], [e], linker = 'c|py') - - for i, v in enumerate(vectors): - print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0) - - - - diff -r 27b1344a57b1 -r 8fff4bc26f4c stopper.py --- a/stopper.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ -"""Early stopping iterators - -The idea here is to supply early-stopping heuristics that can be used in the -form: - - stopper = SomeEarlyStopper() - - for i in stopper(): - # train from data - if i.set_score: - i.score = validation_score - - -So far I only have one heuristic, so maybe this won't scale. -""" - -class Stopper(object): - - def train(self, data, update_rows_fn, update, validate, save=None): - """Return the best model trained on data - - Parameters: - data - a thing that accepts getitem(), or a tuple of such things - update_rows_fn - fn : int --> - update - fn: update an internal model from elements of data - validate - fn: evaluate an internal model based on elements of data - save - fn: return a copy of the internal model - - The body of this function exhausts the iterator, and trains a - model using early stopping in the process. - """ - - best = None - for stp in self: - i = stp.iter - - # call update on some training set rows - t_rows = update_rows_fn(i) - if isinstance(data, (tuple, list)): - update(*[d[t_rows] for d in data]) - else: - update(data[t_rows]) - - if stp.set_score: - stp.score = validate() - if (stp.score < stp.best_score) and save: - best = save() - return best - - def find_min(self, step, check, save): - best = None - for stp in self: - step() - if stp.set_score: - stp.score = check() - if (stp.score < stp.best_score) and save: - best = (save(), stp.iter, stp.score) - return best - - - -class ICML08Stopper(Stopper): - @staticmethod - def icml08(ntrain, batchsize): - """Some setting similar to what I used for ICML08 submission""" - #TODO: what did I actually use? put that in here. 
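The early-stopping rule that ICML08Stopper.next() implements below can be summarized as: keep iterating while still inside the initial wait or within patience times the best-scoring iteration so far, score the validation set every set_score_interval steps, and record a new best only when the score improves by at least the min_improvement factor. A plain-Python illustration with invented numbers:

initial_wait, v_int, min_improvement, patience, hard_limit = 10, 2, 0.96, 2.0, 1000

def toy_valid_score(i):                          # a made-up validation curve
    return 1.0 / (1 + i) if i < 30 else 0.04

best_score, best_iter, it = float('inf'), -1, 0
while (it < initial_wait or it < patience * best_iter) and it < hard_limit:
    if it % v_int == 0:
        score = toy_valid_score(it)
        if score < best_score * min_improvement:
            best_score, best_iter = score, it
    it += 1
# the loop leaves off near patience * best_iter, i.e. roughly twice the
# iteration of the last sufficiently large improvement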
- return ICML08Stopper(30*ntrain/batchsize, - ntrain/batchsize, 0.96, 2.0, 100000000) - - def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit): - self.initial_wait = i_wait - self.set_score_interval = v_int - self.min_improvement = min_improvement - self.patience = patience - self.hard_limit = hard_limit - - self.best_score = float('inf') - self.best_iter = -1 - self.iter = -1 - - self.set_score = False - self.score = None - - def __iter__(self): - return self - - E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score' - def next(self): - - #print 'ICML08 stopper, were doing a next' - - if self.set_score: #left over from last time - if self.score is None: - raise Exception(ICML08Stopper.E_set_score) - if self.score < (self.best_score * self.min_improvement): - (self.best_score, self.best_iter) = (self.score, self.iter) - self.score = None #un-set it - - - starting = self.iter < self.initial_wait - waiting = self.iter < (self.patience * self.best_iter) - if starting or waiting: - # continue to iterate - self.iter += 1 - if self.iter == self.hard_limit: - raise StopIteration - self.set_score = (self.iter % self.set_score_interval == 0) - return self - - raise StopIteration - - -class NStages(ICML08Stopper): - """Run for a fixed number of steps, checking validation set every so - often.""" - def __init__(self, hard_limit, v_int): - ICML08Stopper.__init__(self, hard_limit, v_int, 1.0, 1.0, hard_limit) - - #TODO: could optimize next() function. Most of what's in ICML08Stopper.next() - #is not necessary - - diff -r 27b1344a57b1 -r 8fff4bc26f4c test_speed.py --- a/test_speed.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -import numpy -from dataset import * -from misc import * -def test_speed(array, ds): - print "test_speed", ds.__class__ - - mat = numpy.random.rand(400,100) - - @print_timing - def f_array_full(a): - a+1 - @print_timing - def f_array_index(a): - for id in range(a.shape[0]): -# pass - a[id]+1 -# a[id]*mat - @print_timing - def f_array_iter(a): - for r in a: -# pass - r+1 -# r*mat - @print_timing - def f_ds_index(ds): - for id in range(len(ds)): -# pass - ds[id][0]+1 -# ds[id][0]*mat - @print_timing - def f_ds_iter(ds): - for ex in ds: -# pass - ex[0]+1 -# a[0]*mat - @print_timing - def f_ds_mb1(ds,mb_size): - for exs in ds.minibatches(minibatch_size = mb_size): - for ex in exs: -# pass - ex[0]+1 -# ex[0]*mat - @print_timing - def f_ds_mb2(ds,mb_size): - for exs in ds.minibatches(minibatch_size = mb_size): -# pass - exs[0]+1 -# ex[0]*mat - - f_array_full(array) - f_array_index(array) - f_array_iter(array) - - f_ds_index(ds) - f_ds_iter(ds) - - f_ds_mb1(ds,10) - f_ds_mb1(ds,100) - f_ds_mb1(ds,1000) - f_ds_mb1(ds,10000) - f_ds_mb2(ds,10) - f_ds_mb2(ds,100) - f_ds_mb2(ds,1000) - f_ds_mb2(ds,10000) - -if __name__=='__main__': - a2 = numpy.random.rand(100000,400) - ds1 = ArrayDataSet(a2,{'all':slice(0,a2.shape[1],1)}) - test_speed(a2,ds1) - a1 = numpy.random.rand(100000,40) - ds4 = ArrayDataSet(a1,LookupList(["f"+str(x)for x in range(a1.shape[1])], - range(a1.shape[1]))) - test_speed(a2,ds4) - ds2=CachedDataSet(ds1,cache_all_upon_construction=False) - test_speed(a2,ds2) - ds3=CachedDataSet(ds1,cache_all_upon_construction=True) - test_speed(a2,ds3) - del a2,ds1,ds2,ds3 - diff -r 27b1344a57b1 -r 8fff4bc26f4c version.py --- a/version.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,292 +0,0 @@ -import subprocess as _subprocess -import imp as _imp -import 
sys -import os - - -_cache = dict() - -def src_version(module_name): - """Return compact identifier of module code. - - @return: compact identifier of module code. - @rtype: string - - @note: This function tries to establish that the source files and the repo - are syncronized. It raises an Exception if there are un-tracked '.py' - files, or if there are un-committed modifications. This implementation uses - "hg id" to establish this. The code returned by "hg id" is not affected by - hg pull, but pulling might remove the " tip" string which might have - appeared. This implementation ignores the " tip" information, and only - uses the code. - - @note: This implementation is assumes that the import directory is under - version control by mercurial. - - """ - - if module_name not in _cache: - - try : - location = _imp.find_module(module_name)[1] - except ImportError: - _cache[module_name] = None - return None - #print 'location:', location - isdir = False - if os.path.isdir(location) : - isdir = True - elif os.path.isfile(location) : - isdir = False - else : - # SEEMS THIS CASE EXIST, FOR WEIRD BUILTIN FUNCTIONS - #print location,": it's 'not a dir, it's not a file, it's superman!" - #raise Exception('Unknown location or file type') - _cache[module_name] = None - return None - - - # we're dealing with a dir - if isdir : - - # under hg? - if not os.path.exists( os.path.join( location , '.hg') ) : - _cache[module_name] = None - return None - - status = _subprocess.Popen(('hg','st'),cwd=location,stdout=_subprocess.PIPE).communicate()[0] - #print 'status =', status - #TODO: check that the process return code is 0 (ticket #45) - - #status_codes = [line[0] for line in if line and line[0] != '?'] - for line in status.split('\n'): - if not line: continue - if line[0] != '?': - raise Exception('Uncommitted modification to "%s" in %s (%s)' - %(line[2:], __name__,location)) - if line[0] == '?' and line[-3:] == '.py': - raise Exception('Untracked file "%s" in %s (%s)' - %(line[2:], __name__, location)) - - hg_id = _subprocess.Popen(('hg','id'),cwd=location,stdout=_subprocess.PIPE).communicate()[0] - - # This asserts my understanding of hg id return values - # There is mention in the doc that it might return two parent hash codes - # but I've never seen it, and I dont' know what it means or how it is - # formatted. - tokens = hg_id.split(' ') - assert len(tokens) <= 2 - assert len(tokens) >= 1 - assert tokens[0][-1] != '+' # the trailing + indicates uncommitted changes - if len(tokens) == 2: - assert tokens[1] == 'tip\n' - - _cache[module_name] = tokens[0] - - # we're dealing with a file - if not isdir : - - folder = os.path.split( os.path.abspath(location) )[0] - # under hg? - if not os.path.exists( os.path.join( folder , '.hg') ) : - _cache[module_name] = None - return None - - status = _subprocess.Popen(('hg','st',location),cwd=folder,stdout=_subprocess.PIPE).communicate()[0] - #print 'status =', status - - #status_codes = [line[0] for line in if line and line[0] != '?'] - for line in status.split('\n'): - if not line: continue - if line[0] != '?': - raise Exception('Uncommitted modification to "%s" in %s (%s)' - %(line[2:], location,folder)) - if line[0] == '?' 
and line[-3:] == '.py': - raise Exception('Untracked file "%s" in %s (%s)' - %(line[2:], location, folder)) - - hg_id = _subprocess.Popen(('hg','id'),cwd=folder,stdout=_subprocess.PIPE).communicate()[0] - - # This asserts my understanding of hg id return values - # There is mention in the doc that it might return two parent hash codes - # but I've never seen it, and I dont' know what it means or how it is - # formatted. - tokens = hg_id.split(' ') - assert len(tokens) <= 2 - assert len(tokens) >= 1 - if tokens[0][-1] == '+' : - tokens[0] = tokens[0][:-1] # the change was not on this file - if len(tokens) == 2: - assert tokens[1] == 'tip\n' - - _cache[module_name] = tokens[0] - - - return _cache[module_name] - -_unknown_version = 'unknown version' - -def hg_version(dirname, filenames=None): - """Return current changeset of directory I{dirname}. - - @type filename: list of str (or default: None) - @param filename: if specified, we ignore modifications to other files. - - @rtype: tuple (last changeset, modified) - - """ - if type(filenames) not in (list, tuple, type(None)): - raise TypeError(filenames) - - #may raise exception, for example if hg is not visible via PATH - status_proc = _subprocess.Popen(('hg','st'), cwd=dirname, - stdout=_subprocess.PIPE, stderr=_subprocess.PIPE) - status = status_proc.communicate()[0] #read stdout into buffer - if status_proc.returncode != 0: - raise OSError('hg returned %i, maybe %s is not under hg control?', - (status_proc.returncode, dirname)) - - #may raise exception, for example if hg is not visible via PATH - id_proc = _subprocess.Popen(('hg','id', '-i'), cwd=dirname, - stdout=_subprocess.PIPE, stderr=_subprocess.PIPE) - id_stdout = id_proc.communicate()[0] - if id_proc.returncode != 0: - raise OSError('hg returned %i, maybe %s is not under hg control?', - (id_proc.returncode, dirname)) - - care_about = (lambda some_file : True) if filenames is None \ - else (lambda some_file : some_file in filenames) - - # parse status codes for what we care about - care_about_mod = False - for line in status.split('\n'): - if not line: #empty lines happen - continue - line_file = line[2:] - if line[0] != '?' and care_about(line_file): - care_about_mod = True - #raise Exception('Uncommitted modification', - #os.path.join(dirname, line_file)) - if line[0] == '?' 
and line[-3:] == '.py': - print >> sys.stderr, 'WARNING: untracked file', os.path.join(dirname, line_file) - - # id_stdout is 12 hex digits followed by '+\n' or '\n' - # return the trailing '+' character only if there were changes to files that - # the caller cares about (named in filenames) - modified = (id_stdout[12] == '+') - assert len(id_stdout) in (13, 14) #sanity check - if modified and care_about_mod : - return id_stdout[:13] - else: - return id_stdout[:12] - -def _import_id_py_source(location): - try: - dirname = os.path.dirname(location[1]) - basename = os.path.basename(location[1]) - return hg_version(dirname, [basename]) - except OSError, e: - print >> sys.stderr, 'IGNORNING', e - return _unknown_version + ' PY_SOURCE' - -def _import_id_py_compiled(location): - #a .pyc file was found, but no corresponding .py - return _unknown_version + ' PYC_COMPILED' - -def _import_id_pkg_directory(location): - try: - return hg_version(location[1]) - except OSError, e: - print >> sys.stderr, 'IGNORNING', e - return _unknown_version + ' PKG_DIRECTORY' - -def _import_id(tag): - try : - location = _imp.find_module(tag) - except ImportError, e: #raise when tag is not found - return e #put this in the cache, import_id will raise it - - #the find_module was successful, location is valid - resource_type = location[2][2] - - if resource_type == _imp.PY_SOURCE: - return _import_id_py_source(location) - if resource_type == _imp.PY_COMPILED: - return _import_id_py_compiled(location) - if resource_type == _imp.C_EXTENSION: - raise NoteImplementedError - if resource_type == _imp.PY_RESOURCE: - raise NoteImplementedError - if resource_type == _imp.PKG_DIRECTORY: - return _import_id_pkg_directory(location) - if resource_type == _imp.C_BUILTIN: - raise NoteImplementedError - if resource_type == _imp.PY_FROZEN: - raise NoteImplementedError - - assert False #the list of resource types above should be exhaustive - -def import_id(tag): - """Return an identifier of the code imported by 'import '. - - @param tag: a module or file name - @type tag: string - - @rtype: string - @return: identifier of the code imported by 'import '. - - This high-level function might do different things depending on, for - example, whether I{tag} identifies a file or a directory, or whether the - named entity is under some sort of version/revision control. - - Versions are sought in the following order: - 0. If I{tag} is 'python' then sys.version will be returned - 1. If I{tag} names a file or folder under revision control, this function - will attempt to guess which one, and return a string that identifies the - running code (a revision id, not the whole file!) - 2. If I{tag} names a module with a __version__ attribute, then that - attribute will be returned as a string. - 3. The string starting with 'unknown version' will be returned for other valid modules. - 4. An exception will be raise for non-existent modules. - - @note: This function may import the named entity in order to return a - __version__ module attribute. - - """ - if tag not in import_id.cache: - import_id.cache[tag] = _import_id(tag) - - #in the case of bad module names, we cached the ImportError exception - rval = import_id.cache[tag] - if isinstance(rval, Exception): - raise rval - return rval -import_id.cache = {'python':sys.version} - -def get_all_src_versions() : - """ - Get the version of all loaded module. - Calls src_version on all loaded modules. These modules are found - using sys.modules. - - Returns a dictionnary: name->version. 
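A small usage sketch for the helpers above; the exact output depends on whether the package sits in a mercurial checkout (illustrative only):

import version

print version.import_id('python')                # special-cased: returns sys.version
print version.import_id('version')               # hg changeset id, or 'unknown version ...'
print len(version.get_all_src_versions())        # one entry per loaded module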
- - @RETURN dict Dictionnary (module's name) -> (version) - @SEE src_version - """ - allmodules = sys.modules - d = dict() - for m in allmodules : - try: - d[m] = import_id(m) - except: - pass - return d - - -if __name__ == "__main__" : - - if len(sys.argv) == 2 : - print 'testing on', sys.argv[1] - print import_id(sys.argv[1]) - diff -r 27b1344a57b1 -r 8fff4bc26f4c weights.py --- a/weights.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -""" -Routine to initialize weights. - -@note: We assume that numpy.random.seed() has already been performed. -""" - -from math import pow, sqrt -import numpy.random - -sqrt3 = sqrt(3.0) -def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5): - """ - Generate an initial weight matrix with nin inputs (rows) and nout - outputs (cols). - Each weight is chosen uniformly at random to be in range: - [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)] - @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3 - power=0.5 is strongly recommanded (see below). - - Suppose these weights w are used in dot products as follows: - output = w' input - If w ~ Uniform(-r,r) and Var[input_i]=1 and x_i's are independent, then - Var[w]=r2/3 - Var[output] = Var[ sum_{i=1}^d w_i input_i] = d r2 / 3 - To make sure that variance is not changed after the dot product, - we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice - corresponds to the default values scale_by=sqrt(3) and power=0.5. - More generally we see that Var[output] = Var[input] * scale_by. - - Now, if these are weights in a deep multi-layer neural network, - we would like the top layers to be initially more linear, so as to let - gradients flow back more easily (this is an explanation by Ronan Collobert). - To achieve this we want scale_by smaller than 1. - Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1 - in the experiment of his ICML'2008 paper. - Note that if we have a multi-layer network, ignoring the effect of the tanh non-linearity, - the variance of the layer outputs would go down roughly by a factor 'scale_by' at each - layer (making the layers more linear as we go up towards the output). - """ - return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin,power) diff -r 27b1344a57b1 -r 8fff4bc26f4c xlogx.py --- a/xlogx.py Thu Nov 20 06:38:06 2008 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ - -import theano -from theano import tensor, scalar -import numpy - -class XlogX(scalar.UnaryScalarOp): - """ - Compute X * log(X), with special case 0 log(0) = 0. - """ - @staticmethod - def st_impl(x): - if x == 0.0: - return 0.0 - return x * numpy.log(x) - def impl(self, x): - return XlogX.st_impl(x) - def grad(self, (x,), (gz,)): - return [gz * (1 + scalar.log(x))] - def c_code(self, node, name, (x,), (z,), sub): - if node.inputs[0].type in [scalar.float32, scalar.float64]: - return """%(z)s = - %(x)s == 0.0 - ? 0.0 - : %(x)s * log(%(x)s);""" % locals() - raise NotImplementedError('only floatingpoint is implemented') -scalar_xlogx = XlogX(scalar.upgrade_to_float, name='scalar_xlogx') -xlogx = tensor.Elemwise(scalar_xlogx, name='xlogx') -
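As a closing numerical check of the variance argument in random_weights above (illustrative, with invented sizes; the function is restated inline): with power=0.5 and scale_by=1, a dot product with unit-variance, independent inputs keeps roughly unit variance, while the function's actual default scale_by=1/sqrt(3) shrinks it:

import numpy
numpy.random.seed(0)

def random_weights(nin, nout, scale_by, power=0.5):
    sqrt3 = numpy.sqrt(3.0)
    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / nin ** power

nin, nout = 500, 200
x = numpy.random.randn(10000, nin)               # unit-variance, independent inputs

out = numpy.dot(x, random_weights(nin, nout, scale_by=1.0))
assert abs(out.var() - 1.0) < 0.1                # d * r^2 / 3 = 1 when scale_by = 1

out = numpy.dot(x, random_weights(nin, nout, scale_by=1.0 / numpy.sqrt(3.0)))
assert abs(out.var() - 1.0 / 3.0) < 0.1          # the default shrinks the output variance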