changeset 376:c9a89be5cb0a

Redesigning linear_regression
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 07 Jul 2008 10:08:35 -0400
parents 12ce29abf27d (current diff) 90a29489b5c8 (diff)
children 67c339260875
files linear_regression.py misc.py
diffstat 10 files changed, 682 insertions(+), 174 deletions(-)
--- a/_test_dataset.py	Mon Jun 16 17:47:36 2008 -0400
+++ b/_test_dataset.py	Mon Jul 07 10:08:35 2008 -0400
@@ -2,7 +2,7 @@
 from dataset import *
 from math import *
 import numpy, unittest, sys
-from misc import *
+#from misc import *
 from lookup_list import LookupList
 
 def have_raised(to_eval, **var):
@@ -134,12 +134,13 @@
 #     - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
     i=0
     mi=0
-    m=ds.minibatches(['x','z'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','z'], minibatch_size=size)
+    assert hasattr(m,'__iter__')
     for minibatch in m:
-        assert isinstance(minibatch,DataSetFields)
+        assert isinstance(minibatch,LookupList)
         assert len(minibatch)==2
-        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        test_minibatch_size(minibatch,size,len(ds),2,mi)
         if type(ds)==ArrayDataSet:
             assert (minibatch[0][:,::2]==minibatch[1]).all()
         else:
@@ -147,92 +148,103 @@
                 (minibatch[0][j][::2]==minibatch[1][j]).all()
         mi+=1
         i+=len(minibatch[0])
-    assert i==len(ds)
-    assert mi==4
-    del minibatch,i,m,mi
+    assert i==(len(ds)/size)*size
+    assert mi==(len(ds)/size)
+    del minibatch,i,m,mi,size
 
     i=0
     mi=0
-    m=ds.minibatches(['x','y'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','y'], minibatch_size=size)
+    assert hasattr(m,'__iter__')
     for minibatch in m:
+        assert isinstance(minibatch,LookupList)
         assert len(minibatch)==2
-        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        test_minibatch_size(minibatch,size,len(ds),2,mi)
         mi+=1
         for id in range(len(minibatch[0])):
             assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all()
             i+=1
-    assert i==len(ds)
-    assert mi==4
-    del minibatch,i,id,m,mi
+    assert i==(len(ds)/size)*size
+    assert mi==(len(ds)/size)
+    del minibatch,i,id,m,mi,size
 
 #     - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
     i=0
     mi=0
-    m=ds.minibatches(['x','z'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','z'], minibatch_size=size)
+    assert hasattr(m,'__iter__')
     for x,z in m:
-        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
-        test_minibatch_field_size(z,m.minibatch_size,len(ds),mi)
+        test_minibatch_field_size(x,size,len(ds),mi)
+        test_minibatch_field_size(z,size,len(ds),mi)
         for id in range(len(x)):
             assert (x[id][::2]==z[id]).all()
             i+=1
         mi+=1
-    assert i==len(ds)
-    assert mi==4
-    del x,z,i,m,mi
+    assert i==(len(ds)/size)*size
+    assert mi==(len(ds)/size)
+    del x,z,i,m,mi,size
+
     i=0
     mi=0
+    size=3
     m=ds.minibatches(['x','y'], minibatch_size=3)
+    assert hasattr(m,'__iter__')
     for x,y in m:
-        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
-        test_minibatch_field_size(y,m.minibatch_size,len(ds),mi)
+        assert len(x)==size
+        assert len(y)==size
+        test_minibatch_field_size(x,size,len(ds),mi)
+        test_minibatch_field_size(y,size,len(ds),mi)
         mi+=1
         for id in range(len(x)):
             assert (numpy.append(x[id],y[id])==array[i]).all()
             i+=1
-    assert i==len(ds)
-    assert mi==4
-    del x,y,i,id,m,mi
+    assert i==(len(ds)/size)*size
+    assert mi==(len(ds)/size)
+    del x,y,i,id,m,mi,size
 
 #not in doc
     i=0
-    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=size,offset=4)
+    assert hasattr(m,'__iter__')
     for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
+        assert len(x)==size
+        assert len(y)==size
+        for id in range(size):
             assert (numpy.append(x[id],y[id])==array[i+4]).all()
             i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id,m
+    assert i==size
+    del x,y,i,id,m,size
 
     i=0
-    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=size,offset=4)
+    assert hasattr(m,'__iter__')
     for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
+        assert len(x)==size
+        assert len(y)==size
+        for id in range(size):
             assert (numpy.append(x[id],y[id])==array[i+4]).all()
             i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id,m
+    assert i==2*size
+    del x,y,i,id,m,size
 
     i=0
-    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    size=3
+    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=size,offset=4)
+    assert hasattr(m,'__iter__')
     for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
+        assert len(x)==size
+        assert len(y)==size
+        for id in range(size):
             assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()
             i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id
+    assert i==2*size # should not wrap
+    del x,y,i,id,size
 
-    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
+    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
     assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)
 
 def test_ds_iterator(array,iterator1,iterator2,iterator3):
@@ -262,14 +274,17 @@
 def test_getitem(array,ds):
     def test_ds(orig,ds,index):
         i=0
-        assert len(ds)==len(index)
-        for x,z,y in ds('x','z','y'):
-            assert (orig[index[i]]['x']==array[index[i]][:3]).all()
-            assert (orig[index[i]]['x']==x).all()
-            assert orig[index[i]]['y']==array[index[i]][3]
-            assert (orig[index[i]]['y']==y).all() # why does it crash sometimes?
-            assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all()
-            assert (orig[index[i]]['z']==z).all()
+        assert isinstance(ds,LookupList)
+        assert len(ds)==3
+        assert len(ds[0])==len(index)
+#        for x,z,y in ds('x','z','y'):
+        for idx in index:
+            assert (orig[idx]['x']==array[idx][:3]).all()
+            assert (orig[idx]['x']==ds['x'][i]).all()
+            assert orig[idx]['y']==array[idx][3]
+            assert (orig[idx]['y']==ds['y'][i]).all() # why does it crash sometimes?
+            assert (orig[idx]['z']==array[idx][0:3:2]).all()
+            assert (orig[idx]['z']==ds['z'][i]).all()
             i+=1
         del i
         ds[0]
@@ -282,19 +297,22 @@
         for x in ds:
             pass
 
-#ds[:n] returns a dataset with the n first examples.
+#ds[:n] returns a LookupList with the n first examples.
     ds2=ds[:3]
-    assert isinstance(ds2,LookupList)
     test_ds(ds,ds2,index=[0,1,2])
     del ds2
 
-#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s.
-    ds2=ds.subset[1:7:2]
-    assert isinstance(ds2,DataSet)
+#ds[i:j] returns a LookupList with examples i,i+1,...,j-1.
+    ds2=ds[1:3]
+    test_ds(ds,ds2,index=[1,2])
+    del ds2
+
+#ds[i1:i2:s] returns a LookupList with the examples i1,i1+s,...i2-s.
+    ds2=ds[1:7:2]
     test_ds(ds,ds2,[1,3,5])
     del ds2
 
-#ds[i]
+#ds[i] returns the (i+1)-th example of the dataset.
     ds2=ds[5]
     assert isinstance(ds2,Example)
     assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
@@ -302,8 +320,8 @@
     del ds2
 
 #ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in.
-    ds2=ds.subset[[4,7,2,8]]
-    assert isinstance(ds2,DataSet)
+    ds2=ds[[4,7,2,8]]
+#    assert isinstance(ds2,DataSet)
     test_ds(ds,ds2,[4,7,2,8])
     del ds2
 
@@ -326,6 +344,71 @@
     #        del i,example
     #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
 
+def test_subset(array,ds):
+    def test_ds(orig,ds,index):
+        i=0
+        assert isinstance(ds,DataSet)
+        assert len(ds)==len(index)
+        for x,z,y in ds('x','z','y'):
+            assert (orig[index[i]]['x']==array[index[i]][:3]).all()
+            assert (orig[index[i]]['x']==x).all()
+            assert orig[index[i]]['y']==array[index[i]][3]
+            assert orig[index[i]]['y']==y
+            assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all()
+            assert (orig[index[i]]['z']==z).all()
+            i+=1
+        del i
+        ds[0]
+        if len(ds)>2:
+            ds[:1]
+            ds[1:1]
+            ds[1:1:1]
+        if len(ds)>5:
+            ds[[1,2,3]]
+        for x in ds:
+            pass
+
+#ds[:n] returns a dataset with the n first examples.
+    ds2=ds.subset[:3]
+    test_ds(ds,ds2,index=[0,1,2])
+#    del ds2
+
+#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s.
+    ds2=ds.subset[1:7:2]
+    test_ds(ds,ds2,[1,3,5])
+#     del ds2
+
+# #ds[i]
+#     ds2=ds.subset[5]
+#     assert isinstance(ds2,Example)
+#     assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
+#     assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds)
+#     del ds2
+
+#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in.
+    ds2=ds.subset[[4,7,2,8]]
+    test_ds(ds,ds2,[4,7,2,8])
+#     del ds2
+
+#ds.<property># returns the value of a property associated with
+  #the name <property>. The following properties should be supported:
+  #    - 'description': a textual description or name for the ds
+  #    - 'fieldtypes': a list of types (one per field)
+
+#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
+    #assert hstack([ds('x','y'),ds('z')])==ds
+    #hstack([ds('z','y'),ds('x')])==ds
+    assert have_raised2(hstack,[ds('x'),ds('x')])
+    assert have_raised2(hstack,[ds('y','x'),ds('x')])
+    assert not have_raised2(hstack,[ds('x'),ds('y')])
+    
+#        i=0
+#        for example in hstack([ds('x'),ds('y'),ds('z')]):
+#            example==ds[i]
+#            i+=1 
+#        del i,example
+#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
+
 def test_fields_fct(ds):
     #@todo, fill correctly
     assert len(ds.fields())==3
@@ -455,6 +538,7 @@
     test_iterate_over_examples(array, ds)
     test_overrides(ds)
     test_getitem(array, ds)
+    test_subset(array, ds)
     test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z'))
     test_fields_fct(ds)
 
@@ -515,6 +599,15 @@
 
         del a, ds
 
+    def test_RenamedFieldsDataSet(self):
+        a = numpy.random.rand(10,4)
+        ds = ArrayDataSet(a,Example(['x1','y1','z1','w1'],[slice(3),3,[0,2],0]))
+        ds = RenamedFieldsDataSet(ds,['x1','y1','z1'],['x','y','z'])
+
+        test_all(a,ds)
+
+        del a, ds
+
     def test_MinibatchDataSet(self):
         raise NotImplementedError()
     def test_HStackedDataSet(self):
@@ -570,14 +663,17 @@
         res = dsc[:]
 
 if __name__=='__main__':
-    if len(sys.argv)==2:
-        if sys.argv[1]=="--debug":
+    tests = []
+    debug=False
+    if len(sys.argv)==1:
+        unittest.main()
+    else:
+        assert sys.argv[1]=="--debug"
+        for arg in sys.argv[2:]:
+            tests.append(arg)
+        if tests:
+            unittest.TestSuite(map(T_DataSet, tests)).debug()
+        else:
             module = __import__("_test_dataset")
             tests = unittest.TestLoader().loadTestsFromModule(module)
             tests.debug()
-        print "bad argument: only --debug is accepted"
-    elif len(sys.argv)==1:
-        unittest.main()
-    else:
-        print "bad argument: only --debug is accepted"
-
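
The assertions above encode a simpler minibatch contract: minibatches yields LookupList-like batches and only complete batches, so a dataset of N examples iterated with minibatch_size s produces N//s batches covering (N//s)*s examples, with no wrap-around. A minimal plain-numpy sketch of that contract (the helper below is illustrative only and is not the pylearn DataSet API):

import numpy

def iterate_minibatches(array, size):
    """Yield only complete minibatches of `size` rows; leftover rows are dropped."""
    n_batches = len(array) // size
    for b in range(n_batches):
        yield array[b * size:(b + 1) * size]

data = numpy.arange(10 * 4).reshape(10, 4)   # 10 examples, 4 columns
seen = 0
batches = 0
for batch in iterate_minibatches(data, 3):
    assert len(batch) == 3
    seen += len(batch)
    batches += 1
assert batches == len(data) // 3        # 3 complete batches
assert seen == (len(data) // 3) * 3     # 9 examples visited, the last one is dropped
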
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_onehotop.py	Mon Jul 07 10:08:35 2008 -0400
@@ -0,0 +1,21 @@
+from onehotop import one_hot
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.tensor import as_tensor
+
+import random
+import numpy.random
+
+class T_OneHot(unittest.TestCase):
+    def test0(self):
+        x = as_tensor([3, 2, 1])
+        y = as_tensor(5)
+        o = one_hot(x, y)
+        y = compile.eval_outputs([o])
+        self.failUnless(numpy.all(y == numpy.asarray([[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]])))
+
+if __name__ == '__main__':
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_random_transformation.py	Mon Jul 07 10:08:35 2008 -0400
@@ -0,0 +1,84 @@
+from random_transformation import row_random_transformation
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+from theano.sparse import _mtypes, _mtype_to_str
+from theano.sparse import as_sparse
+
+from theano.tensor import as_tensor
+from theano.scalar import as_scalar
+
+import random
+import numpy.random
+
+class T_RowRandomTransformation(unittest.TestCase):
+    def setUp(self):
+        random.seed(44)
+        numpy.random.seed(44)
+
+    def test_basic(self):
+        rows = 4
+        cols = 20
+        fakeseed = 0
+        length = 3 
+        md = numpy.random.rand(rows, cols)
+        for mtype in _mtypes:
+            m = as_sparse(mtype(md))
+            o = row_random_transformation(m, length, initial_seed=fakeseed)
+            y = compile.eval_outputs([o])
+            expected = "[[ 0.88239119  1.03244463 -1.29297503]\n [ 0.02644961  1.50119695 -0.025081  ]\n [-0.60741013  1.25424625  0.30119422]\n [-1.08659967 -0.35531544 -1.38915467]]"
+            self.failUnless(str(y) == expected)
+
+    def test_length(self):
+        """ Test that if length is increased, we obtain the same results
+        (except longer). """
+
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(1, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+            extralength = random.randint(1, 10)
+
+            m = as_sparse(mtype(numpy.random.rand(rows, cols)))
+            o1 = row_random_transformation(m, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(m, length + extralength, initial_seed=fakeseed)
+
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless((y1 == y2[:,:length]).all())
+
+    def test_permute(self):
+        """ Test that if the order of the rows is permuted, we obtain the same results. """
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(2, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+
+            permute = numpy.random.permutation(rows)
+
+
+            m1 = numpy.random.rand(rows, cols)
+            m2 = m1[permute]
+            for r in range(rows):
+                self.failUnless((m2[r] == m1[permute[r]]).all())
+            s1 = as_sparse(mtype(m1))
+            s2 = as_sparse(mtype(m2))
+            o1 = row_random_transformation(s1, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(s2, length, initial_seed=fakeseed)
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless(y1.shape == y2.shape)
+            for r in range(rows):
+                self.failUnless((y2[r] == y1[permute[r]]).all())
+
+if __name__ == '__main__':
+    unittest.main()
--- a/dataset.py	Mon Jun 16 17:47:36 2008 -0400
+++ b/dataset.py	Mon Jul 07 10:08:35 2008 -0400
@@ -1,6 +1,6 @@
 
 from lookup_list import LookupList as Example
-from misc import unique_elements_list_intersection
+from common.misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
 import numpy, copy
@@ -381,7 +381,8 @@
         any other object that supports integer indexing and slicing.
 
         @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete
-        batches only, raise StopIteration
+        batches only, raise StopIteration.
+        @ATTENTION: minibatches returns a LookupList; it cannot be iterated over example by example.
 
         """
         #return DataSet.MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)\
@@ -435,6 +436,16 @@
         Return a dataset that sees only the fields whose name are specified.
         """
         assert self.hasFields(*fieldnames)
+        #return self.fields(*fieldnames).examples()
+        fieldnames_list = list(fieldnames)
+        return FieldsSubsetDataSet(self,fieldnames_list)
+
+    def cached_fields_subset(self,*fieldnames) :
+        """
+        Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached.
+        @see : dataset.__call__
+        """
+        assert self.hasFields(*fieldnames)
         return self.fields(*fieldnames).examples()
 
     def fields(self,*fieldnames):
@@ -692,6 +703,7 @@
         assert len(src_fieldnames)==len(new_fieldnames)
         self.valuesHStack = src.valuesHStack
         self.valuesVStack = src.valuesVStack
+        self.lookup_fields = Example(new_fieldnames,src_fieldnames)
 
     def __len__(self): return len(self.src)
     
@@ -719,9 +731,18 @@
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         assert self.hasFields(*fieldnames)
-        return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset):
+            cursor._values=batch._values
+            yield cursor
+    
     def __getitem__(self,i):
-        return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
+#        return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
+        complete_example = self.src[i]
+        return Example(self.new_fieldnames,
+                             [complete_example[field]
+                              for field in self.src_fieldnames])
+
 
 
 class DataSetFields(Example):
@@ -859,7 +880,9 @@
                 return self
             def next(self):
                 upper = self.next_example+minibatch_size
-                assert upper<=self.ds.length
+                if upper > len(self.ds) :
+                    raise StopIteration()
+                assert upper<=len(self.ds) # instead of self.ds.length
                 #minibatch = Example(self.ds._fields.keys(),
                 #                    [field[self.next_example:upper]
                 #                     for field in self.ds._fields])
@@ -1314,7 +1337,10 @@
           # into memory at once, which may be too much
           # the work could possibly be done by minibatches
           # that are as large as possible but no more than what memory allows.
-          fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+          #
+          # fields_values is supposed to be a DataSetFields, which inherits from LookupList
+          #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+          fields_values = DataSetFields(source_dataset,None)
           assert all([len(self)==len(field_values) for field_values in fields_values])
           for example in fields_values.examples():
               self.cached_examples.append(copy.copy(example))
@@ -1333,16 +1359,25 @@
               self.dataset=dataset
               self.current=offset
               self.all_fields = self.dataset.fieldNames()==fieldnames
+              self.n_batches = n_batches
+              self.batch_counter = 0
           def __iter__(self): return self
           def next(self):
+              self.batch_counter += 1
+              if self.n_batches and self.batch_counter > self.n_batches :
+                  raise StopIteration()
               upper = self.current+minibatch_size
+              if upper > len(self.dataset.source_dataset):
+                  raise StopIteration()
               cache_len = len(self.dataset.cached_examples)
               if upper>cache_len: # whole minibatch is not already in cache
                   # cache everything from current length to upper
-                  for example in self.dataset.source_dataset[cache_len:upper]:
+                  #for example in self.dataset.source_dataset[cache_len:upper]:
+                  for example in self.dataset.source_dataset.subset[cache_len:upper]:
                       self.dataset.cached_examples.append(example)
               all_fields_minibatch = Example(self.dataset.fieldNames(),
                                              zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
+
               self.current+=minibatch_size
               if self.all_fields:
                   return all_fields_minibatch
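
The RenamedFieldsDataSet change above resolves each requested (new) field name to its source name through lookup_fields before delegating to the source's minibatches_nowrap. A rough sketch of that renaming step, with plain dicts standing in for Example/LookupList (the names below are illustrative, not the module's API):

# Hypothetical stand-in: dicts play the role of Example/LookupList batches.
lookup_fields = {'x': 'x1', 'y': 'y1', 'z': 'z1'}   # new name -> source name

def renamed_minibatches(source_batches, fieldnames):
    """Ask the source for its own field names, then re-expose them under the new names."""
    src_names = [lookup_fields[f] for f in fieldnames]
    for batch in source_batches:
        yield dict((new, batch[old]) for new, old in zip(fieldnames, src_names))

src = [{'x1': [1, 2, 3], 'y1': [0, 1, 0]}]           # one fake source batch
out = list(renamed_minibatches(iter(src), ['x', 'y']))
assert out[0]['x'] == [1, 2, 3] and out[0]['y'] == [0, 1, 0]
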
--- a/linear_regression.py	Mon Jun 16 17:47:36 2008 -0400
+++ b/linear_regression.py	Mon Jul 07 10:08:35 2008 -0400
@@ -4,11 +4,12 @@
 the use of theano.
 """
 
-from learner import *
-from theano import tensor as t
+from pylearn import OfflineLearningAlgorithm
+from theano import tensor as T
 from theano.scalar import as_scalar
+from common.autoname import AutoName
 
-class LinearRegression(MinibatchUpdatesTLearner):
+class LinearRegression(OfflineLearningAlgorithm):
     """
     Implement linear regression, with or without L2 regularization
     (the former is called Ridge Regression and the latter Ordinary Least Squares).
@@ -40,96 +41,122 @@
     plus L2_regularizer on the diagonal except at (0,0),
     and XtY is a (n_inputs+1)*n_outputs matrix containing X'*Y.
 
-    The fields and attributes expected and produced by use and update are the following:
+    The dataset fields expected and produced by the learning algorithm and the trained model
+    are the following:
 
-     - Input and output fields (example-wise quantities):
+     - Input and output dataset fields (example-wise quantities):
 
-       - 'input' (always expected by use and update as an input_dataset field)
-       - 'target' (optionally expected by use and update as an input_dataset field)
-       - 'output' (optionally produced by use as an output dataset field)
-       - 'squared_error' (optionally produced by use as an output dataset field, needs 'target') = example-wise squared error
+       - 'input' (always expected as an input_dataset field)
+       - 'target' (always expected by the learning algorithm, optional for learned model)
+       - 'output' (always produced by learned model)
+       - 'squared_error' (optionally produced by learned model if 'target' is provided)
+          = example-wise squared error
+    """
+    def __init__(self, L2_regularizer=0):
+        self.predictor = LinearPredictor(None,None)
+        self.L2_regularizer=L2_regularizer
+        self._XtX = T.matrix('XtX')
+        self._XtY = T.matrix('XtY')
+        self._extended_input = T.prepend_one_to_each_row(self._input)
 
-     - optional attributes (optionally expected as input_dataset attributes)
-       (warning, this may be dangerous, the 'use' method will use those provided in the 
-       input_dataset rather than those learned during 'update'; currently no support
-       for providing these to update):
-       
-       - 'L2_regularizer' 
-       - 'b' 
-       - 'W'
-       - 'parameters' = [b, W] 
-       - 'regularization_term'
-       - 'XtX'
-       - 'XtY'
-
-    """
+class LinearPredictorEquations(AutoName):
+    inputs = T.matrix() # minibatchsize x n_inputs
+    targets = T.matrix() # minibatchsize x n_outputs
+    theta = T.matrix() # (n_inputs+1) x n_outputs
+    b = theta[0]
+    Wt = theta[1:,:]
+    outputs = T.dot(inputs,Wt) + b # minibatchsize x n_outputs
+    squared_errors = T.sum(T.sqr(targets-outputs),axis=1)
 
-    def attributeNames(self):
-        return ["L2_regularizer","parameters","b","W","regularization_term","XtX","XtY"]
+    __compiled = False
+    @classmethod
+    def compile(cls,linker='c|py'):
+        if cls.__compiled:
+            return
+        def fn(input_vars,output_vars):
+            return staticmethod(theano.function(input_vars,output_vars, linker=linker))
 
-    def useInputAttributes(self):
-        return ["b","W"]
-
-    def useOutputAttributes(self):
-        return []
+        cls.compute_outputs = fn([inputs,theta],[outputs])
+        cls.compute_errors = fn([outputs,targets],[squared_errors])
 
-    def updateInputAttributes(self):
-        return ["L2_regularizer","XtX","XtY"]
+        cls.__compiled = True
 
-    def updateMinibatchInputFields(self):
-        return ["input","target"]
-    
-    def updateMinibatchInputAttributes(self):
-        return ["XtX","XtY"]
+    def __init__(self):
+        self.compile()
+        
+class LinearRegressionEquations(LinearPredictorEquations):
+    P = LinearPredictorEquations
+    XtX = T.matrix() # (n_inputs+1) x (n_inputs+1)
+    XtY = T.matrix() # (n_inputs+1) x n_outputs
+    extended_input = T.prepend_scalar_to_each_row(1,P.inputs)
+    new_XtX = add_inplace(XtX,T.dot(extended_input.T,extended_input))
+    new_XtY = add_inplace(XtY,T.dot(extended_input.T,P.targets))
     
-    def updateMinibatchOutputAttributes(self):
-        return ["new_XtX","new_XtY"]
-    
-    def updateEndInputAttributes(self):
-        return ["theta","XtX","XtY"]
-
-    def updateEndOutputAttributes(self):
-        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
+class LinearPredictor(object):
+    """
+    A linear predictor has parameters theta (a bias vector and a weight matrix)
+    it can use to make a linear prediction (according to the LinearPredictorEquations).
+    It can compute its output (bias + weight * input) and a squared error (||output - target||^2).
+    """
+    def __init__(self, theta):
+        self.theta=theta
+        self.n_inputs=theta.shape[0]-1
+        self.n_outputs=theta.shape[1]
+        self.predict_equations = LinearPredictorEquations()
 
-    def parameterAttributes(self):
-        return ["b","W"]
+    def compute_outputs(self,inputs):
+        return self.predict_equations.compute_outputs(inputs,self.theta)
+    def compute_errors(self,inputs,targets):
+        return self.predict_equations.compute_errors(self.compute_outputs(inputs),targets)
+    def compute_outputs_and_errors(self,inputs,targets):
+        outputs = self.compute_outputs(inputs)
+        return [outputs,self.predict_equations.compute_errors(outputs,targets)]
     
-    def defaultOutputFields(self, input_fields):
-        output_fields = ["output"]
-        if "target" in input_fields:
-            output_fields.append("squared_error")
-        return output_fields
+    def __call__(self,dataset,output_fieldnames=None,cached_output_dataset=False):
+        assert dataset.hasFields("input")
+        if output_fieldnames is None:
+            if dataset.hasFields("target"):
+                output_fieldnames = ["output","squared_error"]
+            else:
+                output_fieldnames = ["output"]
+        output_fieldnames.sort()
+        if output_fieldnames == ["squared_error"]:
+            f = self.compute_errors
+        elif output_fieldnames == ["output"]:
+            f = self.compute_outputs
+        elif output_fieldnames == ["output","squared_error"]:
+            f = self.compute_outputs_and_errors
+        else:
+            raise ValueError("unknown field(s) in output_fieldnames: "+str(output_fieldnames))
         
-    def __init__(self):
-        self._input = t.matrix('input') # n_examples x n_inputs
-        self._target = t.matrix('target') # n_examples x n_outputs
-        self._L2_regularizer = as_scalar(0.,'L2_regularizer')
-        self._theta = t.matrix('theta')
-        self._W = self._theta[:,1:] 
-        self._b = self._theta[:,0]
-        self._XtX = t.matrix('XtX')
-        self._XtY = t.matrix('XtY')
-        self._extended_input = t.prepend_one_to_each_row(self._input)
-        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
-        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
-        self._regularizer = self._L2_regularizer * t.dot(self._W,self._W)
-        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
-        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
-        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+        ds=ApplyFunctionDataSet(dataset,f,output_fieldnames)
+        if cached_output_dataset:
+            return CachedDataSet(ds)
+        else:
+            return ds
+        
 
-        MinibatchUpdatesTLearner.__init__(self)
-            
-    def allocate(self,minibatch):
-        minibatch_n_inputs  = minibatch["input"].shape[1]
-        minibatch_n_outputs = minibatch["target"].shape[1]
+        self._XtX = T.matrix('XtX')
+        self._XtY = T.matrix('XtY')
+        self._extended_input = T.prepend_one_to_each_row(self._input)
+        self._output = T.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = T.sum_within_rows(T.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._L2_regularizer * T.dot(self._W,self._W)
+        self._new_XtX = add_inplace(self._XtX,T.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,T.dot(self._extended_input.T,self._target))
+        self._new_theta = T.solve_inplace(self._theta,self._XtX,self._XtY)
+
+    def allocate(self,dataset):
+        dataset_n_inputs  = dataset["input"].shape[1]
+        dataset_n_outputs = dataset["target"].shape[1]
         if not self._n_inputs:
-            self._n_inputs = minibatch_n_inputs 
-            self._n_outputs = minibatch_n_outputs
+            self._n_inputs = dataset_n_inputs 
+            self._n_outputs = dataset_n_outputs
             self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
             self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
             self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
             self.forget()
-        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+        elif self._n_inputs!=dataset_n_inputs or self._n_outputs!=dataset_n_outputs:
             # if the input or target changes dimension on the fly, we resize and forget everything
             self.forget()
             
@@ -141,3 +168,6 @@
             self.XtY.data[:,:]=0
             numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
 
+    def __call__(self,dataset):
+        # training entry point; body not yet written in this changeset
+        raise NotImplementedError()
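
The docstring above describes the closed-form solution theta = solve(XtX, XtY), where X is the input matrix with a column of ones prepended, XtX carries L2_regularizer on its diagonal except at (0,0), and theta has shape (n_inputs+1) x n_outputs as in LinearPredictorEquations. A self-contained numpy sketch of that computation (the helper name and the synthetic check are illustrative only, not part of the module):

import numpy

def ridge_theta(X, Y, L2_regularizer=0.0):
    """Closed-form (ridge) solve described in the docstring: prepend a 1 to each
    input row, accumulate XtX and XtY, add the regularizer on the diagonal
    except at (0,0), and solve for theta of shape (1+n_inputs, n_outputs)."""
    n = X.shape[0]
    Xe = numpy.hstack([numpy.ones((n, 1)), X])       # (n, 1+n_inputs)
    XtX = numpy.dot(Xe.T, Xe)
    XtY = numpy.dot(Xe.T, Y)
    reg = L2_regularizer * numpy.eye(Xe.shape[1])
    reg[0, 0] = 0.0                                  # do not penalize the bias
    return numpy.linalg.solve(XtX + reg, XtY)

rng = numpy.random.RandomState(0)
X = rng.rand(50, 3)
true_theta = numpy.array([[0.5], [1.0], [-2.0], [3.0]])   # bias row first
Y = numpy.dot(numpy.hstack([numpy.ones((50, 1)), X]), true_theta)
theta = ridge_theta(X, Y, L2_regularizer=0.0)
assert numpy.allclose(theta, true_theta)                   # exact recovery without noise
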
--- a/misc.py	Mon Jun 16 17:47:36 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-
-def unique_elements_list_intersection(list1,list2):
-    """
-    Return the unique elements that are in both list1 and list2
-    (repeated elements in listi will not be duplicated in the result).
-    This should run in O(n1+n2) where n1=|list1|, n2=|list2|.
-    """
-    return list(set.intersection(set(list1),set(list2)))
-import time
-#http://www.daniweb.com/code/snippet368.html
-def print_timing(func):
-    def wrapper(*arg):
-        t1 = time.time()
-        res = func(*arg)
-        t2 = time.time()
-        print '%s took %0.3f ms' % (func.func_name, (t2-t1)*1000.0)
-        return res
-    return wrapper
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py	Mon Jul 07 10:08:35 2008 -0400
@@ -0,0 +1,58 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct one-hot row vectors of length y from the index vector x.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: The entries of the one-hot vector to be one.
+        @type y: Integer scalar L{Tensor}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        x = as_tensor(x)
+        y = as_tensor(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64" or x.dtype == "int32"
+        assert x.ndim == 1
+        assert y.dtype == "int64" or y.dtype == "int32"
+        assert y.ndim == 0
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
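
For reference, the dense result that OneHot.perform builds can be reproduced in a few lines of plain numpy; this is only an illustration of the expected output (matching the check in _test_onehotop.py), not part of the Op:

import numpy

def one_hot_reference(x, y):
    """Return a (len(x), y) float64 matrix whose row c is one-hot at column x[c]."""
    x = numpy.asarray(x)
    assert x.ndim == 1 and numpy.all(x < y)
    out = numpy.zeros((x.shape[0], y), dtype="float64")
    out[numpy.arange(x.shape[0]), x] = 1.0
    return out

expected = numpy.array([[0, 0, 0, 1, 0],
                        [0, 0, 1, 0, 0],
                        [0, 1, 0, 0, 0]], dtype="float64")
assert numpy.all(one_hot_reference([3, 2, 1], 5) == expected)
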
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py.scalar	Mon Jul 07 10:08:35 2008 -0400
@@ -0,0 +1,64 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+#from theano import scalar
+from theano.scalar import as_scalar
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct one-hot row vectors of length y from the index vector x.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    @todo: What type is y?
+    @todo: What about operating on L{Scalar}s?
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: The entries of the one-hot vector to be one.
+        @type y: Integer L{Scalar}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        #x = tensor.as_tensor(x)
+        #y = scalar.as_scalar(y)
+        x = as_tensor(x)
+        y = as_scalar(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64"
+        assert type(y) == numpy.int64
+        assert x.ndim == 1
+        #out = numpy.zeros((x.shape[0], y), dtype="int64")
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_transformation.py	Mon Jul 07 10:08:35 2008 -0400
@@ -0,0 +1,132 @@
+"""
+New L{Op}s that aren't in core theano
+"""
+
+from theano import sparse
+from theano import tensor
+from theano import scalar
+from theano.gof import op
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+
+import scipy.sparse
+
+import numpy
+
+class RowRandomTransformation(op.Op):
+    """
+    Given C{x}, a (sparse) matrix with shape (examples, dimensions), we
+    multiply it by a deterministic random matrix of shape (dimensions,
+    length) to obtain a randomly transformed output of shape (examples,
+    length).
+
+    Each element of the deterministic random matrix is selected uniformly
+    from [-1, +1).
+    @todo: Use another random distribution?
+
+    @note: This function should be written such that if length is
+    increased, we obtain the same results (except longer). Similarly,
+    the rows should be able to be permuted and get the same result in
+    the same fashion.
+
+    @todo: This may be slow?
+    @todo: Rewrite for dense matrices too?
+    @todo: Is there any way to verify the convention that each row is
+    an example? Should I rename the variables in the code to make the
+    semantics more explicit?
+    @todo: AUTOTEST: Autotest that dense and sparse versions of this are identical.
+    @todo: Rename? Is Row the correct name? Maybe column-wise?
+
+    @type  x: L{scipy.sparse.spmatrix}
+    @param x: Sparse matrix to be randomly transformed with shape (examples, dimensions)
+    @type  length: int
+    @param length: The number of transformations of C{x} to be performed.
+    @param initial_seed: Initial seed for the RNG.
+    @rtype: L{numpy.ndarray}
+    @return: Array with C{length} random transformations, with shape (examples, length)
+    """
+
+    import random
+    """
+    RNG used for random transformations.
+    Does not share state with rest of program.
+    @todo: Make STATIC and private. Ask James or Olivier how to make this more Pythonic.
+    """
+    _trng = random.Random()
+
+    def __init__(self, x, length, initial_seed=0, **kwargs):
+        """
+        @todo: Which broadcastable values should I use?
+        """
+        assert 0        # Needs to be updated to Olivier's new Op creation approach
+        op.Op.__init__(self, **kwargs)
+        x = sparse.as_sparse(x)
+        self.initial_seed = initial_seed
+        self.length = length
+        self.inputs = [x]
+        self.outputs = [tensor.Tensor(x.dtype, broadcastable=[False, False])]
+#        self.outputs = [tensor.Tensor(x.dtype, broadcastable=[True, True])]
+
+    def _random_matrix_value(self, row, col, rows):
+        """
+        From a deterministic random matrix, find one element.
+        @param row: The row of the element to be read.
+        @param col: The column of the element to be read.
+        @param rows: The number of rows in the matrix.
+        @type row: int
+        @type col: int
+        @type rows: int
+        @note: This function is designed such that if we extend
+        the number of columns in the random matrix, the values of
+        the earlier entries are unchanged.
+        @todo: Make this static
+        """
+        # Choose the random entry at (l, c)
+        rngidx = col * rows + row
+        # Set the random number state for this random entry
+        # Note: This may be slow
+        self._trng.seed(rngidx + self.initial_seed)
+
+        # Determine the value for this entry
+        val = self._trng.uniform(-1, +1)
+#       print "Exmpl #%d, dimension #%d => Random projection #%d has idx %d (+ seed %d) and value %f" % (r, c, j, rngidx, self.initial_seed, val)
+        return val
+
+    def impl(self, xorig):
+        assert _is_sparse(xorig)
+        assert len(xorig.shape) == 2
+        # Since conversions to and from the COO format are quite fast, you
+        # can use this approach to efficiently implement lots of computations
+        # on sparse matrices.
+        x = xorig.tocoo()
+        (rows, cols) = x.shape
+        tot = rows * cols
+        out = numpy.zeros((rows, self.length))
+#        print "l = %d" % self.length
+#        print "x.getnnz() = %d" % x.getnnz()
+        all = zip(x.col, x.row, x.data)
+        all.sort()      # TODO: Maybe this is very slow?
+        lastc = None
+        lastl = None
+        lastval = None
+        for l in range(self.length):
+            for (c, r, data) in all:
+                assert c < cols
+                assert r < rows
+                if not c == lastc or not l == lastl:
+                    lastc = c
+                    lastl = l
+                    lastval = self._random_matrix_value(c, l, cols)
+                val = lastval
+#                val = self._random_matrix_value(c, l, cols)
+#                val = self._trng.uniform(-1, +1)
+#                val = 1.0
+                out[r][l] += val * data
+        return out
+    def __copy__(self):
+        return self.__class__(self.inputs[0], self.length, self.initial_seed)
+    def clone_with_new_inputs(self, *new_inputs):
+        return self.__class__(new_inputs[0], self.length, self.initial_seed)
+    def desc(self, *new_inputs):
+        return (self.__class__, self.length, self.initial_seed)
+row_random_transformation = RowRandomTransformation()
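
A dense, pure-Python sketch of what impl computes: each entry of the implicit random matrix is drawn by reseeding a private RNG with an index built from its (dimension, output-column) position plus initial_seed, which is what keeps the prefix stable when length grows and makes row permutations commute with the transformation. The helper below is an illustration, not the Op:

import random
import numpy

def dense_row_random_transformation(m, length, initial_seed=0):
    """Multiply the dense matrix m (examples x dimensions) by a deterministic
    random (dimensions x length) matrix whose entry (d, l) depends only on
    d, l, the number of dimensions, and the seed."""
    rng = random.Random()
    rows, cols = m.shape                  # rows = examples, cols = dimensions
    out = numpy.zeros((rows, length))
    for l in range(length):
        for c in range(cols):
            rng.seed(l * cols + c + initial_seed)   # deterministic per-entry seed
            val = rng.uniform(-1, +1)
            out[:, l] += val * m[:, c]
    return out

m = numpy.random.RandomState(0).rand(4, 6)
short = dense_row_random_transformation(m, 3)
longer = dense_row_random_transformation(m, 5)
assert numpy.allclose(short, longer[:, :3])     # extending length keeps the prefix
perm = numpy.random.RandomState(1).permutation(4)
assert numpy.allclose(dense_row_random_transformation(m[perm], 3), short[perm])
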
--- a/statscollector.py	Mon Jun 16 17:47:36 2008 -0400
+++ b/statscollector.py	Mon Jul 07 10:08:35 2008 -0400
@@ -1,7 +1,13 @@
 
 # Here is how I see stats collectors:
 
-#    def my_stats((residue,nll),(regularizer)):
+def my_stats(graph):
+    graph.mse=examplewise_mean(square_norm(graph.residue))
+    graph.training_loss=graph.regularizer+examplewise_sum(graph.nll)
+    return [graph.mse,graph.training_loss]
+    
+
+#    def my_stats(residue,nll,regularizer):
 #            mse=examplewise_mean(square_norm(residue))
 #            training_loss=regularizer+examplewise_sum(nll)
 #            set_names(locals())
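
As a concrete reading of my_stats above, here is a plain-numpy sketch with stand-ins for the symbolic helpers it names; the assumed semantics (square_norm is the per-example squared L2 norm, examplewise_mean and examplewise_sum reduce over the example axis) are inferred from the names, not taken from an actual implementation:

import numpy

# Plain-numpy stand-ins for the symbolic helpers named in my_stats above.
def square_norm(residue):
    return numpy.sum(residue ** 2, axis=1)      # per-example squared L2 norm

def examplewise_mean(values):
    return numpy.mean(values, axis=0)           # average over examples

def examplewise_sum(values):
    return numpy.sum(values, axis=0)            # sum over examples

residue = numpy.array([[1.0, -1.0], [0.5, 0.5]])
nll = numpy.array([0.7, 0.3])
regularizer = 0.01
mse = examplewise_mean(square_norm(residue))          # mean of ||residue||^2
training_loss = regularizer + examplewise_sum(nll)    # regularizer + summed NLL
assert numpy.isclose(mse, (2.0 + 0.5) / 2)
assert numpy.isclose(training_loss, 1.01)
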