changeset 375:12ce29abf27d

Automated merge with http://lgcm.iro.umontreal.ca/hg/pylearn
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 16 Jun 2008 17:47:36 -0400
parents 2b16604ffad9 (current diff) aa8aff6abbf7 (diff)
children c9a89be5cb0a
files _nnet_ops.py mlp.py test_dataset.py test_filetensor.py test_mlp.py
diffstat 22 files changed, 2002 insertions(+), 1652 deletions(-)
--- a/__init__.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/__init__.py	Mon Jun 16 17:47:36 2008 -0400
@@ -1,5 +1,10 @@
 import filetensor
 import nnet_ops
+import version
 
 from lookup_list import LookupList
 
+def __src_version__():
+    #todo - this is vulnerable to the bug in theano ticket #160
+    return version.src_version(__name__)
+
--- a/_nnet_ops.py	Tue Jun 03 21:27:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,41 +0,0 @@
-
-import unittest
-import theano._test_tensor as TT
-import numpy
-
-from nnet_ops import *
-
-class T_sigmoid(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(9999)
-    def test_elemwise(self):
-        TT.verify_grad(self, sigmoid, [numpy.random.rand(3,4)])
-
-class T_softplus(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(9999)
-    def test_elemwise(self):
-        TT.verify_grad(self, softplus, [numpy.random.rand(3,4)])
-
-class T_CrossentropySoftmax1Hot(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(9999)
-    def test0(self):
-        y_idx = [0,1,3]
-        class Dummy(object):
-            def make_node(self, a,b):
-                return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0:1]
-        TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4),
-            numpy.random.rand(4)])
-
-    def test1(self):
-        y_idx = [0,1,3]
-        class Dummy(object):
-            def make_node(self, a):
-                return crossentropy_softmax_1hot(a, y_idx)[0:1]
-        TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)])
-
-
-
-if __name__ == '__main__':
-    unittest.main()
--- a/_test_dataset.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/_test_dataset.py	Mon Jun 16 17:47:36 2008 -0400
@@ -1,183 +1,583 @@
+#!/usr/bin/env python
 from dataset import *
 from math import *
-import unittest
-import sys
-import numpy as N
+import numpy, unittest, sys
+from misc import *
+from lookup_list import LookupList
+
+def have_raised(to_eval, **var):
+    have_thrown = False
+    try:
+        eval(to_eval)
+    except :
+        have_thrown = True
+    return have_thrown
+
+def have_raised2(f, *args, **kwargs):
+    have_thrown = False
+    try:
+        f(*args, **kwargs)
+    except :
+        have_thrown = True
+    return have_thrown
+
+def test1():
+    print "test1"
+    global a,ds
+    a = numpy.random.rand(10,4)
+    print a
+    ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]})
+    print "len(ds)=",len(ds)
+    assert(len(ds)==10)
+    print "example 0 = ",ds[0]
+#    assert
+    print "x=",ds["x"]
+    print "x|y"
+    for x,y in ds("x","y"):
+        print x,y
+    minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4)
+    minibatch = minibatch_iterator.__iter__().next()
+    print "minibatch=",minibatch
+    for var in minibatch:
+        print "var=",var
+    print "take a slice and look at field y",ds[1:6:2]["y"]
+
+    del a,ds,x,y,minibatch_iterator,minibatch,var
 
-def _sum_all(a):
-    s=a
-    while isinstance(s,numpy.ndarray):
-        s=sum(s)
-    return s
-    
-class T_arraydataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
+def test_iterate_over_examples(array,ds):
+#not in doc!!!
+    i=0
+    for example in range(len(ds)):
+        wanted = array[example][:3]
+        returned = ds[example]['x']
+        if (wanted != returned).all():
+            print 'returned:', returned
+            print 'wanted:', wanted
+        assert (ds[example]['x']==array[example][:3]).all()
+        assert ds[example]['y']==array[example][3]
+        assert (ds[example]['z']==array[example][[0,2]]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
+
+#     - for example in dataset:
+    i=0
+    for example in ds:
+        assert len(example)==3
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (example['z']==array[i][0:3:2]).all()
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
 
+#     - for val1,val2,... in dataset:
+    i=0
+    for x,y,z in ds:
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,z,i
+
+#     - for example in dataset(field1, field2,field3, ...):
+    i=0
+    for example in ds('x','y','z'):
+        assert len(example)==3
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (example['z']==array[i][0:3:2]).all()
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
+    i=0
+    for example in ds('y','x'):
+        assert len(example)==2
+        assert (example['x']==array[i][:3]).all()
+        assert example['y']==array[i][3]
+        assert (numpy.append(example['x'],example['y'])==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del example,i
 
-    def test_ctor_len(self):
-        n = numpy.random.rand(8,3)
-        a=ArrayDataSet(n)
-        self.failUnless(a.data is n)
-        self.failUnless(a.fields is None)
+#     - for val1,val2,val3 in dataset(field1, field2,field3):
+    i=0
+    for x,y,z in ds('x','y','z'):
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,z,i
+    i=0
+    for y,x in ds('y','x',):
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==len(ds)
+    del x,y,i
 
-        self.failUnless(len(a) == n.shape[0])
-        self.failUnless(a[0].shape == (n.shape[1],))
+    def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished):
+        ##full minibatch or the last minibatch
+        for idx in range(nb_field):
+            test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished)
+        del idx
+    def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished):
+        assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)<minibatch_size)
+
+#     - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    i=0
+    mi=0
+    m=ds.minibatches(['x','z'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for minibatch in m:
+        assert isinstance(minibatch,DataSetFields)
+        assert len(minibatch)==2
+        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        if type(ds)==ArrayDataSet:
+            assert (minibatch[0][:,::2]==minibatch[1]).all()
+        else:
+            for j in xrange(len(minibatch[0])):
+                assert (minibatch[0][j][::2]==minibatch[1][j]).all()
+        mi+=1
+        i+=len(minibatch[0])
+    assert i==len(ds)
+    assert mi==4
+    del minibatch,i,m,mi
 
-    def test_iter(self):
-        arr = numpy.random.rand(8,3)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
-        for i, example in enumerate(a):
-            self.failUnless(numpy.all( example['x'] == arr[i,:2]))
-            self.failUnless(numpy.all( example['y'] == arr[i,1:3]))
+    i=0
+    mi=0
+    m=ds.minibatches(['x','y'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for minibatch in m:
+        assert len(minibatch)==2
+        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
+        mi+=1
+        for id in range(len(minibatch[0])):
+            assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all()
+            i+=1
+    assert i==len(ds)
+    assert mi==4
+    del minibatch,i,id,m,mi
 
-    def test_zip(self):
-        arr = numpy.random.rand(8,3)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
-        for i, x in enumerate(a.zip("x")):
-            self.failUnless(numpy.all( x == arr[i,:2]))
+#     - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
+    i=0
+    mi=0
+    m=ds.minibatches(['x','z'], minibatch_size=3)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,z in m:
+        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
+        test_minibatch_field_size(z,m.minibatch_size,len(ds),mi)
+        for id in range(len(x)):
+            assert (x[id][::2]==z[id]).all()
+            i+=1
+        mi+=1
+    assert i==len(ds)
+    assert mi==4
+    del x,z,i,m,mi
+    i=0
+    mi=0
+    m=ds.minibatches(['x','y'], minibatch_size=3)
+    for x,y in m:
+        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
+        test_minibatch_field_size(y,m.minibatch_size,len(ds),mi)
+        mi+=1
+        for id in range(len(x)):
+            assert (numpy.append(x[id],y[id])==array[i]).all()
+            i+=1
+    assert i==len(ds)
+    assert mi==4
+    del x,y,i,id,m,mi
+
+#not in doc
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[i+4]).all()
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id,m
+
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[i+4]).all()
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id,m
 
-    def test_minibatch_basic(self):
-        arr = numpy.random.rand(10,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        for i, mb in enumerate(a.minibatches(minibatch_size=2)): #all fields
-            self.failUnless(numpy.all( mb['x'] == arr[i*2:i*2+2,0:2]))
-            self.failUnless(numpy.all( mb['y'] == arr[i*2:i*2+2,1:4]))
+    i=0
+    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=4)
+    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
+    for x,y in m:
+        assert len(x)==m.minibatch_size
+        assert len(y)==m.minibatch_size
+        for id in range(m.minibatch_size):
+            assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()
+            i+=1
+    assert i==m.n_batches*m.minibatch_size
+    del x,y,i,id
+
+    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
+    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)
 
-    def test_getattr(self):
-        arr = numpy.random.rand(10,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        a_y = a.y
-        self.failUnless(numpy.all( a_y == arr[:,1:4]))
+def test_ds_iterator(array,iterator1,iterator2,iterator3):
+    l=len(iterator1)
+    i=0
+    for x,y in iterator1:
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==l
+    i=0
+    for y,z in iterator2:
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        i+=1
+    assert i==l
+    i=0
+    for x,y,z in iterator3:
+        assert (x==array[i][:3]).all()
+        assert y==array[i][3]
+        assert (z==array[i][0:3:2]).all()
+        assert (numpy.append(x,y)==array[i]).all()
+        i+=1
+    assert i==l
+    
+def test_getitem(array,ds):
+    def test_ds(orig,ds,index):
+        i=0
+        assert len(ds)==len(index)
+        for x,z,y in ds('x','z','y'):
+            assert (orig[index[i]]['x']==array[index[i]][:3]).all()
+            assert (orig[index[i]]['x']==x).all()
+            assert orig[index[i]]['y']==array[index[i]][3]
+            assert (orig[index[i]]['y']==y).all() # why does it crash sometimes?
+            assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all()
+            assert (orig[index[i]]['z']==z).all()
+            i+=1
+        del i
+        ds[0]
+        if len(ds)>2:
+            ds[:1]
+            ds[1:1]
+            ds[1:1:1]
+        if len(ds)>5:
+            ds[[1,2,3]]
+        for x in ds:
+            pass
 
-    def test_minibatch_wraparound_even(self):
-        arr = numpy.random.rand(10,4)
-        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+#ds[:n] returns a LookupList with the first n examples.
+    ds2=ds[:3]
+    assert isinstance(ds2,LookupList)
+    test_ds(ds,ds2,index=[0,1,2])
+    del ds2
+
+#ds.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...,i2-s.
+    ds2=ds.subset[1:7:2]
+    assert isinstance(ds2,DataSet)
+    test_ds(ds,ds2,[1,3,5])
+    del ds2
+
+#ds[i]
+    ds2=ds[5]
+    assert isinstance(ds2,Example)
+    assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
+    assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds)
+    del ds2
 
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+#ds.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...,in.
+    ds2=ds.subset[[4,7,2,8]]
+    assert isinstance(ds2,DataSet)
+    test_ds(ds,ds2,[4,7,2,8])
+    del ds2
+
+    #ds.<property># returns the value of a property associated with
+      #the name <property>. The following properties should be supported:
+      #    - 'description': a textual description or name for the ds
+      #    - 'fieldtypes': a list of types (one per field)
+
+    #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
+        #assert hstack([ds('x','y'),ds('z')])==ds
+        #hstack([ds('z','y'),ds('x')])==ds
+    assert have_raised2(hstack,[ds('x'),ds('x')])
+    assert have_raised2(hstack,[ds('y','x'),ds('x')])
+    assert not have_raised2(hstack,[ds('x'),ds('y')])
+        
+    #        i=0
+    #        for example in hstack([ds('x'),ds('y'),ds('z')]):
+    #            example==ds[i]
+    #            i+=1 
+    #        del i,example
+    #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
 
-        #print arr
-        for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)):
-            #print 'x' , x
-            self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2]))
+def test_fields_fct(ds):
+    #@todo, fill correctly
+    assert len(ds.fields())==3
+    i=0
+    v=0
+    for field in ds.fields():
+        for field_value in field: # iterate over the values associated to that field for all the ds examples
+            v+=1
+        i+=1
+    assert i==3
+    assert v==3*10
+    del i,v
+    
+    i=0
+    v=0
+    for field in ds('x','z').fields():
+        i+=1
+        for val in field:
+            v+=1
+    assert i==2
+    assert v==2*10
+    del i,v
+    
+    i=0
+    v=0
+    for field in ds.fields('x','y'):
+        i+=1
+        for val in field:
+            v+=1
+    assert i==2
+    assert v==2*10
+    del i,v
+    
+    i=0
+    v=0
+    for field_examples in ds.fields():
+        for example_value in field_examples:
+            v+=1
+        i+=1
+    assert i==3
+    assert v==3*10
+    del i,v
+    
+    assert ds == ds.fields().examples()
+    assert len(ds('x','y').fields()) == 2
+    assert len(ds('x','z').fields()) == 2
+    assert len(ds('y').fields()) == 1
 
-    def test_minibatch_wraparound_odd(self):
-        arr = numpy.random.rand(10,4)
-        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+    del field
 
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+def test_overrides(ds) :
+    """ Test for examples that an override __getitem__ acts as the one in DataSet """
+    def ndarray_list_equal(nda,l) :
+        """ 
+        Compare an ndarray with a list by converting the list with numpy.asmatrix,
+        if possible.
+        """
+        try :
+            l = numpy.asmatrix(l)
+        except :
+            return False
+        return smart_equal(nda,l)
+        
+    def smart_equal(a1,a2) :
+        """
+        Handles numpy.ndarray, LookupList, and basic containers
+        """
+        if not isinstance(a1,type(a2)) and not isinstance(a2,type(a1)):
+            #special case: matrix vs list of arrays
+            if isinstance(a1,numpy.ndarray) :
+                return ndarray_list_equal(a1,a2)
+            elif isinstance(a2,numpy.ndarray) :
+                return ndarray_list_equal(a2,a1)
+            return False
+        # compares 2 numpy.ndarray
+        if isinstance(a1,numpy.ndarray):
+            if len(a1.shape) != len(a2.shape):
+                return False
+            for k in range(len(a1.shape)) :
+                if a1.shape[k] != a2.shape[k]:
+                    return False
+            return (a1==a2).all()
+        # compares 2 lookuplists
+        if isinstance(a1,LookupList) :
+            if len(a1._names) != len(a2._names) :
+                return False
+            for k in a1._names :
+                if k not in a2._names :
+                    return False
+                if not smart_equal(a1[k],a2[k]) :
+                    return False
+            return True
+        # compares 2 basic containers
+        if hasattr(a1,'__len__'):
+            if len(a1) != len(a2) :
+                return False
+            for k in range(len(a1)) :
+                if not smart_equal(a1[k],a2[k]):
+                    return False
+            return True
+        # try basic equals
+        return a1 is a2
 
-        for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)):
-            self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2]))
+    def mask(ds) :
+        class TestOverride(type(ds)):
+            def __init__(self,ds) :
+                self.ds = ds
+            def __getitem__(self,key) :
+                res1 = self.ds[key]
+                res2 = DataSet.__getitem__(ds,key)
+                assert smart_equal(res1,res2)
+                return res1
+        return TestOverride(ds)
+    # test getitem
+    ds2 = mask(ds)
+    for k in range(10):
+        res = ds2[k]
+    res = ds2[1:len(ds):3]
     
+        
 
-class T_renamingdataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
+    
 
 
-    def test_hasfield(self):
-        n = numpy.random.rand(3,8)
-        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
-        b=a.rename({'xx':'x','zz':'z'})
-        self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y'))
-
-class T_applyfunctiondataset(unittest.TestCase):
-    def setUp(self):
-        numpy.random.seed(123456)
-
-    def test_function(self):
-        n = numpy.random.rand(3,8)
-        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
-        b=a.apply_function(lambda x,y: x+y,x+1, ['x','y'], ['x+y','x+1'], False,False,False)
-        print b.fieldNames()
-        print b('x+y')
-        
+def test_all(array,ds):
+    assert len(ds)==10
+    test_iterate_over_examples(array, ds)
+    test_overrides(ds)
+    test_getitem(array, ds)
+    test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z'))
+    test_fields_fct(ds)
 
 
+class T_DataSet(unittest.TestCase):
+    def test_ArrayDataSet(self):
+        #don't test stream
+        #tested only with float value
+        #don't always test with y
+        #don't test missing value
+        #don't test with tuple
+        #don't test properties
+        a2 = numpy.random.rand(10,4)
+        ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested
+        ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        #assert ds==a? should this work?
 
-# to be used with a any new dataset
-class T_dataset_tester(object):
-    """
-    This class' goal is to test any new dataset that is created
-    Tests are (will be!) designed to check the normal behaviours
-    of a dataset, as defined in dataset.py
-    """
+        test_all(a2,ds)
+
+        del a2, ds
+
+    def test_CachedDataSet(self):
+        a = numpy.random.rand(10,4)
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        ds2 = CachedDataSet(ds1)
+        ds3 = CachedDataSet(ds1,cache_all_upon_construction=True)
+
+        test_all(a,ds2)
+        test_all(a,ds3)
+
+        del a,ds1,ds2,ds3
 
 
-    def __init__(self,ds,runall=True) :
-        """if interested in only a subset of test, init with runall=False"""
-        self.ds = ds
-        
-        if runall :
-            self.test1_basicstats(ds)
-            self.test2_slicing(ds)
-            self.test3_fields_iterator_consistency(ds)
+    def test_DataSetFields(self):
+        raise NotImplementedError()
+
+    def test_ApplyFunctionDataSet(self):
+        a = numpy.random.rand(10,4)
+        a2 = a+1
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
 
-    def test1_basicstats(self,ds) :
-        """print basics stats on a dataset, like length"""
+        ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False)
+        ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1),
+                                   ['x','y','z'],
+                                   minibatch_mode=True)
 
-        print 'len(ds) = ',len(ds)
-        print 'num fields = ', len(ds.fieldNames())
-        print 'types of field: ',
-        for k in ds.fieldNames() :
-            print type(ds[0](k)[0]),
-        print ''
+        test_all(a2,ds2)
+        test_all(a2,ds3)
+
+        del a,ds1,ds2,ds3
 
-    def test2_slicing(self,ds) :
-        """test if slicing works properly"""
-        print 'testing slicing...',
-        sys.stdout.flush()
-        
-        middle = len(ds) / 2
-        tenpercent = int(len(ds) * .1)
-        set1 = ds[:middle+tenpercent]
-        set2 = ds[middle-tenpercent:]
-        for k in range(tenpercent + tenpercent -1):
-            for k2 in ds.fieldNames() :
-                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
-                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
-                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
-                else :
-                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
-        assert tenpercent > 1
-        set3 = ds[middle-tenpercent:middle+tenpercent:2]
-        for k2 in ds.fieldNames() :
-            if type(set2[2](k2)[0]) == N.ndarray :
-                for k3 in range(len(set2[2](k2)[0])) :
-                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
-            else :
-                assert set2[2](k2)[0] == set3[1](k2)[0]
+    def test_FieldsSubsetDataSet(self):
+        a = numpy.random.rand(10,4)
+        ds = ArrayDataSet(a,Example(['x','y','z','w'],[slice(3),3,[0,2],0]))
+        ds = FieldsSubsetDataSet(ds,['x','y','z'])
+
+        test_all(a,ds)
+
+        del a, ds
 
-        print 'done'
+    def test_MinibatchDataSet(self):
+        raise NotImplementedError()
+    def test_HStackedDataSet(self):
+        raise NotImplementedError()
+    def test_VStackedDataSet(self):
+        raise NotImplementedError()
+    def test_ArrayFieldsDataSet(self):
+        raise NotImplementedError()
 
 
-    def test3_fields_iterator_consistency(self,ds) :
-        """ check if the number of iterator corresponds to the number of fields"""
-        print 'testing fields/iterator consistency...',
-        sys.stdout.flush()
-
-        # basic test
-        maxsize = min(len(ds)-1,100)
-        for iter in ds[:maxsize] :
-            assert len(iter) == len(ds.fieldNames())
-        if len(ds.fieldNames()) == 1 :
-            print 'done'
-            return
+class T_Exotic1(unittest.TestCase):
+    class DataSet(DataSet):
+            """ Dummy dataset, where one field is a ndarray of variables size. """
+            def __len__(self) :
+                return 100
+            def fieldNames(self) :
+                return 'input','target','name'
+            def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+                class MultiLengthDataSetIterator(object):
+                    def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
+                        if fieldnames is None: fieldnames = dataset.fieldNames()
+                        self.minibatch = Example(fieldnames,range(len(fieldnames)))
+                        self.dataset, self.minibatch_size, self.current = dataset, minibatch_size, offset
+                    def __iter__(self):
+                        return self
+                    def next(self):
+                        for k in self.minibatch._names :
+                            self.minibatch[k] = []
+                        for ex in range(self.minibatch_size) :
+                            if 'input' in self.minibatch._names:
+                                self.minibatch['input'].append( numpy.array( range(self.current + 1) ) )
+                            if 'target' in self.minibatch._names:
+                                self.minibatch['target'].append( self.current % 2 )
+                            if 'name' in self.minibatch._names:
+                                self.minibatch['name'].append( str(self.current) )
+                            self.current += 1
+                        return self.minibatch
+                return MultiLengthDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+    
+    def test_ApplyFunctionDataSet(self):
+        ds = T_Exotic1.DataSet()
+        dsa = ApplyFunctionDataSet(ds,lambda x,y,z: (x[-1],y*10,int(z)),['input','target','name'],minibatch_mode=False) #broken!!!!!!
+        for k in range(len(dsa)):
+            res = dsa[k]
+            self.failUnless(ds[k]('input')[0][-1] == res('input')[0] , 'problem in first applied function')
+        res = dsa[33:96:3]
+          
+    def test_CachedDataSet(self):
+        ds = T_Exotic1.DataSet()
+        dsc = CachedDataSet(ds)
+        for k in range(len(dsc)) :
+            self.failUnless(numpy.all( dsc[k]('input')[0] == ds[k]('input')[0] ) , (dsc[k],ds[k]) )
+        res = dsc[:]
 
-        # with minibatches iterator
-        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
-        for iter in ds2 :
-            assert len(iter) == 2
-
-        print 'done'
-
+if __name__=='__main__':
+    if len(sys.argv)==2:
+        if sys.argv[1]=="--debug":
+            module = __import__("_test_dataset")
+            tests = unittest.TestLoader().loadTestsFromModule(module)
+            tests.debug()
+        else:
+            print "bad argument: only --debug is accepted"
+    elif len(sys.argv)==1:
+        unittest.main()
+    else:
+        print "bad argument: only --debug is accepted"
 
-
-
-
-###################################################################
-# main
-if __name__ == '__main__':
-    unittest.main()
-    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_filetensor.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,116 @@
+from filetensor import *
+import filetensor
+
+import unittest
+import os
+
+class T(unittest.TestCase):
+    fname = '/tmp/some_mat'
+
+    def setUp(self):
+        #TODO: test that /tmp/some_mat does not exist
+        try:
+            os.stat(self.fname)
+        except OSError:
+            return #assume file was not found
+        raise Exception('autotest file "%s" exists!' % self.fname)
+
+    def tearDown(self):
+        os.remove(self.fname)
+
+    def test_file(self):
+        gen = numpy.random.rand(1)
+        f = file(self.fname, 'w');
+        write(f, gen)
+        f.flush()
+        f = file(self.fname, 'r');
+        mat = read(f, None, debug=False) #load from filename
+        self.failUnless(gen.shape == mat.shape)
+        self.failUnless(numpy.all(gen == mat))
+
+    def test_filename(self):
+        gen = numpy.random.rand(1)
+        write(self.fname, gen)
+        mat = read(self.fname, None, debug=False) #load from filename
+        self.failUnless(gen.shape == mat.shape)
+        self.failUnless(numpy.all(gen == mat))
+
+    def testNd(self):
+        """shape and values are stored correctly for tensors of rank 0 to 5"""
+        whole_shape = [5, 6, 7, 8, 9]
+        for i in xrange(5):
+            gen = numpy.asarray(numpy.random.rand(*whole_shape[:i]))
+            f = file(self.fname, 'w');
+            write(f, gen)
+            f.flush()
+            f = file(self.fname, 'r');
+            mat = read(f, None, debug=False) #load from filename
+            self.failUnless(gen.shape == mat.shape)
+            self.failUnless(numpy.all(gen == mat))
+
+    def test_dtypes(self):
+        """shape and values are stored correctly for all dtypes """
+        for dtype in filetensor._dtype_magic:
+            gen = numpy.asarray(
+                    numpy.random.rand(4, 5, 2, 1) * 100,
+                    dtype=dtype)
+            f = file(self.fname, 'w');
+            write(f, gen)
+            f.flush()
+            f = file(self.fname, 'r');
+            mat = read(f, None, debug=False) #load from filename
+            self.failUnless(gen.dtype == mat.dtype)
+            self.failUnless(gen.shape == mat.shape)
+            self.failUnless(numpy.all(gen == mat))
+
+    def test_dtype_invalid(self):
+        gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype
+        f = file(self.fname, 'w')
+        passed = False
+        try:
+            write(f, gen)
+        except TypeError, e:
+            if e[0].startswith('Invalid ndarray dtype'):
+                passed = True
+        f.close()
+        self.failUnless(passed)
+        
+
+if __name__ == '__main__':
+    unittest.main()
+
+    #a small test script, starts by reading sys.argv[1]
+    #print 'rval', rval.shape, rval.size
+
+    if 0:
+        write(f, rval)
+        print ''
+        f.close()
+        f = file('/tmp/some_mat', 'r');
+        rval2 = read(f) #load from file handle
+        print 'rval2', rval2.shape, rval2.size
+
+        assert rval.dtype == rval2.dtype
+        assert rval.shape == rval2.shape
+        assert numpy.all(rval == rval2)
+        print 'ok'
+
+    def _unused():
+        f.seek(0,2) #seek to end
+        f_len =  f.tell()
+        f.seek(f_data_start,0) #seek back to where we were
+
+        if debug: print 'length:', f_len
+
+
+        f_data_bytes = (f_len - f_data_start)
+
+        if debug: print 'data bytes according to header: ', dim_size * elsize
+        if debug: print 'data bytes according to file  : ', f_data_bytes
+
+        if debug: print 'reading data...'
+        sys.stdout.flush()
+
+    def read_ndarray(f, dim, dtype):
+        return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim)
+
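The tests above exercise filetensor's write/read round trip through both open file handles and plain filenames. A minimal sketch of the same round trip outside unittest, assuming the filetensor module from this changeset is importable; the temporary path is illustrative:

import numpy
from filetensor import read, write

mat = numpy.random.rand(3, 4)                      # any supported dtype
write('/tmp/roundtrip_mat', mat)                   # write() accepts a filename or an open file
loaded = read('/tmp/roundtrip_mat', None, debug=False)
assert loaded.shape == mat.shape and numpy.all(loaded == mat)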
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_lookup_list.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,24 @@
+from lookup_list import *
+import unittest
+
+class T_LookUpList(unittest.TestCase):
+    def test_LookupList(self):
+        #test only the example in the doc???
+        example = LookupList(['x','y','z'],[1,2,3])
+        example['x'] = [1, 2, 3] # set or change a field
+        x, y, z = example
+        x = example[0]
+        x = example["x"]
+        assert example.keys()==['x','y','z']
+        assert example.values()==[[1,2,3],2,3]
+        assert example.items()==[('x',[1,2,3]),('y',2),('z',3)]
+        example.append_keyval('u',0) # adds item with name 'u' and value 0
+        assert len(example)==4 # number of items = 4 here
+        example2 = LookupList(['v','w'], ['a','b'])
+        example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b'])
+        assert example+example2==example3
+        self.assertRaises(AssertionError,example.__add__,example)
+        del example, example2, example3, x, y ,z
+
+if __name__=='__main__':
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_nnet_ops.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,41 @@
+
+import unittest
+import theano._test_tensor as TT
+import numpy
+
+from nnet_ops import *
+
+class T_sigmoid(unittest.TestCase):
+    def setUp(self):
+        numpy.random.seed(9999)
+    def test_elemwise(self):
+        TT.verify_grad(self, sigmoid, [numpy.random.rand(3,4)])
+
+class T_softplus(unittest.TestCase):
+    def setUp(self):
+        numpy.random.seed(9999)
+    def test_elemwise(self):
+        TT.verify_grad(self, softplus, [numpy.random.rand(3,4)])
+
+class T_CrossentropySoftmax1Hot(unittest.TestCase):
+    def setUp(self):
+        numpy.random.seed(9999)
+    def test0(self):
+        y_idx = [0,1,3]
+        class Dummy(object):
+            def make_node(self, a,b):
+                return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0:1]
+        TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4),
+            numpy.random.rand(4)])
+
+    def test1(self):
+        y_idx = [0,1,3]
+        class Dummy(object):
+            def make_node(self, a):
+                return crossentropy_softmax_1hot(a, y_idx)[0:1]
+        TT.verify_grad(self, Dummy(), [numpy.random.rand(3,4)])
+
+
+
+if __name__ == '__main__':
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amat.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,123 @@
+"""load PLearn AMat files"""
+
+import sys, numpy, array
+
+path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
+
+
+class AMat:
+    """DataSource to access a plearn amat file as a periodic unrandomized stream.
+
+    Attributes:
+
+    input -- minibatch of input
+    target -- minibatch of target
+    weight -- minibatch of weight
+    extra -- minibatch of extra
+
+    all -- the entire data contents of the amat file
+    n_examples -- the number of training examples in the file
+
+    AMat stands for Ascii Matri[x,ces]
+
+    """
+
+    marker_size = '#size:'
+    marker_sizes = '#sizes:'
+    marker_col_names = '#:'
+
+    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
+
+        """Load the amat at <path> into memory.
+        
+        path - str: location of amat file
+        head - int: stop reading after this many data rows
+        update_interval - int: print '.' to ofile every <this many> lines
+        ofile - file: print status, msgs, etc. to this file
+
+        """
+        self.all = None
+        self.input = None
+        self.target = None
+        self.weight = None
+        self.extra = None
+
+        self.header = False
+        self.header_size = None
+        self.header_rows = None
+        self.header_cols = None
+        self.header_sizes = None
+        self.header_col_names = []
+
+        data_started = False
+        data = array.array('d')
+        
+        f = open(path)
+        n_data_lines = 0
+        len_float_line = None
+
+        for i,line in enumerate(f):
+            if n_data_lines == head:
+                #we've read enough data, 
+                # break even if there's more in the file
+                break
+            if len(line) == 0 or line == '\n':
+                continue
+            if line[0] == '#':
+                if not data_started:
+                    #the condition means that the file has a header, and we're on 
+                    # some header line
+                    self.header = True
+                    if line.startswith(AMat.marker_size):
+                        info = line[len(AMat.marker_size):]
+                        self.header_size = [int(s) for s in info.split()]
+                        self.header_rows, self.header_cols = self.header_size
+                    if line.startswith(AMat.marker_col_names):
+                        info = line[len(AMat.marker_col_names):]
+                        self.header_col_names = info.split()
+                    elif line.startswith(AMat.marker_sizes):
+                        info = line[len(AMat.marker_sizes):]
+                        self.header_sizes = [int(s) for s in info.split()]
+            else:
+                #the first non-commented line tells us that the header is done
+                data_started = True
+                float_line = [float(s) for s in line.split()]
+                if len_float_line is None:
+                    len_float_line = len(float_line)
+                    if (self.header_cols is not None) \
+                            and self.header_cols != len_float_line:
+                        print >> sys.stderr, \
+                                'WARNING: header declared %i cols but first line has %i, using %i' % \
+                                (self.header_cols, len_float_line, len_float_line)
+                else:
+                    if len_float_line != len(float_line):
+                        raise IOError('wrong line length', i, line)
+                data.extend(float_line)
+                n_data_lines += 1
+
+                if update_interval > 0 and (ofile is not None) \
+                        and n_data_lines % update_interval == 0:
+                    ofile.write('.')
+                    ofile.flush()
+
+        if update_interval > 0:
+            ofile.write('\n')
+        f.close()
+
+        # convert from array.array to numpy.ndarray
+        nshape = (len(data) / len_float_line, len_float_line)
+        self.all = numpy.frombuffer(data).reshape(nshape)
+        self.n_examples = self.all.shape[0]
+
+        # assign
+        if self.header_sizes is not None:
+            if len(self.header_sizes) > 4:
+                print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path
+            leftmost = 0
+            #here we make use of the fact that if header_sizes has len < 4
+            # the loop will exit before 4 iterations
+            attrlist = ['input', 'target', 'weight', 'extra']
+            for attr, ncols in zip(attrlist, self.header_sizes): 
+                setattr(self, attr, self.all[:, leftmost:leftmost+ncols])
+                leftmost += ncols
+
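A brief usage sketch for the AMat loader above. The path and head value are hypothetical, and the per-section attributes are only populated when the file carries a '#sizes:' header:

from amat import AMat

data = AMat('/path/to/some_file.amat', head=1000)   # hypothetical path; head limits rows read
print data.n_examples, data.all.shape
if data.input is not None:                          # set only when a '#sizes:' header was present
    print data.input.shape, data.target.shape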
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/autotest.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,54 @@
+import unittest, os, sys, traceback
+
+def test_root_dir(debugmode=False):
+    suite = None
+    filenames = os.listdir('.')
+    for filename in filenames:
+        if filename[-3:] == '.py' and filename.startswith('_test'):
+            #print >>sys.stderr, 'Loading', modname
+            modname = filename[0:-3]
+
+            try:
+                module = __import__(modname)
+            except Exception, e:
+                print >>sys.stderr, "===================================================="
+                print >>sys.stderr, "Failed to load %s.py" % modname
+                print >>sys.stderr, "===================================================="
+                traceback.print_exc()
+                print >>sys.stderr, "===================================================="
+                continue
+                
+            tests = unittest.TestLoader().loadTestsFromModule(module)
+            if tests.countTestCases() > 0:
+                print >>sys.stderr, 'Testing', modname
+                if suite is None:
+                    suite = tests
+                else:
+                    suite.addTests(tests)
+    if suite is None:
+        print >>sys.stderr, "No suite found"
+        sys.exit(1)
+    if debugmode:
+        suite.debug()
+    else:
+        unittest.TextTestRunner(verbosity=1).run(suite)
+
+if __name__ == '__main__':
+
+    def printUsage():
+        print >>sys.stderr, "Bad argument: ",sys.argv
+        print >>sys.stderr, "only --debug is supported"
+        sys.exit(1)
+    debugparam=""
+
+    if len(sys.argv)==2:
+        if sys.argv[1]=="--debug":
+            debugparam="--debug"
+            sys.argv.remove(debugparam)
+        else:
+            printUsage()
+    elif len(sys.argv)>2:
+        printUsage()
+
+    test_root_dir(debugparam!="")
+
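autotest.py collects every _test*.py module in the current directory into a single unittest suite. A short sketch of driving it programmatically, equivalent to running "python autotest.py" (with --debug switching to suite.debug()):

import autotest

# discover and run all _test*.py suites found in the current directory
autotest.test_root_dir(debugmode=False)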
--- a/dataset.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/dataset.py	Mon Jun 16 17:47:36 2008 -0400
@@ -1,6 +1,5 @@
 
-from lookup_list import LookupList
-Example = LookupList
+from lookup_list import LookupList as Example
 from misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
@@ -38,7 +37,6 @@
         else:
             return [self.__getattribute__(name) for name in attribute_names]
     
-    
 class DataSet(AttributesHolder):
     """A virtual base class for datasets.
 
@@ -101,17 +99,24 @@
     of examples) can be extracted. These operations are not supported
     by default in the case of streams.
 
-     - dataset[:n] returns a dataset with the n first examples.
+     - dataset[:n] returns an Example with the n first examples.
 
-     - dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s.
+     - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s.
 
      - dataset[i] returns an Example.
 
-     - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
+     - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in.
+
+    A similar set of commands gives you a DataSet instead of Examples:
+
+     - dataset.subset[:n] returns a DataSet with the n first examples.
 
-     - dataset[fieldname] an iterable over the values of the field fieldname across
-     the dataset (the iterable is obtained by default by calling valuesVStack
-     over the values for individual examples).
+     - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s.
+
+     - dataset.subset[i] returns a DataSet.
+
+     - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in.
+
 
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
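A condensed sketch of the indexing semantics documented in the hunk above, mirroring what _test_dataset.py's test_getitem checks (the array and field layout are taken from those tests):

import numpy
from dataset import ArrayDataSet

a  = numpy.random.rand(10, 4)
ds = ArrayDataSet(a, {'x': slice(3), 'y': 3, 'z': [0, 2]})

example = ds[5]             # a single Example
head    = ds[:3]            # an Example/LookupList over the first 3 examples
sub     = ds.subset[1:7:2]  # a DataSet containing examples 1, 3, 5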
@@ -151,9 +156,9 @@
        - __len__ if it is not a stream
        - fieldNames
        - minibatches_nowrap (called by DataSet.minibatches())
+    For efficiency of implementation, a sub-class might also want to redefine
        - valuesHStack
        - valuesVStack
-    For efficiency of implementation, a sub-class might also want to redefine
        - hasFields
        - __getitem__ may not be feasible with some streams
        - __iter__
@@ -167,17 +172,56 @@
     numpy_vstack = lambda fieldname,values: numpy.vstack(values)
     numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
         
-    def __init__(self,description=None,fieldtypes=None):
-        if description is None:
-            # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
-            description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
-        self.description=description
-        self.fieldtypes=fieldtypes
+    def __init__(self, description=None, fieldnames=None, fieldtypes=None):
+        """
+        @type fieldnames: list of strings
+        @type fieldtypes: list of python types, same length as fieldnames
+        @type description: string 
+        @param description: description/name for this dataset
+        """
+        def default_desc():
+            return type(self).__name__ \
+                    + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
+
+        #self.fieldnames = fieldnames
+
+        self.fieldtypes = fieldtypes if fieldtypes is not None \
+                else [None]*1 #len(fieldnames)
+
+        self.description =  default_desc() if description is None \
+                else description
         self._attribute_names = ["description"]
-        if fieldtypes:
-            self._attribute_names.append("fieldtypes")
+
+
+    attributeNames = property(lambda self: copy.copy(self._attribute_names))
+
+    def __contains__(self, fieldname):
+        return (fieldname in self.fieldNames()) \
+                or (fieldname in self.attributeNames())
+
+    def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
 
-    def attributeNames(self): return self._attribute_names
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self.  Every field of "i" will give access to
+        a field of a single example.  Fields should be accessible via
+        i["fielname"] or i[3] (in the order defined by the elements of the
+        Example returned by this iterator), but the derived class is free
+        to accept any type of identifier, and add extra functionality to the iterator.
+
+        The default implementation calls the minibatches iterator and extracts the first example of each field.
+        """
+        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
+
+    def __len__(self):
+        """
+        len(dataset) returns the number of examples in the dataset.
+        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
+        Sub-classes which implement finite-length datasets should redefine this method.
+        Some methods only make sense for finite-length datasets.
+        """
+        return None
+
 
     class MinibatchToSingleExampleIterator(object):
         """
@@ -196,7 +240,13 @@
         def next(self):
             size1_minibatch = self.minibatch_iterator.next()
             if not self.minibatch:
-                self.minibatch = Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
+                names = size1_minibatch.keys()
+                # the next lines are a hack; there was a problem when we were getting [array(327)] for instance
+                try:
+                    values = [value[0] for value in size1_minibatch.values()]
+                except :
+                    values = [value for value in size1_minibatch.values()]
+                self.minibatch = Example(names,values)
             else:
                 self.minibatch._values = [value[0] for value in size1_minibatch.values()]
             return self.minibatch
@@ -204,24 +254,6 @@
         def next_index(self):
             return self.minibatch_iterator.next_index()
 
-    def __iter__(self):
-        """Supports the syntax "for i in dataset: ..."
-
-        Using this syntax, "i" will be an Example instance (or equivalent) with
-        all the fields of DataSet self.  Every field of "i" will give access to
-        a field of a single example.  Fields should be accessible via
-        i["fielname"] or i[3] (in the order defined by the elements of the
-        Example returned by this iterator), but the derived class is free
-        to accept any type of identifier, and add extra functionality to the iterator.
-
-        The default implementation calls the minibatches iterator and extracts the first example of each field.
-        """
-        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
-
-    def __contains__(self, fieldname):
-        return (fieldname in self.fieldNames()) \
-                or (fieldname in self.attributeNames())
-
     class MinibatchWrapAroundIterator(object):
         """
         An iterator for minibatches that handles the case where we need to wrap around the
@@ -238,9 +270,8 @@
             self.n_batches=n_batches
             self.n_batches_done=0
             self.next_row=offset
-            self.offset=offset
             self.L=len(dataset)
-            assert offset+minibatch_size<=self.L
+            self.offset=offset % self.L
             ds_nbatches =  (self.L-self.next_row)/self.minibatch_size
             if n_batches is not None:
                 ds_nbatches = min(n_batches,ds_nbatches)
@@ -248,8 +279,7 @@
                 assert dataset.hasFields(*fieldnames)
             else:
                 self.fieldnames=dataset.fieldNames()
-            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
-                                                            ds_nbatches,self.next_row)
+            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
 
         def __iter__(self):
             return self
@@ -278,7 +308,7 @@
                     first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                     second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
                     minibatch = Example(self.fieldnames,
-                                        [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
+                                        [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
                                          for name in self.fieldnames])
             self.next_row=upper
             self.n_batches_done+=1
@@ -322,7 +352,7 @@
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         The minibatches iterator is expected to return upon each call to next()
-        a DataSetFields object, which is a LookupList (indexed by the field names) whose
+        a DataSetFields object, which is an Example (indexed by the field names) whose
         elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
         a sub-dataset that can be used to iterate over the individual examples
         in the minibatch. Hence a minibatch can be converted back to a regular
@@ -338,6 +368,7 @@
         On every iteration, the variables i1, i2, i3 will have
         exactly minibatch_size elements. e.g. len(i1) == minibatch_size
 
+        @DEPRECATED n_batches : not used anywhere
         - n_batches (integer, default None)
         The iterator will loop exactly this many times, and then stop.  If None,
         the derived class can choose a default.  If (-1), then the returned
@@ -349,8 +380,17 @@
         Note: A list-like container is something like a tuple, list, numpy.ndarray or
         any other object that supports integer indexing and slicing.
 
+        @ATTENTION: minibatches now delegates to minibatches_nowrap, which is expected to return
+        only complete batches and then raise StopIteration
+
         """
-        return DataSet.MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        #return DataSet.MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)\
+        assert offset >= 0
+        assert offset < len(self)
+        assert offset + minibatch_size -1 < len(self)
+        if fieldnames == None :
+            fieldnames = self.fieldNames()
+        return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         """
@@ -366,15 +406,6 @@
         """
         raise AbstractFunction()
 
-    def __len__(self):
-        """
-        len(dataset) returns the number of examples in the dataset.
-        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
-        Sub-classes which implement finite-length datasets should redefine this method.
-        Some methods only make sense for finite-length datasets.
-        """
-        return maxint
-
     def is_unbounded(self):
         """
         Tests whether a dataset is unbounded (e.g. a stream).
@@ -412,60 +443,128 @@
         """
         return DataSetFields(self,fieldnames)
 
+    def getitem_key(self, fieldname):
+        """A not-so-well thought-out place to put code that used to be in
+        getitem.
+        """
+        #removing as per discussion June 4. --JSB
+
+        i = fieldname
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
+
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset.
-        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
-        dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
-        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
-        dataset['key'] returns a property associated with the given 'key' string.
-        If 'key' is a fieldname, then the VStacked field values (iterable over
-        field values) for that field is returned. Other keys may be supported
-        by different dataset subclasses. The following key names are encouraged:
-          - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+        @rtype: Example 
+        @returns: single or multiple examples
 
-        Note that some stream datasets may be unable to implement random access, i.e.
-        arbitrary slicing/indexing
-        because they can only iterate through examples one or a minibatch at a time
-        and do not actually store or keep past (or future) examples.
+        @type i: integer or slice or <iterable> of integers
+        @param i:
+            dataset[i] returns the (i+1)-th example of the dataset.
+            dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
+            dataset[i:j:s] returns a LookupList with examples i,i+s,...,j-s.
+            dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.
+
+        @note:
+        Some stream datasets may be unable to implement random access, i.e.
+        arbitrary slicing/indexing because they can only iterate through
+        examples one or a minibatch at a time and do not actually store or keep
+        past (or future) examples.
 
         The default implementation of getitem uses the minibatches iterator
         to obtain one example, one slice, or a list of examples. It may not
         always be the most efficient way to obtain the result, especially if
         the data are actually stored in a memory array.
         """
-        # check for an index
+
         if type(i) is int:
-            return DataSet.MinibatchToSingleExampleIterator(
-                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
-        rows=None
-        # or a slice
+            assert i >= 0 # TBM: see if someone complains and want negative i
+            if i >= len(self) :
+                raise IndexError
+            i_batch = self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=i)
+            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
+
+        #if i is a contiguous slice
+        if type(i) is slice and (i.step in (None, 1)):
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            upper_bound = min(len(self) , upper_bound)
+            #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
+            #        minibatch_size=upper_bound - offset,
+            #        n_batches=1,
+            #        offset=offset).next())
+            # now returns a LookupList
+            return self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=upper_bound - offset,
+                    n_batches=1,
+                    offset=offset).next()
+
+        # if slice has a step param, convert it to list and handle it with the
+        # list code
         if type(i) is slice:
-            #print 'i=',i
-            if not i.start: i=slice(0,i.stop,i.step)
-            if not i.stop: i=slice(i.start,len(self),i.step)
-            if not i.step: i=slice(i.start,i.stop,1)
-            if i.step is 1:
-                return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
-            rows = range(i.start,i.stop,i.step)
-        # or a list of indices
-        elif type(i) is list:
-            rows = i
-        if rows is not None:
-            examples = [self[row] for row in rows]
-            fields_values = zip(*examples)
-            return MinibatchDataSet(
-                Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
-                                            for fieldname,field_values
-                                            in zip(self.fieldNames(),fields_values)]),
-                self.valuesVStack,self.valuesHStack)
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            upper_bound = min(len(self) , upper_bound)
+            i = list(range(offset, upper_bound, i.step))
+
+        # handle tuples, arrays, lists
+        if hasattr(i, '__getitem__'):
+            for idx in i:
+                #dis-allow nested slices
+                if not isinstance(idx, int):
+                    raise TypeError(idx)
+                if idx >= len(self) :
+                    raise IndexError
+            # fetch each requested example via minibatches_nowrap
+            examples = [self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=ii).next()
+                    for ii in i]
+            # re-index the fields in each example by field instead of by example
+            field_values = [[] for blah in  self.fieldNames()]
+            for e in examples:
+                for f,v in zip(field_values, e):
+                    f.append(v)
+            #build them into a LookupList (a.k.a. Example)
+            zz = zip(self.fieldNames(),field_values)
+            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
+            example = Example(self.fieldNames(), vst)
+            #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
+            # now returns a LookupList
+            return example
+
+        # what in the world is i?
+        raise TypeError(i, type(i))
+
+
+    """
+    Enables the call dataset.subset[a:b:c], which returns a DataSet
+    wrapping the examples returned by __getitem__(slice(a,b,c)).
+
+    @see: DataSet.__getsubset(self)
+    """
+    subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet")
+
+
+    def __getsubset(self) :
+        """
+        Enables the call data.subset[a:b:c], returning a DataSet.
+        The default implementation is a simple wrapper around __getitem__() using MinibatchDataSet.
+
+        @return: DataSet
+        @see: DataSet.subset = property(lambda s : s.__getsubset())
+        """
+        _self = self
+        class GetSliceReturnsDataSet(object) :
+            def __getitem__(self,slice) :
+                return MinibatchDataSet(_self.__getitem__(slice))
+        return GetSliceReturnsDataSet()
+
+
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -488,24 +587,21 @@
         # the default implementation of horizontal stacking is to put values in a list
         return fieldvalues
 
-
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken 
+        @type fieldname: any type 
+
+        @param values: the field values to concatenate (e.g. the pieces taken near the end and the beginning of the dataset when a minibatch wraps around)
+        @type values: list of minibatches (returned by minibatches_nowrap)
+
+        @return: the concatenation (stacking) of the values 
+        @rtype: something suitable as a minibatch field 
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for v in values:
+            rval.extend(v)
+        return rval
 
     def __or__(self,other):
         """
@@ -581,11 +677,54 @@
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         assert self.hasFields(*fieldnames)
         return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         return FieldsSubsetDataSet(self.src[i],self.fieldnames)
     
-        
-class DataSetFields(LookupList):
+class RenamedFieldsDataSet(DataSet):
+    """
+    A sub-class of L{DataSet} that selects and renames a subset of the fields.
+    """
+    def __init__(self,src,src_fieldnames,new_fieldnames):
+        self.src=src
+        self.src_fieldnames=src_fieldnames
+        self.new_fieldnames=new_fieldnames
+        assert src.hasFields(*src_fieldnames)
+        assert len(src_fieldnames)==len(new_fieldnames)
+        self.valuesHStack = src.valuesHStack
+        self.valuesVStack = src.valuesVStack
+
+    def __len__(self): return len(self.src)
+    
+    def fieldNames(self):
+        return self.new_fieldnames
+
+    def __iter__(self):
+        class FieldsSubsetIterator(object):
+            def __init__(self,ds):
+                self.ds=ds
+                self.src_iter=ds.src.__iter__()
+                self.example=None
+            def __iter__(self): return self
+            def next(self):
+                complete_example = self.src_iter.next()
+                if self.example:
+                    self.example._values=[complete_example[field]
+                                          for field in self.ds.src_fieldnames]
+                else:
+                    self.example=Example(self.ds.new_fieldnames,
+                                         [complete_example[field]
+                                          for field in self.ds.src_fieldnames])
+                return self.example
+        return FieldsSubsetIterator(self)
+
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        assert self.hasFields(*fieldnames)
+        return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
+    def __getitem__(self,i):
+        return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
+
+
+class DataSetFields(Example):
     """
     Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
@@ -616,15 +755,16 @@
         original_dataset=dataset
         if not fieldnames:
             fieldnames=dataset.fieldNames()
-        elif not fieldnames==dataset.fieldNames():
+        elif not list(fieldnames)==list(dataset.fieldNames()):
+            #we must cast to list, otherwise ('x','y') != ['x','y']
             dataset = FieldsSubsetDataSet(dataset,fieldnames)
         assert dataset.hasFields(*fieldnames)
         self.dataset=dataset
 
         if isinstance(dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,list(dataset._fields))
+            Example.__init__(self,fieldnames,list(dataset._fields))
         elif isinstance(original_dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,
+            Example.__init__(self,fieldnames,
                                 [original_dataset._fields[field]
                                  for field in fieldnames])
         else:
@@ -632,7 +772,7 @@
                                                      minibatch_size=len(dataset),
                                                      n_batches=1)
             minibatch=minibatch_iterator.next()
-            LookupList.__init__(self,fieldnames,minibatch)
+            Example.__init__(self,fieldnames,minibatch)
         
     def examples(self):
         return self.dataset
@@ -654,7 +794,7 @@
     
 class MinibatchDataSet(DataSet):
     """
-    Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
+    Turn an L{Example} of same-length (iterable) fields into an example-iterable dataset.
     Each element of the lookup-list should be an iterable and sliceable, all of the same length.
     """
     def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
@@ -674,14 +814,15 @@
                 print 'len(field) = ', len(field)
                 print 'self._fields.keys() = ', self._fields.keys()
                 print 'field=',field
+                print 'fields_lookuplist=', fields_lookuplist
             assert self.length==len(field)
-        self.values_vstack=values_vstack
-        self.values_hstack=values_hstack
+        self.valuesVStack=values_vstack
+        self.valuesHStack=values_hstack
 
     def __len__(self):
         return self.length
 
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         if type(i) in (slice,list):
             return DataSetFields(MinibatchDataSet(
                 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
@@ -711,7 +852,7 @@
 
                 self.ds=ds
                 self.next_example=offset
-                assert minibatch_size > 0
+                assert minibatch_size >= 0
                 if offset+minibatch_size > ds.length:
                     raise NotImplementedError()
             def __iter__(self):
@@ -735,12 +876,6 @@
         # tbm: added fieldnames to handle subset of fieldnames
         return Iterator(self,fieldnames)
 
-    def valuesVStack(self,fieldname,fieldvalues):
-        return self.values_vstack(fieldname,fieldvalues)
-    
-    def valuesHStack(self,fieldnames,fieldvalues):
-        return self.values_hstack(fieldnames,fieldvalues)
-    
 class HStackedDataSet(DataSet):
     """
     A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
@@ -804,7 +939,7 @@
                 return self
             def next(self):
                 # concatenate all the fields of the minibatches
-                l=LookupList()
+                l=Example()
                 for iter in self.iterators:
                     l.append_lookuplist(iter.next())
                 return l
@@ -828,10 +963,10 @@
         return HStackedIterator(self,iterators)
 
 
-    def valuesVStack(self,fieldname,fieldvalues):
+    def untested_valuesVStack(self,fieldname,fieldvalues):
         return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)
     
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """
         We will use the sub-dataset associated with the first fieldname in the fieldnames list
         to do the work, hoping that it can cope with the other values (i.e. won't care
@@ -953,25 +1088,85 @@
     Virtual super-class of datasets whose field values are numpy array,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def untested_valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
         return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
+
+
+
+class NArraysDataSet(ArrayFieldsDataSet) :
+    """
+    An NArraysDataSet stores fields that are numpy tensors, whose first axis
+    iterates over examples. It is a generalization of ArrayDataSet.
+    """
+    #@TODO not completely implemented yet
+    def __init__(self, data_arrays, fieldnames, **kwargs) :
+        """
+        Construct an NArraysDataSet from a list of numpy tensors (data_arrays) and a list
+        of fieldnames. The number of arrays must be the same as the number of
+        fieldnames. Each tensor must have the same first dimension (first
+        axis), corresponding to the number of examples.
+
+        Every tensor is converted to a numpy array (using numpy.asarray).
+        """
+        ArrayFieldsDataSet.__init__(self,**kwargs)
+        assert len(data_arrays) == len(fieldnames)
+        assert len(fieldnames) > 0
+        ndarrays = [numpy.asarray(a) for a in data_arrays]
+        lens = [a.shape[0] for a in ndarrays]
+        num_examples = lens[0] #they must all be equal anyway
+        self._fieldnames = fieldnames
+        for k in ndarrays :
+            assert k.shape[0] == num_examples
+        self._datas = ndarrays
+        # create dict 
+        self.map_field_idx = dict()
+        for k in range(len(fieldnames)):
+            self.map_field_idx[fieldnames[k]] = k
+
+
+    def __len__(self) :
+        """
+        The length of the dataset is the size of the first dimension of the first array (data_arrays[0]).
+        """
+        return self._datas[0].shape[0]
+
+    def fieldNames(self) :
+        """
+        Returns the fieldnames as set in self.__init__
+        """
+        return self._fieldnames
+
+    def field_pos(self,fieldname) :
+        """
+        Returns the index of a given fieldname. The fieldname must exist; see fieldNames().
+        """
+        return self.map_field_idx[fieldname]
+
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        for n in xrange(n_batches):
+            if offset == len(self):
+                break
+            for f in range(len(cursor._names)) :
+                idx = self.field_pos(cursor._names[f])
+                sub_data = self._datas[idx][offset : offset+minibatch_size]
+                cursor._values[f] = sub_data
+            offset += len(sub_data) #can be less than minibatch_size at end
+            yield cursor
+
+        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+
+
+
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -996,7 +1191,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                if 0:
+                if 1:
                     #I changed this because it didn't make sense to me,
                     # and it made it more difficult to write my learner.
                     # If it breaks stuff, let's talk about it.
@@ -1030,22 +1225,18 @@
             return Example(fieldnames,
                            [self.data[key,col] for col in values])
         if type(key) is slice:
-            return MinibatchDataSet(Example(fieldnames,
-                                            [self.data[key,col] for col in values]))
+            return Example(fieldnames,[self.data[key,col] for col in values])
         if type(key) is list:
             for i in range(len(key)):
                 if self.hasFields(key[i]):
                     key[i]=self.fields_columns[key[i]]
-            return MinibatchDataSet(Example(fieldnames,
-                                            #we must separate differently for list as numpy
-                                            # doesn't support self.data[[i1,...],[i2,...]]
-                                            # when their is more then two i1 and i2
-                                            [self.data[key,:][:,col]
-                                             if isinstance(col,list) else
-                                             self.data[key,col] for col in values]),
-
-
-                                    self.valuesVStack,self.valuesHStack)
+            return Example(fieldnames,
+                               #we must handle lists separately, as numpy
+                               # doesn't support self.data[[i1,...],[i2,...]]
+                               # when there are more than two i1 and i2
+                               [self.data[key,:][:,col]
+                               if isinstance(col,list) else
+                               self.data[key,col] for col in values])
 
         # else check for a fieldname
         if self.hasFields(key):
@@ -1054,55 +1245,46 @@
         assert key in self.__dict__ # else it means we are trying to access a non-existing property
         return self.__dict__[key]
         
-    def __iter__(self):
-        class ArrayDataSetIterator2(object):
-            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
+    def dontuse__iter__(self):
+        class ArrayDataSetIteratorIter(object):
+            def __init__(self,dataset,fieldnames):
                 if fieldnames is None: fieldnames = dataset.fieldNames()
                 # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
+                self.minibatch = Example(fieldnames,[0]*len(fieldnames))
                 self.dataset=dataset
-                self.minibatch_size=minibatch_size
-                assert offset>=0 and offset<len(dataset.data)
-                assert offset+minibatch_size<=len(dataset.data)
-                self.current=offset
+                self.current=0
                 self.columns = [self.dataset.fields_columns[f] 
                                 for f in self.minibatch._names]
+                self.l = self.dataset.data.shape[0]
             def __iter__(self):
                 return self
             def next(self):
                 #@todo: we suppose that we need to stop only when minibatch_size == 1.
                 # Otherwise, MinibatchWrapAroundIterator do it.
-                if self.current>=self.dataset.data.shape[0]:
+                if self.current>=self.l:
                     raise StopIteration
                 sub_data =  self.dataset.data[self.current]
                 self.minibatch._values = [sub_data[c] for c in self.columns]
 
-                self.current+=self.minibatch_size
+                self.current+=1
                 return self.minibatch
 
-        return ArrayDataSetIterator2(self,self.fieldNames(),1,0,0)
+        return ArrayDataSetIteratorIter(self,self.fieldNames())
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ArrayDataSetIterator(object):
-            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
-                if fieldnames is None: fieldnames = dataset.fieldNames()
-                # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
-                self.dataset=dataset
-                self.minibatch_size=minibatch_size
-                assert offset>=0 and offset<len(dataset.data)
-                assert offset+minibatch_size<=len(dataset.data)
-                self.current=offset
-            def __iter__(self):
-                return self
-            def next(self):
-                #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
-                sub_data =  self.dataset.data[self.current:self.current+self.minibatch_size]
-                self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
-                self.current+=self.minibatch_size
-                return self.minibatch
+        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        if n_batches is None:
+            n_batches = (len(self) - offset) / minibatch_size
+        for n in xrange(n_batches):
+            if offset == len(self):
+                break
+            sub_data = self.data[offset : offset+minibatch_size]
+            offset += len(sub_data) #can be less than minibatch_size at end
+            cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
+            yield cursor
 
-        return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
 
 
 class CachedDataSet(DataSet):
@@ -1167,7 +1349,7 @@
               return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
       return CacheIterator(self)
 
-  def __getitem__(self,i):
+  def dontuse__getitem__(self,i):
       if type(i)==int and len(self.cached_examples)>i:
           return self.cached_examples[i]
       else:
@@ -1180,7 +1362,7 @@
               self.l = len(dataset)
               self.current = 0
               self.fieldnames = self.dataset.fieldNames()
-              self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+              self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
           def __iter__(self): return self
           def next(self):
               if self.current>=self.l:
@@ -1197,108 +1379,103 @@
       return CacheIteratorIter(self)
 
 class ApplyFunctionDataSet(DataSet):
-  """
-  A L{DataSet} that contains as fields the results of applying a
-  given function example-wise or minibatch-wise to all the fields of
-  an input dataset.  The output of the function should be an iterable
-  (e.g. a list or a LookupList) over the resulting values.
-  
-  The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset.  The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
+    
+    The function takes as input the fields of the dataset, not the examples.
 
-  In minibatch mode, the function is expected to work on minibatches
-  (takes a minibatch in input and returns a minibatch in output). More
-  precisely, it means that each element of the input or output list
-  should be iterable and indexable over the individual example values
-  (typically these elements will be numpy arrays). All of the elements
-  in the input and output lists should have the same length, which is
-  the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
-  The function is applied each time an example or a minibatch is accessed.
-  To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+
+    """
 
-  If the values_{h,v}stack functions are not provided, then
-  the input_dataset.values{H,V}Stack functions are used by default.
-  """
-  def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-               values_hstack=None,values_vstack=None,
-               description=None,fieldtypes=None):
-      """
-      Constructor takes an input dataset that has as many fields as the function
-      expects as inputs. The resulting dataset has as many fields as the function
-      produces as outputs, and that should correspond to the number of output names
-      (provided in a list).
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).
 
-      Note that the expected semantics of the function differs in minibatch mode
-      (it takes minibatches of inputs and produces minibatches of outputs, as
-      documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
 
-      TBM: are filedtypes the old field types (from input_dataset) or the new ones
-      (for the new dataset created)?
-      """
-      self.input_dataset=input_dataset
-      self.function=function
-      self.output_names=output_names
-      self.minibatch_mode=minibatch_mode
-      DataSet.__init__(self,description,fieldtypes)
-      self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-      self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
-
-  def __len__(self):
-      return len(self.input_dataset)
+        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-  def fieldNames(self):
-      return self.output_names
+    def __len__(self):
+        return len(self.input_dataset)
+
+    def fieldNames(self):
+        return self.output_names
 
-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class ApplyFunctionIterator(object):
-          def __init__(self,output_dataset):
-              self.input_dataset=output_dataset.input_dataset
-              self.output_dataset=output_dataset
-              self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                 n_batches=n_batches,offset=offset).__iter__()
+    def minibatches_nowrap(self, fieldnames, *args, **kwargs):
+        all_input_fieldNames = self.input_dataset.fieldNames()
+        mbnw = self.input_dataset.minibatches_nowrap
 
-          def __iter__(self): return self
+        for input_fields in mbnw(all_input_fieldNames, *args, **kwargs):
+            if self.minibatch_mode:
+                all_output_fields = self.function(*input_fields)
+            else:
+                input_examples = zip(*input_fields) #so that [i] means example i
+                output_examples = [self.function(*input_example)
+                                    for input_example in input_examples]
+                all_output_fields = zip(*output_examples)
 
-          def next(self):
-              function_inputs = self.input_iterator.next()
-              all_output_names = self.output_dataset.output_names
-              if self.output_dataset.minibatch_mode:
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              else:
-                  input_examples = zip(*function_inputs)
-                  output_examples = [self.output_dataset.function(*input_example)
-                                     for input_example in input_examples]
-                  function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                      for name,values in zip(all_output_names,
-                                                             zip(*output_examples))]
-              all_outputs = Example(all_output_names,function_outputs)
-              if fieldnames==all_output_names:
-                  return all_outputs
-              return Example(fieldnames,[all_outputs[name] for name in fieldnames])
-
-
-      return ApplyFunctionIterator(self)
+            all_outputs = Example(self.output_names, all_output_fields)
+            #print 'input_fields', input_fields
+            #print 'all_outputs', all_outputs
+            if fieldnames==self.output_names:
+                rval = all_outputs
+            else:
+                rval = Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            #print 'rval', rval
+            #print '--------'
+            yield rval
 
-  def __iter__(self): # only implemented for increased efficiency
-      class ApplyFunctionSingleExampleIterator(object):
-          def __init__(self,output_dataset):
-              self.current=0
-              self.output_dataset=output_dataset
-              self.input_iterator=output_dataset.input_dataset.__iter__()
-          def __iter__(self): return self
-          def next(self):
-              if self.output_dataset.minibatch_mode:
-                  function_inputs = [[input] for input in self.input_iterator.next()]
-                  outputs = self.output_dataset.function(*function_inputs)
-                  assert all([hasattr(output,'__iter__') for output in outputs])
-                  function_outputs = [output[0] for output in outputs]
-              else:
-                  function_inputs = self.input_iterator.next()
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              return Example(self.output_dataset.output_names,function_outputs)
-      return ApplyFunctionSingleExampleIterator(self)
-  
-
+    def untested__iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+    
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary L{DataSet} into one for supervised learning tasks
--- a/denoising_aa.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/denoising_aa.py	Mon Jun 16 17:47:36 2008 -0400
@@ -106,11 +106,14 @@
         self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x')
         
     def __call__(self, training_set=None):
-        """ Allocate and optionnaly train a model"""
+        """ Allocate and optionnaly train a model
+
+        @TODO enables passing in training and valid sets, instead of cutting one set in 80/20
+        """
         model = DenoisingAutoEncoderModel(self)
         if training_set:
             print 'DenoisingAutoEncoder(): what do I do if training_set????'
-            # copied from mlp_factory_approach:
+            # copied from old mlp_factory_approach:
             if len(trainset) == sys.maxint:
                 raise NotImplementedError('Learning from infinite streams is not supported')
             nval = int(self.validation_portion * len(trainset))
--- a/learner.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/learner.py	Mon Jun 16 17:47:36 2008 -0400
@@ -17,7 +17,7 @@
     in machine learning:  an offline learning algorithm is applied
     to a training dataset, 
 
-       model = learning_algorithm(training_set)
+        model = learning_algorithm(training_set)
         
     resulting in a fully trained model that can be applied to another dataset
     in order to perform some desired computation:
@@ -110,7 +110,7 @@
         """
         raise AbstractFunction()
     
-class LearnerModel(LearnedModel):
+class LearnerModel(TrainedModel):
     """
     LearnerModel is a base class for models returned by instances of a LearningAlgorithm subclass.
     It is only given here to define the expected semantics.
--- a/lookup_list.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/lookup_list.py	Mon Jun 16 17:47:36 2008 -0400
@@ -7,21 +7,23 @@
     a dictionary the order of the elements depends not on their key but
     on the order given by the user through construction) so that
     following syntactic constructions work as one would expect::
-       example = LookupList(['x','y','z'],[1,2,3])
-       example['x'] = [1, 2, 3] # set or change a field
-       print example('z','y') # prints [3,2]
-       x, y, z = example
-       x = example[0]
-       x = example["x"]
-       print example.keys() # prints ['x','y','z']
-       print example.values() # prints [[1,2,3],2,3]
-       print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)]
-       example.append_keyval('u',0) # adds item with name 'u' and value 0
-       print len(example) # number of items = 4 here
-       example2 = LookupList(['v', 'w'], ['a','b'])
-       print example+example2 # addition is like for lists, a concatenation of the items.
-       example + example # throw an error as we can't have duplicate name.
+       >>> example = LookupList(['x','y','z'],[1,2,3])
+       >>> example['x'] = [1, 2, 3] # set or change a field
+       >>> print example('z','y') # prints [3,2]
+       >>> x, y, z = example
+       >>> x = example[0]
+       >>> x = example["x"]
+       >>> print example.keys() # prints ['x','y','z']
+       >>> print example.values() # prints [[1,2,3],2,3]
+       >>> print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)]
+       >>> example.append_keyval('u',0) # adds item with name 'u' and value 0
+       >>> print len(example) # number of items = 4 here
+       >>> example2 = LookupList(['v', 'w'], ['a','b'])
+       >>> print example+example2 # addition is like for lists, a concatenation of the items.
+       >>> example + example # throws an error as we can't have duplicate names.
+
     @note: The element names should be unique.
+
     @todo: Convert this documentation into doctest
     format, and actually perform doctest'ing:
     U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/misc_theano.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,20 @@
+
+import theano
+
+class Print(theano.Op):
+    def __init__(self,message=""):
+        self.message=message
+        self.view_map={0:[0]}
+
+    def make_node(self,xin):
+        xout = xin.type.make_result()
+        return theano.Apply(op = self, inputs = [xin], outputs=[xout])
+
+    def perform(self,node,inputs,output_storage):
+        xin, = inputs
+        xout, = output_storage
+        xout[0] = xin
+        print self.message,xin
+
+    def grad(self,input,output_gradients):
+        return output_gradients
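
A minimal usage sketch for the Print op added in misc_theano.py above, assuming the same
2008-era theano idioms used elsewhere in this changeset (building a graph by calling an
Op instance, and theano.function taking lists of inputs and outputs); the variable names
are illustrative only:

    import theano
    from theano import tensor
    from misc_theano import Print

    x = tensor.dvector()
    y = Print("x is")(x)                    # identity on x; prints the value when computed
    f = theano.function([x], [y], linker='py')
    f([1.0, 2.0, 3.0])                      # prints "x is" followed by the array, returns it unchanged
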
--- a/mlp.py	Tue Jun 03 21:27:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,240 +0,0 @@
-"""
-A straightforward classicial feedforward
-one-hidden-layer neural net, with L2 regularization.
-This is one of the simplest example of L{Learner}, and illustrates
-the use of theano.
-"""
-
-from learner import *
-from theano import tensor as t
-from nnet_ops import *
-import math
-from misc import *
-
-def function(inputs, outputs, linker='c&py'):
-    return theano.function(inputs, outputs, unpack_single=False,linker=linker)
-
-def randshape(*shape): return (numpy.random.rand(*shape) -0.5) * 0.001
-
-class ManualNNet(object):
-    def __init__(self, ninputs, nhid, nclass, lr, nepochs, 
-            linker='c&yp', 
-            hidden_layer=None):
-        class Vars:
-            def __init__(self, lr, l2coef=0.0):
-                lr = t.constant(lr)
-                l2coef = t.constant(l2coef)
-                input = t.matrix('input') # n_examples x n_inputs
-                target = t.ivector('target') # n_examples x 1
-                W2 = t.matrix('W2')
-                b2 = t.vector('b2')
-
-                if hidden_layer:
-                    hid, hid_params, hid_ivals, hid_regularization = hidden_layer(input)
-                else:
-                    W1 = t.matrix('W1')
-                    b1 = t.vector('b1')
-                    hid = t.tanh(b1 + t.dot(input, W1))
-                    hid_params = [W1, b1]
-                    hid_regularization = l2coef * t.sum(W1*W1)
-                    hid_ivals = [randshape(ninputs, nhid), randshape(nhid)]
-
-                params = [W2, b2] + hid_params
-                ivals = [randshape(nhid, nclass), randshape(nclass)]\
-                        + hid_ivals
-                nll, predictions = crossentropy_softmax_1hot( b2 + t.dot(hid, W2), target)
-                regularization = l2coef * t.sum(W2*W2) + hid_regularization
-                output_class = t.argmax(predictions,1)
-                loss_01 = t.neq(output_class, target)
-                g_params = t.grad(nll + regularization, params)
-                new_params = [t.sub_inplace(p, lr * gp) for p,gp in zip(params, g_params)]
-                self.__dict__.update(locals()); del self.self
-        self.nhid = nhid
-        self.nclass = nclass
-        self.nepochs = nepochs
-        self.v = Vars(lr)
-        self.params = None
-
-    def update(self, trainset):
-        params = self.v.ivals
-        update_fn = function(
-                [self.v.input, self.v.target] + self.v.params,
-                [self.v.nll] + self.v.new_params)
-        for i in xrange(self.nepochs):
-            for input, target in trainset.minibatches(['input', 'target'],
-                    minibatch_size=min(32, len(trainset))):
-                dummy = update_fn(input, target[:,0], *params)
-                if 0: print dummy[0] #the nll
-        return self.use
-    __call__ = update
-
-    def use(self, dset,
-            output_fieldnames=['output_class'],
-            test_stats_collector=None,
-            copy_inputs=False,
-            put_stats_in_output_dataset=True,
-            output_attributes=[]):
-        inputs = [self.v.input, self.v.target] + self.v.params
-        fn = function(inputs, [getattr(self.v, name) for name in output_fieldnames])
-        target = dset.fields()['target'] if ('target' in dset.fields()) else numpy.zeros((1,1),dtype='int64')
-        return ApplyFunctionDataSet(dset, 
-            lambda input, target: fn(input, target[:,0], *self.v.ivals),
-            output_fieldnames)
-
-
-class OneHiddenLayerNNetClassifier(OnlineGradientTLearner):
-    """
-    Implement a straightforward classicial feedforward
-    one-hidden-layer neural net, with L2 regularization.
-
-    The predictor parameters are obtained by minibatch/online gradient descent.
-    Training can proceed sequentially (with multiple calls to update with
-    different disjoint subsets of the training sets).
-
-    Hyper-parameters:
-      - L2_regularizer
-      - learning_rate
-      - n_hidden
-
-    For each (input_t,output_t) pair in a minibatch,::
-
-       output_activations_t = b2+W2*tanh(b1+W1*input_t)
-       output_t = softmax(output_activations_t)
-       output_class_t = argmax(output_activations_t)
-       class_error_t = 1_{output_class_t != target_t}
-       nll_t = -log(output_t[target_t])
-
-    and the training criterion is::
-
-       loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t
-
-    The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by
-    stochastic minibatch gradient descent::
-
-       parameters[i] -= learning_rate * dloss/dparameters[i]
-       
-    The fields and attributes expected and produced by use and update are the following:
-
-     - Input and output fields (example-wise quantities):
-
-       - 'input' (always expected by use and update)
-       - 'target' (optionally expected by use and always by update)
-       - 'output' (optionally produced by use)
-       - 'output_class' (optionally produced by use)
-       - 'class_error' (optionally produced by use)
-       - 'nll' (optionally produced by use)
-       
-     - optional attributes (optionally expected as input_dataset attributes)
-       (warning, this may be dangerous, the 'use' method will use those provided in the 
-       input_dataset rather than those learned during 'update'; currently no support
-       for providing these to update):
-       
-       - 'L2_regularizer'
-       - 'b1' 
-       - 'W1'
-       - 'b2' 
-       - 'W2'
-       - 'parameters' = [b1, W1, b2, W2]
-       - 'regularization_term'
-
-    """
-    def __init__(self,n_hidden,n_classes,learning_rate,max_n_epochs,L2_regularizer=0,init_range=1.,n_inputs=None,minibatch_size=None,linker='c|py'):
-        self._n_inputs = n_inputs
-        self._n_outputs = n_classes
-        self._n_hidden = n_hidden
-        self._init_range = init_range
-        self._max_n_epochs = max_n_epochs
-        self._minibatch_size = minibatch_size
-        self.learning_rate = learning_rate # this is the float
-        self.L2_regularizer = L2_regularizer
-        self._learning_rate = t.scalar('learning_rate') # this is the symbol
-        self._input = t.matrix('input') # n_examples x n_inputs
-        self._target = t.lmatrix('target') # n_examples x 1
-        self._target_vector = self._target[:,0]
-        self._L2_regularizer = t.scalar('L2_regularizer')
-        self._W1 = t.matrix('W1')
-        self._W2 = t.matrix('W2')
-        self._b1 = t.row('b1')
-        self._b2 = t.row('b2')
-        self._regularization_term = self._L2_regularizer * (t.sum(self._W1*self._W1) + t.sum(self._W2*self._W2))
-        self._output_activations =self._b2+t.dot(t.tanh(self._b1+t.dot(self._input,self._W1.T)),self._W2.T)
-        self._nll,self._output = crossentropy_softmax_1hot(self._output_activations,self._target_vector)
-        self._output_class = t.argmax(self._output,1)
-        self._class_error = t.neq(self._output_class,self._target_vector)
-        self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0]
-        OnlineGradientTLearner.__init__(self, linker = linker)
-            
-    def attributeNames(self):
-        return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"]
-
-    def parameterAttributes(self):
-        return ["b1","W1", "b2", "W2"]
-    
-    def updateMinibatchInputFields(self):
-        return ["input","target"]
-    
-    def updateMinibatchInputAttributes(self):
-        return OnlineGradientTLearner.updateMinibatchInputAttributes(self)+["L2_regularizer"]
-    
-    def updateEndOutputAttributes(self):
-        return ["regularization_term"]
-
-    def lossAttribute(self):
-        return "minibatch_criterion"
-    
-    def defaultOutputFields(self, input_fields):
-        output_fields = ["output", "output_class",]
-        if "target" in input_fields:
-            output_fields += ["class_error", "nll"]
-        return output_fields
-        
-    def updateMinibatch(self,minibatch):
-        MinibatchUpdatesTLearner.updateMinibatch(self,minibatch)
-        #print self.nll
-
-    def allocate(self,minibatch):
-        minibatch_n_inputs  = minibatch["input"].shape[1]
-        if not self._n_inputs:
-            self._n_inputs = minibatch_n_inputs
-            self.b1 = numpy.zeros((1,self._n_hidden))
-            self.b2 = numpy.zeros((1,self._n_outputs))
-            self.forget()
-        elif self._n_inputs!=minibatch_n_inputs:
-            # if the input changes dimension on the fly, we resize and forget everything
-            self.forget()
-            
-    def forget(self):
-        if self._n_inputs:
-            r = self._init_range/math.sqrt(self._n_inputs)
-            self.W1 = numpy.random.uniform(low=-r,high=r,
-                                           size=(self._n_hidden,self._n_inputs))
-            r = self._init_range/math.sqrt(self._n_hidden)
-            self.W2 = numpy.random.uniform(low=-r,high=r,
-                                           size=(self._n_outputs,self._n_hidden))
-            self.b1[:]=0
-            self.b2[:]=0
-            self._n_epochs=0
-
-    def isLastEpoch(self):
-        self._n_epochs +=1
-        return self._n_epochs>=self._max_n_epochs
-
-    def debug_updateMinibatch(self,minibatch):
-        # make sure all required fields are allocated and initialized
-        self.allocate(minibatch)
-        input_attributes = self.names2attributes(self.updateMinibatchInputAttributes())
-        input_fields = minibatch(*self.updateMinibatchInputFields())
-        print 'input attributes', input_attributes
-        print 'input fields', input_fields
-        results = self.update_minibatch_function(*(input_attributes+input_fields))
-        print 'output attributes', self.updateMinibatchOutputAttributes()
-        print 'results', results
-        self.setAttributes(self.updateMinibatchOutputAttributes(),
-                           results)
-
-        if 0:
-            print 'n0', self.names2OpResults(self.updateMinibatchOutputAttributes()+ self.updateMinibatchInputFields())
-            print 'n1', self.names2OpResults(self.updateMinibatchOutputAttributes())
-            print 'n2', self.names2OpResults(self.updateEndInputAttributes())
-            print 'n3', self.names2OpResults(self.updateEndOutputAttributes())
-
--- a/mlp_factory_approach.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/mlp_factory_approach.py	Mon Jun 16 17:47:36 2008 -0400
@@ -1,10 +1,12 @@
-import copy, sys
+import copy, sys, os
 import numpy
 
 import theano
 from theano import tensor as T
 
-from pylearn import dataset, nnet_ops, stopper, LookupList
+import dataset, nnet_ops, stopper, filetensor
+from pylearn.lookup_list import LookupList
+
 
 class AbstractFunction (Exception): pass
 
@@ -35,6 +37,17 @@
             raise Exception('why not called?') 
             return GraphLearner.Model(self.algo, [copy.copy(p) for p in params])
 
+        def __eq__(self,other,tolerance=0.) :
+            """ Only compares weights of matrices and bias vector. """
+            if not isinstance(other,GraphLearner.Model) :
+                return False
+            for p in range(4) :
+                if self.params[p].shape != other.params[p].shape :
+                    return False
+                if not numpy.all( numpy.abs(self.params[p] - other.params[p]) <= tolerance ) :                    
+                    return False
+            return True
+
         def _cache(self, key, valfn):
             d = self._fn_cache
             if key not in d:
@@ -42,17 +55,25 @@
             return d[key]
 
         def update_minibatch(self, minibatch):
+            if not isinstance(minibatch, LookupList):
+                print type(minibatch)
             assert isinstance(minibatch, LookupList)
             self.update_fn(minibatch['input'], minibatch['target'], *self.params)
 
         def update(self, dataset, 
                 default_minibatch_size=32):
-            """Update this model from more training data."""
+            """
+            Update this model from more training data. Uses all the data once, cut
+            into minibatches. No early stopping here.
+            """
             params = self.params
             minibatch_size = min(default_minibatch_size, len(dataset))
             for mb in dataset.minibatches(['input', 'target'], minibatch_size=minibatch_size):
                 self.update_minibatch(mb)
 
+        def save(self, f):
+            self.algo.graph.save(f, self)
+
         def __call__(self, testset, fieldnames=['output_class']):
             """Apply this model (as a function) to new data.
 
@@ -105,12 +126,19 @@
             return theano.gof.PerformLinker()
 
         def early_stopper(self):
-            stopper.NStages(10,1)
+            stopper.NStages(300,1)
         
         def train_iter(self, trainset):
             raise AbstractFunction
         optimizer = Opt()
 
+        def load(self,f) :
+            raise AbstractFunction
+
+        def save(self,f,model) :
+            raise AbstractFunction
+
+
     def __init__(self, graph):
         self.graph = graph
 
@@ -121,12 +149,13 @@
                 unpack_single=False,
                 optimizer=self.graph.optimizer,
                 linker=self.graph.linker() if hasattr(self.graph, 'linker')
-                else 'c&py')
+                else 'c|py')
 
     def __call__(self,
             trainset=None,
             validset=None,
-            iparams=None):
+            iparams=None,
+            stp=None):
         """Allocate and optionally train a model
 
         @param trainset: Data for minimizing the cost function
@@ -141,32 +170,52 @@
         @param target: name of field to use as target
         @type target: string
 
+        @param stp: early stopper, if None use default in graphMLP.G
+        @type stp: None or early stopper
+
         @return: model
         @rtype: GraphLearner.Model instance
         
         """
+        
         iparams = self.graph.iparams() if iparams is None else iparams
+
+        # if loading a saved model, trainset is a filename (str) or a file object
+        if isinstance(trainset,str) or isinstance(trainset,file):
+            #loadmodel = GraphLearner.Model(self, iparams)
+            loadmodel = self.graph.load(self,trainset)
+            return loadmodel
+
         curmodel = GraphLearner.Model(self, iparams)
         best = curmodel
         
         if trainset is not None: 
             #do some training by calling Model.update_minibatch()
-            stp = self.graph.early_stopper()
-            for mb in self.graph.train_iter(trainset):
-                curmodel.update_minibatch(mb)
-                if stp.set_score:
-                    if validset:
-                        stp.score = curmodel(validset, ['validset_score'])
-                        if (stp.score < stp.best_score):
-                            best = copy.copy(curmodel)
-                    else:
-                        stp.score = 0.0
-                stp.next()
+            if stp == None :
+                stp = self.graph.early_stopper()
+            try :
+                countiter = 0
+                for mb in self.graph.train_iter(trainset):
+                    curmodel.update_minibatch(mb)
+                    if stp.set_score:
+                        if validset:
+                            stp.score = curmodel(validset, ['validset_score'])
+                            if (stp.score < stp.best_score):
+                                best = copy.copy(curmodel)
+                        else:
+                            stp.score = 0.0
+                    countiter +=1 
+                    stp.next()
+            except StopIteration :
+                print 'Training stopped after', countiter, 'iterations'
             if validset:
                 curmodel = best
         return curmodel
 
+
 def graphMLP(ninputs, nhid, nclass, lr_val, l2coef_val=0.0):
+
+
     def wrapper(i, node, thunk):
         if 0:
             print i, node
@@ -183,6 +232,7 @@
         l2coef = T.constant(l2coef_val)
         input = T.matrix() # n_examples x n_inputs
         target = T.ivector() # len: n_examples
+        #target = T.matrix()
         W2, b2 = T.matrix(), T.vector()
 
         W1, b1 = T.matrix(), T.vector()
@@ -191,7 +241,7 @@
 
         params = [W1, b1, W2, b2] 
         activations = b2 + T.dot(hid, W2)
-        nll, predictions = nnet_ops.crossentropy_softmax_1hot(activations, target)
+        nll, predictions = nnet_ops.crossentropy_softmax_1hot(activations, target )
         regularization = l2coef * T.sum(W2*W2) + hid_regularization
         output_class = T.argmax(activations,1)
         loss_01 = T.neq(output_class, target)
@@ -199,7 +249,39 @@
         g_params = T.grad(nll, params)
         new_params = [T.sub_inplace(p, lr * gp) for p,gp in zip(params, g_params)]
 
+            
+        def __eq__(self,other) :
+            print 'G.__eq__ from graphMLP(), not implemented yet'
+            return NotImplemented
+
+
+        def load(self, algo, f):
+            """ Load from file the 2 matrices and bias vectors """
+            cloase_at_end = False
+            if isinstance(f,str) :
+                f = open(f,'r')
+                close_at_end = True
+            params = []
+            for i in xrange(4):
+                params.append(filetensor.read(f))
+            if close_at_end :
+                f.close()
+            return GraphLearner.Model(algo, params)
+
+        def save(self, f, model):
+            """ Save params to file, so 2 matrices and 2 bias vectors. Same order as iparams. """
+            cloase_at_end = False
+            if isinstance(f,str) :
+                f = open(f,'w')
+                close_at_end = True
+            for p in model.params:
+                filetensor.write(f,p)
+            if close_at_end :
+                f.close()
+
+
         def iparams(self):
+            """ init params. """
             def randsmall(*shape): 
                 return (numpy.random.rand(*shape) -0.5) * 0.001
             return [randsmall(ninputs, nhid)
@@ -209,8 +291,9 @@
 
         def train_iter(self, trainset):
             return trainset.minibatches(['input', 'target'],
-                    minibatch_size=min(len(trainset), 32), n_batches=300)
+                    minibatch_size=min(len(trainset), 32), n_batches=2000)
         def early_stopper(self): 
+            """ overwrites GraphLearner.graph function """
             return stopper.NStages(300,1)
 
     return G()
@@ -250,6 +333,26 @@
         self.failUnless(n_match ==  (numpy.sum(training_set1.fields()['target'] ==
                 training_set2.fields()['target'])), omatch)
 
+        model1.save('/tmp/model1')
+        
+        #denoising_aa = GraphLearner(denoising_g)
+        #model1 = denoising_aa(trainset)
+        #hidset = model(trainset, fieldnames=['hidden'])
+        #model2 = denoising_aa(hidset)
+        
+        #f = open('blah', 'w')
+        #for m in model:
+        #    m.save(f)
+        #filetensor.write(f, initial_classification_weights)
+        #f.flush()
+
+        #deep_sigmoid_net = GraphLearner(deepnetwork_g)
+        #deep_model = deep_sigmoid_net.load('blah')
+        #deep_model.update(trainset)  #do some fine tuning
+
+        model1_dup = learn_algo('/tmp/model1')
+
+
     def equiv(self, g0, g1):
         training_set1 = dataset.ArrayDataSet(numpy.array([[0, 0, 0],
                                                          [0, 1, 1],
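
A sketch of the save/load round trip added to GraphLearner above (illustration only; the
exact construction of learn_algo, the trainset/validset datasets, and the '/tmp/model1'
path are assumptions, not part of the changeset):

    learn_algo = GraphLearner(graphMLP(ninputs=3, nhid=10, nclass=2, lr_val=0.01))
    model = learn_algo(trainset, validset)    # train, early-stopping on validset
    model.save('/tmp/model1')                 # writes the 4 params with filetensor
    model_dup = learn_algo('/tmp/model1')     # a filename argument loads a saved model
    model == model_dup                        # Model.__eq__ compares the weight arrays
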
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/stat_ops.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,92 @@
+
+import theano
+from theano import gof
+from theano import tensor
+import numpy
+
+
+class ExampleWiseMean(gof.Op):
+    
+    def __init__(self):
+        self.destroy_map = {0: [1, 2]}
+
+    def make_node(self, x):
+        return gof.Apply(self,
+                         [x, tensor.value(float('nan')), tensor.value(0)],
+                         [tensor.Tensor(dtype = 'float64',
+                                        broadcastable = x.type.broadcastable)()])
+
+    def perform(self, node, (x, sum, n), (out,)):
+        if numpy.isnan(sum).any():
+            sum.resize(x.shape, refcheck=0)
+            sum[:] = x
+        else:
+            sum += x
+        n += 1
+        out[0] = sum / n
+
+    def c_code(self, name, node, (x, sum, n), (out, ), sub):
+        return """
+        PyObject* multi;
+        int nelems;
+        if (isnan(((double*)(%(sum)s->data))[0])) {
+            PyArray_Dims dims;
+            dims.len = %(x)s->nd;
+            dims.ptr = %(x)s->dimensions;
+            PyArray_Resize(%(sum)s, &dims, 0, PyArray_CORDER);
+            multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s);
+            nelems = PyArray_SIZE(%(sum)s);
+            while (nelems--) {
+                // Copy %(x)s in %(sum)s
+                *(double*)PyArray_MultiIter_DATA(multi, 0) = *(double*)PyArray_MultiIter_DATA(multi, 1);
+                PyArray_MultiIter_NEXT(multi);
+            }
+        }
+        else {
+            // Add some error checking on the size of x
+            multi = PyArray_MultiIterNew(2, %(sum)s, %(x)s);
+            nelems = PyArray_SIZE(%(sum)s);
+            while (nelems--) {
+                // Add %(x)s to %(sum)s
+                *(double*)PyArray_MultiIter_DATA(multi, 0) += *(double*)PyArray_MultiIter_DATA(multi, 1);
+                PyArray_MultiIter_NEXT(multi);
+            }
+        }
+        ((npy_int64*)(%(n)s->data))[0]++;
+        int n = ((npy_int64*)(%(n)s->data))[0];
+        if (%(out)s == NULL) {
+            %(out)s = (PyArrayObject*)PyArray_EMPTY(%(sum)s->nd, %(sum)s->dimensions, NPY_FLOAT64, 0);
+        }
+        multi = PyArray_MultiIterNew(2, %(sum)s, %(out)s);
+        nelems = PyArray_SIZE(%(sum)s);
+        while (nelems--) {
+            // %(out)s <- %(sum)s / %(n)s
+            *(double*)PyArray_MultiIter_DATA(multi, 1) = *(double*)PyArray_MultiIter_DATA(multi, 0) / n;
+            PyArray_MultiIter_NEXT(multi);
+        }        
+        """ % dict(locals(), **sub)
+
+
+
+if __name__ == '__main__':
+    
+    vectors = numpy.random.RandomState(666).rand(10, 2)
+
+    x = tensor.dvector()
+    e = ExampleWiseMean()(x)
+
+    # f = theano.function([x], [e], linker = 'py')
+
+    # for i, v in enumerate(vectors):
+    #     print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0)
+
+    # print
+
+    f = theano.function([x], [e], linker = 'c|py')
+
+    for i, v in enumerate(vectors):
+        print v, "->", f(v), numpy.mean(vectors[:i+1], axis=0)
+
+
+
+
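ExampleWiseMean.perform above maintains a running element-wise mean: it accumulates each input into the sum buffer in place, increments the counter n (hence destroy_map marking inputs 1 and 2 as destroyed), and outputs sum / n. A minimal plain-numpy sketch of the same update rule, using an illustrative class that is not part of stat_ops.py:

    import numpy

    class RunningExampleWiseMean(object):
        """Plain-numpy restatement of the update rule in ExampleWiseMean.perform."""
        def __init__(self):
            self.sum = None   # element-wise sum of the examples seen so far
            self.n = 0        # number of examples seen so far

        def update(self, x):
            x = numpy.asarray(x, dtype='float64')
            if self.sum is None:
                self.sum = x.copy()   # first call: initialize the running sum
            else:
                self.sum += x         # later calls: accumulate in place
            self.n += 1
            return self.sum / self.n  # current mean over the n examples seen

    # usage sketch (mirrors the __main__ block above):
    #   m = RunningExampleWiseMean()
    #   for v in numpy.random.RandomState(666).rand(10, 2):
    #       print v, '->', m.update(v)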
--- a/statscollector.py	Tue Jun 03 21:27:32 2008 -0400
+++ b/statscollector.py	Mon Jun 16 17:47:36 2008 -0400
@@ -3,7 +3,7 @@
 
 #    def my_stats((residue,nll),(regularizer)):
 #            mse=examplewise_mean(square_norm(residue))
-# 	         training_loss=regularizer+examplewise_sum(nll)
+#            training_loss=regularizer+examplewise_sum(nll)
 #            set_names(locals())
 #            return ((residue,nll),(regularizer),(),(mse,training_loss))
 #    my_stats_collector = make_stats_collector(my_stats)
--- a/test_dataset.py	Tue Jun 03 21:27:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,652 +0,0 @@
-#!/bin/env python
-from dataset import *
-from math import *
-import numpy
-from misc import *
-
-def have_raised(to_eval, **var):
-    have_thrown = False
-    try:
-        eval(to_eval)
-    except :
-        have_thrown = True
-    return have_thrown
-
-def have_raised2(f, *args, **kwargs):
-    have_thrown = False
-    try:
-        f(*args, **kwargs)
-    except :
-        have_thrown = True
-    return have_thrown
-
-def test1():
-    print "test1"
-    global a,ds
-    a = numpy.random.rand(10,4)
-    print a
-    ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]})
-    print "len(ds)=",len(ds)
-    assert(len(ds)==10)
-    print "example 0 = ",ds[0]
-#    assert
-    print "x=",ds["x"]
-    print "x|y"
-    for x,y in ds("x","y"):
-        print x,y
-    minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4)
-    minibatch = minibatch_iterator.__iter__().next()
-    print "minibatch=",minibatch
-    for var in minibatch:
-        print "var=",var
-    print "take a slice and look at field y",ds[1:6:2]["y"]
-
-    del a,ds,x,y,minibatch_iterator,minibatch,var
-
-def test_iterate_over_examples(array,ds):
-#not in doc!!!
-    i=0
-    for example in range(len(ds)):
-        assert (ds[example]['x']==array[example][:3]).all()
-        assert ds[example]['y']==array[example][3]
-        assert (ds[example]['z']==array[example][[0,2]]).all()
-        i+=1
-    assert i==len(ds)
-    del example,i
-
-#     - for example in dataset:
-    i=0
-    for example in ds:
-        assert len(example)==3
-        assert (example['x']==array[i][:3]).all()
-        assert example['y']==array[i][3]
-        assert (example['z']==array[i][0:3:2]).all()
-        assert (numpy.append(example['x'],example['y'])==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del example,i
-
-#     - for val1,val2,... in dataset:
-    i=0
-    for x,y,z in ds:
-        assert (x==array[i][:3]).all()
-        assert y==array[i][3]
-        assert (z==array[i][0:3:2]).all()
-        assert (numpy.append(x,y)==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del x,y,z,i
-
-#     - for example in dataset(field1, field2,field3, ...):
-    i=0
-    for example in ds('x','y','z'):
-        assert len(example)==3
-        assert (example['x']==array[i][:3]).all()
-        assert example['y']==array[i][3]
-        assert (example['z']==array[i][0:3:2]).all()
-        assert (numpy.append(example['x'],example['y'])==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del example,i
-    i=0
-    for example in ds('y','x'):
-        assert len(example)==2
-        assert (example['x']==array[i][:3]).all()
-        assert example['y']==array[i][3]
-        assert (numpy.append(example['x'],example['y'])==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del example,i
-
-#     - for val1,val2,val3 in dataset(field1, field2,field3):
-    i=0
-    for x,y,z in ds('x','y','z'):
-        assert (x==array[i][:3]).all()
-        assert y==array[i][3]
-        assert (z==array[i][0:3:2]).all()
-        assert (numpy.append(x,y)==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del x,y,z,i
-    i=0
-    for y,x in ds('y','x',):
-        assert (x==array[i][:3]).all()
-        assert y==array[i][3]
-        assert (numpy.append(x,y)==array[i]).all()
-        i+=1
-    assert i==len(ds)
-    del x,y,i
-
-    def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished):
-        ##full minibatch or the last minibatch
-        for idx in range(nb_field):
-            test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished)
-        del idx
-    def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished):
-        assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)<minibatch_size)
-
-#     - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
-    i=0
-    mi=0
-    m=ds.minibatches(['x','z'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for minibatch in m:
-        assert isinstance(minibatch,DataSetFields)
-        assert len(minibatch)==2
-        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
-        if type(ds)==ArrayDataSet:
-            assert (minibatch[0][:,::2]==minibatch[1]).all()
-        else:
-            for j in xrange(len(minibatch[0])):
-                (minibatch[0][j][::2]==minibatch[1][j]).all()
-        mi+=1
-        i+=len(minibatch[0])
-    assert i==len(ds)
-    assert mi==4
-    del minibatch,i,m,mi
-
-    i=0
-    mi=0
-    m=ds.minibatches(['x','y'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for minibatch in m:
-        assert len(minibatch)==2
-        test_minibatch_size(minibatch,m.minibatch_size,len(ds),2,mi)
-        mi+=1
-        for id in range(len(minibatch[0])):
-            assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all()
-            i+=1
-    assert i==len(ds)
-    assert mi==4
-    del minibatch,i,id,m,mi
-
-#     - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
-    i=0
-    mi=0
-    m=ds.minibatches(['x','z'], minibatch_size=3)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for x,z in m:
-        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
-        test_minibatch_field_size(z,m.minibatch_size,len(ds),mi)
-        for id in range(len(x)):
-            assert (x[id][::2]==z[id]).all()
-            i+=1
-        mi+=1
-    assert i==len(ds)
-    assert mi==4
-    del x,z,i,m,mi
-    i=0
-    mi=0
-    m=ds.minibatches(['x','y'], minibatch_size=3)
-    for x,y in m:
-        test_minibatch_field_size(x,m.minibatch_size,len(ds),mi)
-        test_minibatch_field_size(y,m.minibatch_size,len(ds),mi)
-        mi+=1
-        for id in range(len(x)):
-            assert (numpy.append(x[id],y[id])==array[i]).all()
-            i+=1
-    assert i==len(ds)
-    assert mi==4
-    del x,y,i,id,m,mi
-
-#not in doc
-    i=0
-    m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
-            assert (numpy.append(x[id],y[id])==array[i+4]).all()
-            i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id,m
-
-    i=0
-    m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
-            assert (numpy.append(x[id],y[id])==array[i+4]).all()
-            i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id,m
-
-    i=0
-    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=4)
-    assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
-    for x,y in m:
-        assert len(x)==m.minibatch_size
-        assert len(y)==m.minibatch_size
-        for id in range(m.minibatch_size):
-            assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()
-            i+=1
-    assert i==m.n_batches*m.minibatch_size
-    del x,y,i,id
-
-    #@todo: we can't do minibatch bigger then the size of the dataset???
-    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
-    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)
-
-def test_ds_iterator(array,iterator1,iterator2,iterator3):
-    l=len(iterator1)
-    i=0
-    for x,y in iterator1:
-        assert (x==array[i][:3]).all()
-        assert y==array[i][3]
-        assert (numpy.append(x,y)==array[i]).all()
-        i+=1
-    assert i==l
-    i=0
-    for y,z in iterator2:
-        assert y==array[i][3]
-        assert (z==array[i][0:3:2]).all()
-        i+=1
-    assert i==l
-    i=0
-    for x,y,z in iterator3:
-        assert (x==array[i][:3]).all()
-        assert y==array[i][3]
-        assert (z==array[i][0:3:2]).all()
-        assert (numpy.append(x,y)==array[i]).all()
-        i+=1
-    assert i==l
-
-def test_getitem(array,ds):
-    def test_ds(orig,ds,index):
-        i=0
-        assert len(ds)==len(index)
-        for x,z,y in ds('x','z','y'):
-            assert (orig[index[i]]['x']==array[index[i]][:3]).all()
-            assert (orig[index[i]]['x']==x).all()
-            assert orig[index[i]]['y']==array[index[i]][3]
-            assert orig[index[i]]['y']==y
-            assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all()
-            assert (orig[index[i]]['z']==z).all()
-            i+=1
-        del i
-        ds[0]
-        if len(ds)>2:
-            ds[:1]
-            ds[1:1]
-            ds[1:1:1]
-        if len(ds)>5:
-            ds[[1,2,3]]
-        for x in ds:
-            pass
-
-#ds[:n] returns a dataset with the n first examples.
-    ds2=ds[:3]
-    assert isinstance(ds2,DataSet)
-    test_ds(ds,ds2,index=[0,1,2])
-    del ds2
-
-#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s.
-    ds2=ds[1:7:2]
-    assert isinstance(ds2,DataSet)
-    test_ds(ds,ds2,[1,3,5])
-    del ds2
-
-#ds[i]
-    ds2=ds[5]
-    assert isinstance(ds2,Example)
-    assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds)  # index not defined
-    assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds)
-    del ds2
-
-#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in.
-    ds2=ds[[4,7,2,8]]
-    assert isinstance(ds2,DataSet)
-    test_ds(ds,ds2,[4,7,2,8])
-    del ds2
-
-#ds[fieldname]# an iterable over the values of the field fieldname across
-  #the ds (the iterable is obtained by default by calling valuesVStack
-  #over the values for individual examples).
-    assert have_raised("ds['h']")  # h is not defined...
-    assert have_raised("ds[['x']]")  # bad syntax
-    assert not have_raised("var['ds']['x']",ds=ds)
-    isinstance(ds['x'],DataSetFields)
-    ds2=ds['x']
-    assert len(ds['x'])==10
-    assert len(ds['y'])==10
-    assert len(ds['z'])==10
-    i=0
-    for example in ds['x']:
-        assert (example==array[i][:3]).all()
-        i+=1
-    assert i==len(ds)
-    i=0
-    for example in ds['y']:
-        assert (example==array[i][3]).all()
-        i+=1
-    assert i==len(ds)
-    i=0
-    for example in ds['z']:
-        assert (example==array[i,0:3:2]).all()
-        i+=1
-    assert i==len(ds)
-    del ds2,i
-
-#ds.<property># returns the value of a property associated with
-  #the name <property>. The following properties should be supported:
-  #    - 'description': a textual description or name for the ds
-  #    - 'fieldtypes': a list of types (one per field)
-
-#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
-    #assert hstack([ds('x','y'),ds('z')])==ds
-    #hstack([ds('z','y'),ds('x')])==ds
-    assert have_raised2(hstack,[ds('x'),ds('x')])
-    assert have_raised2(hstack,[ds('y','x'),ds('x')])
-    assert not have_raised2(hstack,[ds('x'),ds('y')])
-    
-#        i=0
-#        for example in hstack([ds('x'),ds('y'),ds('z')]):
-#            example==ds[i]
-#            i+=1 
-#        del i,example
-#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
-
-def test_fields_fct(ds):
-    #@todo, fill correctly
-    assert len(ds.fields())==3
-    i=0
-    v=0
-    for field in ds.fields():
-        for field_value in field: # iterate over the values associated to that field for all the ds examples
-            v+=1
-        i+=1
-    assert i==3
-    assert v==3*10
-    del i,v
-    
-    i=0
-    v=0
-    for field in ds('x','z').fields():
-        i+=1
-        for val in field:
-            v+=1
-    assert i==2
-    assert v==2*10
-    del i,v
-    
-    i=0
-    v=0
-    for field in ds.fields('x','y'):
-        i+=1
-        for val in field:
-            v+=1
-    assert i==2
-    assert v==2*10
-    del i,v
-    
-    i=0
-    v=0
-    for field_examples in ds.fields():
-        for example_value in field_examples:
-            v+=1
-        i+=1
-    assert i==3
-    assert v==3*10
-    del i,v
-    
-    assert ds == ds.fields().examples()
-    assert len(ds('x','y').fields()) == 2
-    assert len(ds('x','z').fields()) == 2
-    assert len(ds('y').fields()) == 1
-
-    del field
-def test_all(array,ds):
-    assert len(ds)==10
-
-    test_iterate_over_examples(array, ds)
-    test_getitem(array, ds)
-    test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z'))
-    test_fields_fct(ds)
-
-def test_ArrayDataSet():
-    #don't test stream
-    #tested only with float value
-    #don't always test with y
-    #don't test missing value
-    #don't test with tuple
-    #don't test proterties
-    print "test_ArrayDataSet"
-    a2 = numpy.random.rand(10,4)
-    ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested
-    ds = ArrayDataSet(a2,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
-    #assert ds==a? should this work?
-
-    test_all(a2,ds)
-
-    del a2, ds
-
-def test_LookupList():
-    #test only the example in the doc???
-    print "test_LookupList"
-    example = LookupList(['x','y','z'],[1,2,3])
-    example['x'] = [1, 2, 3] # set or change a field
-    x, y, z = example
-    x = example[0]
-    x = example["x"]
-    assert example.keys()==['x','y','z']
-    assert example.values()==[[1,2,3],2,3]
-    assert example.items()==[('x',[1,2,3]),('y',2),('z',3)]
-    example.append_keyval('u',0) # adds item with name 'u' and value 0
-    assert len(example)==4 # number of items = 4 here
-    example2 = LookupList(['v','w'], ['a','b'])
-    example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b'])
-    assert example+example2==example3
-    assert have_raised("var['x']+var['x']",x=example)
-
-    del example, example2, example3, x, y ,z
-
-def test_CachedDataSet():
-    print "test_CacheDataSet"
-    a = numpy.random.rand(10,4)
-    ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
-    ds2 = CachedDataSet(ds1)
-    ds3 = CachedDataSet(ds1,cache_all_upon_construction=True)
-
-    test_all(a,ds2)
-    test_all(a,ds3)
-
-    del a,ds1,ds2,ds3
-
-
-def test_DataSetFields():
-    print "test_DataSetFields"
-    raise NotImplementedError()
-
-def test_ApplyFunctionDataSet():
-    print "test_ApplyFunctionDataSet"
-    a = numpy.random.rand(10,4)
-    a2 = a+1
-    ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
-
-    ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False)
-    ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1),
-                               ['x','y','z'],
-                               minibatch_mode=True)
-
-    test_all(a2,ds2)
-    test_all(a2,ds3)
-
-    del a,ds1,ds2,ds3
-
-def test_FieldsSubsetDataSet():
-    print "test_FieldsSubsetDataSet"
-    raise NotImplementedError()
-def test_MinibatchDataSet():
-    print "test_MinibatchDataSet"
-    raise NotImplementedError()
-def test_HStackedDataSet():
-    print "test_HStackedDataSet"
-    raise NotImplementedError()
-def test_VStackedDataSet():
-    print "test_VStackedDataSet"
-    raise NotImplementedError()
-def test_ArrayFieldsDataSet():
-    print "test_ArrayFieldsDataSet"
-    raise NotImplementedError()
-
-
-def test_speed(array, ds):
-    print "test_speed", ds.__class__
-
-    mat = numpy.random.rand(400,100)
-
-    @print_timing
-    def f_array_full(a):
-        a+1
-    @print_timing
-    def f_array_index(a):
-        for id in range(a.shape[0]):
-#            pass
-            a[id]+1
-#            a[id]*mat
-    @print_timing
-    def f_array_iter(a):
-        for r in a:
-#            pass
-            r+1
-#            r*mat
-    @print_timing
-    def f_ds_index(ds):
-        for id in range(len(ds)):
-#            pass
-            ds[id][0]+1
-#            ds[id][0]*mat
-    @print_timing
-    def f_ds_iter(ds):
-        for ex in ds:
-#            pass
-            ex[0]+1
-#            a[0]*mat
-    @print_timing
-    def f_ds_mb1(ds,mb_size):
-        for exs in ds.minibatches(minibatch_size = mb_size):
-            for ex in exs:
-#                pass
-                ex[0]+1
-#                ex[0]*mat
-    @print_timing
-    def f_ds_mb2(ds,mb_size):
-        for exs in ds.minibatches(minibatch_size = mb_size):
-#            pass
-            exs[0]+1
-#            ex[0]*mat
-
-    f_array_full(array)
-    f_array_index(array)
-    f_array_iter(array)
-
-    f_ds_index(ds)
-    f_ds_index(ds)
-    f_ds_iter(ds)
-    f_ds_iter(ds)
-
-    f_ds_mb1(ds,10)
-    f_ds_mb1(ds,100)
-    f_ds_mb1(ds,1000)
-    f_ds_mb1(ds,10000)
-    f_ds_mb2(ds,10)
-    f_ds_mb2(ds,100)
-    f_ds_mb2(ds,1000)
-    f_ds_mb2(ds,10000)
-
-
-
-
-
-
-#****************************************************************
-# dummy tests, less powerful than the previous tests, but can work with any new weird dataset.
-# Basically, emphasis is put on consistency, but it never checks the actual values.
-# To be used as a checklist, or a first test, when creating a new dataset
-
-def dummytest_all(ds) :
-    """ Launches all the dummytests with a given dataset. """
-
-    dummytest1_basicstats(ds)
-    dummytest2_slicing(ds)
-    dummytest3_fields_iterator_consistency(ds)
-
-
-def dummytest1_basicstats(ds) :
-    """print basics stats on a dataset, like length"""
-
-    print 'len(ds) = ',len(ds)
-    print 'num fields = ', len(ds.fieldNames())
-    print 'types of field: ',
-    for k in ds.fieldNames() :
-        print type(ds[0](k)[0]),
-    print ''
-
-def dummytest2_slicing(ds) :
-    """test if slicing seems to works properly"""
-    print 'testing slicing...',
-    sys.stdout.flush()
-        
-    middle = len(ds) / 2
-    tenpercent = int(len(ds) * .1)
-    set1 = ds[:middle+tenpercent]
-    set2 = ds[middle-tenpercent:]
-    for k in range(tenpercent + tenpercent -1):
-        for k2 in ds.fieldNames() :
-            if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
-                for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
-                    assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
-            else :
-                assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
-    assert tenpercent > 1
-    set3 = ds[middle-tenpercent:middle+tenpercent:2]
-    for k2 in ds.fieldNames() :
-        if type(set2[2](k2)[0]) == N.ndarray :
-            for k3 in range(len(set2[2](k2)[0])) :
-                assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
-        else :
-            assert set2[2](k2)[0] == set3[1](k2)[0]
-
-    print 'done'
-
-
-def dummytest3_fields_iterator_consistency(ds) :
-    """test if the number of iterator corresponds to the number of fields, also do it for minibatches"""
-    print 'testing fields/iterator consistency...',
-    sys.stdout.flush()
-
-    # basic test
-    maxsize = min(len(ds)-1,100)
-    for iter in ds[:maxsize] :
-        assert len(iter) == len(ds.fieldNames())
-    if len(ds.fieldNames()) == 1 :
-        print 'done'
-        return
-
-    # with minibatches iterator
-    ds2 = ds[:maxsize].minibatches([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
-    for iter in ds2 :
-        assert len(iter) == 2
-
-    print 'done'
-
-
-
-
-
-
-
-
-
-if __name__=='__main__':
-    test1()
-    test_LookupList()
-    test_ArrayDataSet()
-    test_CachedDataSet()
-    test_ApplyFunctionDataSet()
-    #test_speed()
-#test pmat.py
-
--- a/test_filetensor.py	Tue Jun 03 21:27:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-from filetensor import *
-import filetensor
-
-import unittest
-import os
-
-class T(unittest.TestCase):
-    fname = '/tmp/some_mat'
-
-    def setUp(self):
-        #TODO: test that /tmp/some_mat does not exist
-        try:
-            os.stat(self.fname)
-        except OSError:
-            return #assume file was not found
-        raise Exception('autotest file "%s" exists!' % self.fname)
-
-    def tearDown(self):
-        os.remove(self.fname)
-
-    def test_file(self):
-        gen = numpy.random.rand(1)
-        f = file(self.fname, 'w');
-        write(f, gen)
-        f.flush()
-        f = file(self.fname, 'r');
-        mat = read(f, None, debug=False) #load from filename
-        self.failUnless(gen.shape == mat.shape)
-        self.failUnless(numpy.all(gen == mat))
-
-    def test_filename(self):
-        gen = numpy.random.rand(1)
-        write(self.fname, gen)
-        mat = read(self.fname, None, debug=False) #load from filename
-        self.failUnless(gen.shape == mat.shape)
-        self.failUnless(numpy.all(gen == mat))
-
-    def testNd(self):
-        """shape and values are stored correctly for tensors of rank 0 to 5"""
-        whole_shape = [5, 6, 7, 8, 9]
-        for i in xrange(5):
-            gen = numpy.asarray(numpy.random.rand(*whole_shape[:i]))
-            f = file(self.fname, 'w');
-            write(f, gen)
-            f.flush()
-            f = file(self.fname, 'r');
-            mat = read(f, None, debug=False) #load from filename
-            self.failUnless(gen.shape == mat.shape)
-            self.failUnless(numpy.all(gen == mat))
-
-    def test_dtypes(self):
-        """shape and values are stored correctly for all dtypes """
-        for dtype in filetensor._dtype_magic:
-            gen = numpy.asarray(
-                    numpy.random.rand(4, 5, 2, 1) * 100,
-                    dtype=dtype)
-            f = file(self.fname, 'w');
-            write(f, gen)
-            f.flush()
-            f = file(self.fname, 'r');
-            mat = read(f, None, debug=False) #load from filename
-            self.failUnless(gen.dtype == mat.dtype)
-            self.failUnless(gen.shape == mat.shape)
-            self.failUnless(numpy.all(gen == mat))
-
-    def test_dtype_invalid(self):
-        gen = numpy.zeros((3,4), dtype='uint16') #an unsupported dtype
-        f = file(self.fname, 'w')
-        passed = False
-        try:
-            write(f, gen)
-        except TypeError, e:
-            if e[0].startswith('Invalid ndarray dtype'):
-                passed = True
-        f.close()
-        self.failUnless(passed)
-        
-
-if __name__ == '__main__':
-    unittest.main()
-
-    #a small test script, starts by reading sys.argv[1]
-    #print 'rval', rval.shape, rval.size
-
-    if 0:
-        write(f, rval)
-        print ''
-        f.close()
-        f = file('/tmp/some_mat', 'r');
-        rval2 = read(f) #load from file handle
-        print 'rval2', rval2.shape, rval2.size
-
-        assert rval.dtype == rval2.dtype
-        assert rval.shape == rval2.shape
-        assert numpy.all(rval == rval2)
-        print 'ok'
-
-    def _unused():
-        f.seek(0,2) #seek to end
-        f_len =  f.tell()
-        f.seek(f_data_start,0) #seek back to where we were
-
-        if debug: print 'length:', f_len
-
-
-        f_data_bytes = (f_len - f_data_start)
-
-        if debug: print 'data bytes according to header: ', dim_size * elsize
-        if debug: print 'data bytes according to file  : ', f_data_bytes
-
-        if debug: print 'reading data...'
-        sys.stdout.flush()
-
-    def read_ndarray(f, dim, dtype):
-        return numpy.fromfile(f, dtype=dtype, count=_prod(dim)).reshape(dim)
-
--- a/test_mlp.py	Tue Jun 03 21:27:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,132 +0,0 @@
-
-from mlp import *
-import dataset
-import nnet_ops
-
-
-from functools import partial
-def separator(debugger, i, node, *ths):
-    print "==================="
-
-def what(debugger, i, node, *ths):
-    print "#%i" % i, node
-
-def parents(debugger, i, node, *ths):
-    print [input.step for input in node.inputs]
-
-def input_shapes(debugger, i, node, *ths):
-    print "input shapes: ",
-    for r in node.inputs:
-        if hasattr(r.value, 'shape'):
-            print r.value.shape,
-        else:
-            print "no_shape",
-    print
-
-def input_types(debugger, i, node, *ths):
-    print "input types: ",
-    for r in node.inputs:
-        print r.type,
-    print
-
-def output_shapes(debugger, i, node, *ths):
-    print "output shapes:",
-    for r in node.outputs:
-        if hasattr(r.value, 'shape'):
-            print r.value.shape,
-        else:
-            print "no_shape",
-    print
-
-def output_types(debugger, i, node, *ths):
-    print "output types:",
-    for r in node.outputs:
-        print r.type,
-    print
-
-
-def test0():
-    linker = 'c|py'
-    #linker = partial(theano.gof.DebugLinker, linkers = [theano.gof.OpWiseCLinker],
-    #                 debug_pre = [separator, what, parents, input_types, input_shapes],
-    #                 debug_post = [output_shapes, output_types],
-    #                 compare_fn = lambda x, y: numpy.all(x == y))
-    
-    nnet = OneHiddenLayerNNetClassifier(10,2,.001,1000, linker = linker)
-    training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0],
-                                                     [0, 1, 1],
-                                                     [1, 0, 1],
-                                                     [1, 1, 1]]),
-                                        {'input':slice(2),'target':2})
-    fprop=nnet(training_set)
-
-    output_ds = fprop(training_set)
-
-    for fieldname in output_ds.fieldNames():
-        print fieldname+"=",output_ds[fieldname]
-
-def test1():
-    nnet = ManualNNet(2, 10,3,.1,1000)
-    training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0],
-                                                     [0, 1, 1],
-                                                     [1, 0, 1],
-                                                     [1, 1, 1]]),
-                                        {'input':slice(2),'target':2})
-    fprop=nnet(training_set)
-
-    output_ds = fprop(training_set)
-
-    for fieldname in output_ds.fieldNames():
-        print fieldname+"=",output_ds[fieldname]
-
-def test2():
-    training_set = dataset.ArrayDataSet(numpy.array([[0, 0, 0],
-                                                     [0, 1, 1],
-                                                     [1, 0, 1],
-                                                     [1, 1, 1]]),
-                                        {'input':slice(2),'target':2})
-    nin, nhid=2, 10
-    def sigm_layer(input):
-        W1 = t.matrix('W1')
-        b1 = t.vector('b1')
-        return (nnet_ops.sigmoid(b1 + t.dot(input, W1)),
-                [W1, b1],
-                [(numpy.random.rand(nin, nhid) -0.5) * 0.001, numpy.zeros(nhid)])
-    nnet = ManualNNet(nin, nhid, 3, .1, 1000, hidden_layer=sigm_layer)
-    fprop=nnet(training_set)
-
-    output_ds = fprop(training_set)
-
-    for fieldname in output_ds.fieldNames():
-        print fieldname+"=",output_ds[fieldname]
-
-def test_interface_0():
-    learner = ManualNNet(2, 10, 3, .1, 1000)
-
-    model = learner(training_set)
-
-    model2 = learner(training_set)    # trains model a second time
-
-    learner.update(additional_data)   # modifies nnet and model by side-effect
-
-
-def test_interface2_1():
-    learn_algo = ManualNNet(2, 10, 3, .1, 1000)
-
-    prior = learn_algo()
-
-    model1 = learn_algo(training_set1)
-
-    model2 = learn_algo(training_set2)
-
-    model2.update(additional_data)
-
-    n_match = 0
-    for o1, o2 in zip(model1.use(test_data), model2.use(test_data)):
-        n_match += (o1 == o2) 
-
-    print n_match
-
-test1()
-test2()
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_speed.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,79 @@
+import numpy
+from dataset import *
+from misc import *
+def test_speed(array, ds):
+    print "test_speed", ds.__class__
+
+    mat = numpy.random.rand(400,100)
+
+    @print_timing
+    def f_array_full(a):
+        a+1
+    @print_timing
+    def f_array_index(a):
+        for id in range(a.shape[0]):
+#            pass
+            a[id]+1
+#            a[id]*mat
+    @print_timing
+    def f_array_iter(a):
+        for r in a:
+#            pass
+            r+1
+#            r*mat
+    @print_timing
+    def f_ds_index(ds):
+        for id in range(len(ds)):
+#            pass
+            ds[id][0]+1
+#            ds[id][0]*mat
+    @print_timing
+    def f_ds_iter(ds):
+        for ex in ds:
+#            pass
+            ex[0]+1
+#            a[0]*mat
+    @print_timing
+    def f_ds_mb1(ds,mb_size):
+        for exs in ds.minibatches(minibatch_size = mb_size):
+            for ex in exs:
+#                pass
+                ex[0]+1
+#                ex[0]*mat
+    @print_timing
+    def f_ds_mb2(ds,mb_size):
+        for exs in ds.minibatches(minibatch_size = mb_size):
+#            pass
+            exs[0]+1
+#            ex[0]*mat
+
+    f_array_full(array)
+    f_array_index(array)
+    f_array_iter(array)
+
+    f_ds_index(ds)
+    f_ds_iter(ds)
+
+    f_ds_mb1(ds,10)
+    f_ds_mb1(ds,100)
+    f_ds_mb1(ds,1000)
+    f_ds_mb1(ds,10000)
+    f_ds_mb2(ds,10)
+    f_ds_mb2(ds,100)
+    f_ds_mb2(ds,1000)
+    f_ds_mb2(ds,10000)
+
+if __name__=='__main__':
+    a2 = numpy.random.rand(100000,400)
+    ds1 = ArrayDataSet(a2,{'all':slice(0,a2.shape[1],1)})
+    test_speed(a2,ds1)
+    a1 = numpy.random.rand(100000,40)
+    ds4 = ArrayDataSet(a1,LookupList(["f"+str(x)for x in range(a1.shape[1])],
+                                     range(a1.shape[1])))
+    test_speed(a1,ds4)
+    ds2=CachedDataSet(ds1,cache_all_upon_construction=False)
+    test_speed(a2,ds2)
+    ds3=CachedDataSet(ds1,cache_all_upon_construction=True)
+    test_speed(a2,ds3)
+    del a2,ds1,ds2,ds3
+
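test_speed.py times each access pattern with a print_timing decorator imported from misc, which is not part of this changeset. A minimal sketch of such a decorator, assuming all it needs to do is print the wall-clock time of each call (the real misc.print_timing may differ):

    import time

    def print_timing(func):
        """Print the wall-clock time taken by each call to the decorated function."""
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            print '%s took %.3f s' % (func.__name__, time.time() - start)
            return result
        wrapper.__name__ = func.__name__
        return wrapper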
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/version.py	Mon Jun 16 17:47:36 2008 -0400
@@ -0,0 +1,292 @@
+import subprocess as _subprocess
+import imp as _imp
+import sys
+import os
+
+
+_cache = dict()
+
+def src_version(module_name):
+    """Return compact identifier of module code.
+
+    @return: compact identifier of module code.
+    @rtype: string
+
+    @note: This function tries to establish that the source files and the repo
+    are synchronized.  It raises an Exception if there are un-tracked '.py'
+    files, or if there are un-committed modifications.  This implementation uses
+    "hg id" to establish this.  The code returned by "hg id" is not affected by
+    hg pull, although pulling may remove a trailing " tip" string.  This
+    implementation ignores the " tip" information and uses only the
+    code.
+
+    @note: This implementation assumes that the import directory is under
+    version control by Mercurial.
+
+    """
+
+    if module_name not in _cache:
+
+        try :
+            location = _imp.find_module(module_name)[1]
+        except ImportError:
+            _cache[module_name] = None
+            return None
+        #print 'location:', location
+        isdir = False
+        if os.path.isdir(location) :
+            isdir = True
+        elif os.path.isfile(location) :
+            isdir = False
+        else :
+            # THIS CASE SEEMS TO EXIST FOR WEIRD BUILTIN FUNCTIONS
+            #print location,": it's 'not a dir, it's not a file, it's superman!"
+            #raise Exception('Unknown location or file type')
+            _cache[module_name] = None
+            return None
+
+
+        # we're dealing with a dir
+        if isdir :
+
+            # under hg?
+            if not os.path.exists( os.path.join( location , '.hg') ) :
+                _cache[module_name] = None
+                return None
+
+            status = _subprocess.Popen(('hg','st'),cwd=location,stdout=_subprocess.PIPE).communicate()[0]
+            #print 'status =', status
+            #TODO: check that the process return code is 0 (ticket #45)
+
+            #status_codes = [line[0] for line in  if line and line[0] != '?']
+            for line in status.split('\n'):
+                if not line: continue
+                if line[0] != '?':
+                    raise Exception('Uncommitted modification to "%s" in %s (%s)'
+                        %(line[2:], __name__,location))
+                if line[0] == '?' and line[-3:] == '.py':
+                    raise Exception('Untracked file "%s" in %s (%s)'
+                        %(line[2:], __name__, location))
+
+            hg_id = _subprocess.Popen(('hg','id'),cwd=location,stdout=_subprocess.PIPE).communicate()[0]
+
+            # This asserts my understanding of hg id return values
+            # There is mention in the doc that it might return two parent hash codes
+            # but I've never seen it, and I don't know what it means or how it is
+            # formatted.
+            tokens = hg_id.split(' ')
+            assert len(tokens) <= 2
+            assert len(tokens) >= 1
+            assert tokens[0][-1] != '+' # the trailing + indicates uncommitted changes
+            if len(tokens) == 2:
+                assert tokens[1] == 'tip\n'
+
+            _cache[module_name] = tokens[0]
+
+        # we're dealing with a file
+        if not isdir :
+
+            folder = os.path.split( os.path.abspath(location) )[0]
+            # under hg?
+            if not os.path.exists( os.path.join( folder , '.hg') ) :
+                _cache[module_name] = None
+                return None
+
+            status = _subprocess.Popen(('hg','st',location),cwd=folder,stdout=_subprocess.PIPE).communicate()[0]
+            #print 'status =', status
+
+            #status_codes = [line[0] for line in  if line and line[0] != '?']
+            for line in status.split('\n'):
+                if not line: continue
+                if line[0] != '?':
+                    raise Exception('Uncommitted modification to "%s" in %s (%s)'
+                        %(line[2:], location,folder))
+                if line[0] == '?' and line[-3:] == '.py':
+                    raise Exception('Untracked file "%s" in %s (%s)'
+                        %(line[2:], location, folder))
+
+            hg_id = _subprocess.Popen(('hg','id'),cwd=folder,stdout=_subprocess.PIPE).communicate()[0]
+
+            # This asserts my understanding of hg id return values
+            # There is mention in the doc that it might return two parent hash codes
+            # but I've never seen it, and I don't know what it means or how it is
+            # formatted.
+            tokens = hg_id.split(' ')
+            assert len(tokens) <= 2
+            assert len(tokens) >= 1
+            if tokens[0][-1] == '+' :
+                tokens[0] = tokens[0][:-1] # the change was not on this file
+            if len(tokens) == 2:
+                assert tokens[1] == 'tip\n'
+
+            _cache[module_name] = tokens[0]
+
+
+    return _cache[module_name]
+
+_unknown_version = 'unknown version'
+
+def hg_version(dirname, filenames=None):
+    """Return current changeset of directory I{dirname}.
+
+    @type filenames: list of str (or default: None)
+    @param filenames: if specified, modifications to other files are ignored.
+
+    @rtype: str (the changeset hash, with a trailing '+' if modified)
+
+    """
+    if type(filenames) not in (list, tuple, type(None)):
+        raise TypeError(filenames) 
+
+    #may raise exception, for example if hg is not visible via PATH
+    status_proc = _subprocess.Popen(('hg','st'), cwd=dirname, 
+            stdout=_subprocess.PIPE, stderr=_subprocess.PIPE)
+    status = status_proc.communicate()[0] #read stdout into buffer
+    if status_proc.returncode != 0:
+        raise OSError('hg returned %i, maybe %s is not under hg control?'
+                % (status_proc.returncode, dirname))
+
+    #may raise exception, for example if hg is not visible via PATH
+    id_proc = _subprocess.Popen(('hg','id', '-i'), cwd=dirname,
+            stdout=_subprocess.PIPE, stderr=_subprocess.PIPE)
+    id_stdout = id_proc.communicate()[0]
+    if id_proc.returncode != 0:
+        raise OSError('hg returned %i, maybe %s is not under hg control?'
+                % (id_proc.returncode, dirname))
+
+    care_about = (lambda some_file : True) if filenames is None \
+            else (lambda some_file : some_file in filenames)
+
+    # parse status codes for what we care about
+    care_about_mod = False
+    for line in status.split('\n'):
+        if not line:  #empty lines happen
+            continue
+        line_file = line[2:]
+        if line[0] != '?' and care_about(line_file): 
+            care_about_mod = True
+            #raise Exception('Uncommitted modification', 
+                    #os.path.join(dirname, line_file))
+        if line[0] == '?' and line[-3:] == '.py':
+            print >> sys.stderr, 'WARNING: untracked file', os.path.join(dirname, line_file)
+
+    # id_stdout is 12 hex digits followed by '+\n' or '\n'
+    # return the trailing '+' character only if there were changes to files that
+    # the caller cares about (named in filenames)
+    modified = (id_stdout[12] == '+')
+    assert len(id_stdout) in (13, 14) #sanity check
+    if modified and care_about_mod :
+        return id_stdout[:13]
+    else:
+        return id_stdout[:12]
+
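For reference, the "hg id -i" parsing convention used by hg_version (12 hex digits, optionally followed by '+' when the working copy has local changes) can be illustrated with a hard-coded sample; the hash below is made up:

    id_stdout = '0123456789ab+\n'       # sample "hg id -i" output with local changes
    assert len(id_stdout) in (13, 14)
    modified = (id_stdout[12] == '+')
    print id_stdout[:13] if modified else id_stdout[:12]   # prints '0123456789ab+'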
+def _import_id_py_source(location):
+    try:
+        dirname = os.path.dirname(location[1])
+        basename = os.path.basename(location[1])
+        return hg_version(dirname, [basename])
+    except OSError, e:
+        print >> sys.stderr, 'IGNORING', e
+        return _unknown_version + ' PY_SOURCE'
+
+def _import_id_py_compiled(location):
+    #a .pyc file was found, but no corresponding .py
+    return _unknown_version + ' PYC_COMPILED'
+
+def _import_id_pkg_directory(location):
+    try:
+        return hg_version(location[1])
+    except OSError, e:
+        print >> sys.stderr, 'IGNORING', e
+        return _unknown_version + ' PKG_DIRECTORY'
+
+def _import_id(tag):
+    try :
+        location = _imp.find_module(tag)
+    except ImportError, e: #raise when tag is not found
+        return e #put this in the cache, import_id will raise it
+
+    #the find_module was successful, location is valid
+    resource_type = location[2][2]
+
+    if resource_type == _imp.PY_SOURCE:
+        return _import_id_py_source(location)
+    if resource_type == _imp.PY_COMPILED:
+        return _import_id_py_compiled(location)
+    if resource_type == _imp.C_EXTENSION:
+        raise NotImplementedError
+    if resource_type == _imp.PY_RESOURCE:
+        raise NotImplementedError
+    if resource_type == _imp.PKG_DIRECTORY:
+        return _import_id_pkg_directory(location)
+    if resource_type == _imp.C_BUILTIN:
+        raise NotImplementedError
+    if resource_type == _imp.PY_FROZEN:
+        raise NotImplementedError
+
+    assert False #the list of resource types above should be exhaustive
+
+def import_id(tag):
+    """Return an identifier of the code imported by 'import <tag>'.
+
+    @param tag: a module or file name
+    @type tag: string
+
+    @rtype: string
+    @return: identifier of the code imported by 'import <tag>'.
+
+    This high-level function might do different things depending on, for
+    example, whether I{tag} identifies a file or a directory, or whether the
+    named entity is under some sort of version/revision control.
+
+    Versions are sought in the following order:
+    0. If I{tag} is 'python' then sys.version will be returned
+    1. If I{tag} names a file or folder under revision control, this function
+    will attempt to guess which one, and return a string that identifies the
+    running code (a revision id, not the whole file!)
+    2.  If I{tag} names a module with a __version__ attribute, then that
+    attribute will be returned as a string.
+    3. The string starting with 'unknown version' will be returned for other valid modules.
+    4. An exception will be raised for non-existent modules.
+
+    @note: This function may import the named entity in order to return a
+    __version__ module attribute.
+
+    """
+    if tag not in import_id.cache:
+        import_id.cache[tag] = _import_id(tag)
+
+    #in the case of bad module names, we cached the ImportError exception
+    rval = import_id.cache[tag]
+    if isinstance(rval, Exception):
+        raise rval
+    return rval
+import_id.cache = {'python':sys.version}
+
+def get_all_src_versions() :
+    """
+    Get the version of all loaded modules.
+    Calls import_id on all loaded modules. These modules are found
+    using sys.modules.
+
+    Returns a dictionary: name -> version.
+
+    @RETURN dict Dictionary (module's name) -> (version)
+    @SEE import_id
+    """
+    allmodules = sys.modules
+    d = dict()
+    for m in allmodules :
+        try:
+            d[m] = import_id(m)
+        except:
+            pass
+    return d
+
+
+if __name__ == "__main__" :
+
+    if len(sys.argv) == 2 :
+        print 'testing on', sys.argv[1]
+        print import_id(sys.argv[1])
+