Mercurial > pylearn

--- a/_test_dataset.py	Wed Mar 26 18:23:44 2008 -0400
+++ b/_test_dataset.py	Wed Mar 26 21:05:14 2008 -0400
@@ -12,28 +12,67 @@
     def setUp(self):
         numpy.random.seed(123456)

-    def test0(self):
-        a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)})
-        s=0
-        for example in a:
-            s+=_sum_all(example.x)
-        #print s
-        self.failUnless(abs(s-7.25967597)<1e-6)
+
+    def test_ctor_len(self):
+        n = numpy.random.rand(8,3)
+        a=ArrayDataSet(n)
+        self.failUnless(a.data is n)
+        self.failUnless(a.fields is None)
+
+        self.failUnless(len(a) == n.shape[0])
+        self.failUnless(a[0].shape == (n.shape[1],))
+
+    def test_iter(self):
+        arr = numpy.random.rand(8,3)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
+        for i, example in enumerate(a):
+            self.failUnless(numpy.all( example.x == arr[i,:2]))
+            self.failUnless(numpy.all( example.y == arr[i,1:3]))
+
+    def test_zip(self):
+        arr = numpy.random.rand(8,3)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
+        for i, x in enumerate(a.zip("x")):
+            self.failUnless(numpy.all( x == arr[i,:2]))
+
+    def test_minibatch_basic(self):
+        arr = numpy.random.rand(10,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        for i, mb in enumerate(a.minibatches(minibatch_size=2)): #all fields
+            self.failUnless(numpy.all( mb.x == arr[i*2:i*2+2,0:2]))
+            self.failUnless(numpy.all( mb.y == arr[i*2:i*2+2,1:4]))

-    def test1(self):
-        a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)})
-        s=0
-        for mb in a.minibatches(2):
-            s+=_sum_all(numpy.array(mb))
-        s+=a[3:6].x[1,1]
-        for mb in ArrayDataSet(data=a.y).minibatches(2):
-            for e in mb:
-                s+=sum(e)
-        #print numpy.array(a)
-        #print a.y[4:9:2]
-        s+= _sum_all(a.y[4:9:2])
-        #print s
-        self.failUnless(abs(s-39.0334797)<1e-6)
+    def test_getattr(self):
+        arr = numpy.random.rand(10,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        a_y = a.y
+        self.failUnless(numpy.all( a_y == arr[:,1:4]))
+
+    def test_asarray(self):
+        arr = numpy.random.rand(3,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        a_arr = numpy.asarray(a)
+        self.failUnless(a_arr.shape[1] == 2 + 3)
+
+    def test_minibatch_wraparound_even(self):
+        arr = numpy.random.rand(10,4)
+        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+
+        #print arr
+        for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)):
+            #print 'x' , x
+            self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2]))
+
+    def test_minibatch_wraparound_odd(self):
+        arr = numpy.random.rand(10,4)
+        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+
+        for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)):
+            self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2]))

 if __name__ == '__main__':
     unittest.main()
--- a/dataset.py	Wed Mar 26 18:23:44 2008 -0400
+++ b/dataset.py	Wed Mar 26 21:05:14 2008 -0400
@@ -1,42 +1,9 @@

-class Example(object):
-    """
-    An example is something that is like a tuple but whose elements can be named, to that
-    following syntactic constructions work as one would expect:
-       example.x = [1, 2, 3] # set a field
-       x, y, z = example
-       x = example[0]
-       x = example["x"]
-    """
-    def __init__(self,names,values):
-        assert len(values)==len(names)
-        self.__dict__['values']=values
-        self.__dict__['fields']={}
-        for i in xrange(len(values)):
-            self.fields[names[i]]=i
-
-    def __getitem__(self,i):
-        if isinstance(i,int):
-            return self.values[i]
-        else:
-            return self.values[self.fields[i]]
-
-    def __setitem__(self,i,value):
-        if isinstance(i,int):
-            self.values[i]=value
-        else:
-            self.values[self.fields[i]]=value
+from lookup_list import LookupList
+Example = LookupList

-    def __getattr__(self,name):
-        return self.values[self.fields[name]]
-
-    def __setattr__(self,name,value):
-        self.values[self.fields[name]]=value
-
-    def __len__(self):
-        return len(self.values)
-
-
+class AbstractFunction (Exception): """Derived class must override this function"""
+
 class DataSet(object):
     """A virtual base class for datasets.

@@ -73,7 +40,8 @@
         i[identifier], but the derived class is free to accept any type of
         identifier, and add extra functionality to the iterator.
         """
-        raise NotImplementedError
+        for i in self.minibatches( minibatch_size = 1):
+            yield Example(i.keys(), [v[0] for v in i.values()])

     def zip(self, *fieldnames):
         """
@@ -93,55 +61,66 @@
         The derived class may accept fieldname arguments of any type.

         """
-        raise NotImplementedError
+        for i in self.minibatches(fieldnames, minibatch_size = 1):
+            yield [f[0] for f in i]

-    def minibatches(self,minibatch_size,*fieldnames):
+    minibatches_fieldnames = None
+    minibatches_minibatch_size = 1
+    minibatches_n_batches = None
+    def minibatches(self,
+            fieldnames = minibatches_fieldnames,
+            minibatch_size = minibatches_minibatch_size,
+            n_batches = minibatches_n_batches):
         """
         Supports two forms of syntax:

-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

         Using the first syntax, "i" will be an indexable object, such as a list,
-        tuple, or Example instance, such that on every iteration, i[0] is the f1
-        field of the current example, i[1] is the f2 field, and so on.
-
-        Using the second syntax, i1, i2, i3 will contain the the contents of the
-        f1, f2, and f3 fields of a single example on each loop iteration.
-
-        The derived class may accept fieldname arguments of any type.
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        list-like container of the f1 field of the current batch of examples,
+        i[1] is a list-like container of the f2 field, etc.

-        Return an iterator, whose next() method returns the next example or the next
-        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
-        whose fields should be something one can iterate on again in order to obtain
-        the individual examples.
+        Using the second syntax, i1, i2, i3 will be list-like containers of the
+        f1, f2, and f3 fields of a batch of examples on each loop iteration.

-        DataSet.zip returns an iterator over only the desired fields, and each field
-        of the iterator contains one example.
+        PARAMETERS
+        - fieldnames (list of any type, default None):
+        The loop variables i1, i2, i3 (in the example above) should contain the
+        f1, f2, and f3 fields of the current batch of examples.  If None, the
+        derived class can choose a default, e.g. all fields.

-        Return an iterator which sees only the specified fields (each fieldname is a
-        field key, typically a string). The value returned at each iteration
-        is a tuple with one element per field. Hence it can be used like this:
-           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
-              ... use f1, f2, and f3
-    If one iterates through minibatches of examples (with the minibatches() method
-    or with the minibatch_size argument of the zip() method), then the fields
-    returned by the iterator's next method should be iterators over the
-    individual values within the minibatch (typically these will be arrays
-    with minibatch_size rows).
-        Similar to zip but iterate over minibatches.
-        Return a minibatch iterator, whose next() method returns an 'example'
-        whose fields are iteratable objects (which can iterate over the individual
-        values of that field in the minibatch).
+        - minibatch_size (integer, default 1)
+        On every iteration, the variables i1, i2, i3 will have
+        exactly minibatch_size elements. e.g. len(i1) == minibatch_size
+
+        - n_batches (integer, default None)
+        The iterator will loop exactly this many times, and then stop.  If None,
+        the derived class can choose a default.  If (-1), then the returned
+        iterator should support looping indefinitely.
+
+        Note: A list-like container is something like a tuple, list, numpy.ndarray or
+        any other object that supports integer indexing and slicing.
+
         """
-        raise NotImplementedError
+        raise AbstractFunction()

     def fieldNames(self):
+        #Yoshua-
+        # This list may not be finite; what would make sense in the use you have
+        # in mind?
+        # -JB
         """Return the list of field names in the examples of this dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()

     def rename(*new_field_specifications):
+        #Yoshua-
+        # Do you mean for this to be a virtual method?
+        # Wouldn't this functionality be easier to provide via a
+        # RenamingDataSet, such as the one I've written below?
+        # -JB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned
         dataset). The minimal syntax that should be supported is the following:
@@ -151,7 +130,31 @@
         support additional indexing schemes within each field (e.g. column slice
         of a matrix-like field).
         """
-        raise NotImplementedError
+        raise AbstractFunction()
+
+class RenamingDataSet(DataSet):
+    """A DataSet that wraps another one, and makes it look like the field names
+    are different
+
+    Renaming is done by a dictionary that maps new names to the old ones used in
+    self.src.
+    """
+    def __init__(self, src, rename_dct):
+        DataSet.__init__(self)
+        self.src = src
+        self.rename_dct = copy.copy(rename_dct)
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        dct = self.rename_dct
+        new_fieldnames = [dct.get(f, f) for f in fieldnames]
+        return self.src.minibatches(new_fieldnames, minibatches_size, n_batches)
+
+    def fieldNames(self):
+        return [dct.get(f, f) for f in self.src.fieldNames()]
+

 class FiniteDataSet(DataSet):
     """
@@ -164,17 +167,51 @@
     a subset of examples (e.g. for splitting a dataset into training and test sets).
     """

+    class FiniteDataSetIterator(object):
+        """
+        If the fieldnames list is empty, it means that we want to see ALL the fields.
+        """
+        def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
+            self.dataset=dataset
+            self.minibatch_size=minibatch_size
+            assert minibatch_size>=1 and minibatch_size<=len(dataset)
+            self.current = -self.minibatch_size
+            self.fieldnames = fieldnames
+
+        def __iter__(self):
+            return self
+
+        def next(self):
+            self.current+=self.minibatch_size
+            if self.current>=len(self.dataset):
+                self.current=-self.minibatch_size
+                raise StopIteration
+            if self.minibatch_size==1:
+                complete_example=self.dataset[self.current]
+            else:
+                complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+            if self.fieldnames:
+                return Example(self.fieldnames,list(complete_example))
+            else:
+                return complete_example
+
     def __init__(self):
         pass

-    def __iter__(self):
-        return FiniteDataSetIterator(self)
-
-    def zip(self,*fieldnames):
-        return FiniteDataSetIterator(self,1,fieldnames)
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """
+        If fieldnames is None, it means that we want to see ALL the fields.

-    def minibatches(self,minibatch_size,*fieldnames):
-        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+        If n_batches is None, we want to see as many whole minibatches as
+        possible for the given minibatch_size.
+        """
+        # substitute the defaults:
+        if fieldnames is None: fieldnames = self.fieldNames()
+        if n_batches is None: n_batches = len(self) / minibatch_size
+        return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)

     def __getattr__(self,fieldname):
         """Return an that can iterate over the values of the field in this dataset."""
@@ -186,53 +223,57 @@
         The return value's default iterator will iterate only over the given
         fields.
         """
-        raise NotImplementedError
+        raise AbstractFunction()

     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()

     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()

     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        raise NotImplementedError
-
-class FiniteDataSetIterator(object):
-    """
-    If the fieldnames list is empty, it means that we want to see ALL the fields.
-    """
-    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
-        self.dataset=dataset
-        self.minibatch_size=minibatch_size
-        assert minibatch_size>=1 and minibatch_size<=len(dataset)
-        self.current = -self.minibatch_size
-        self.fieldnames = fieldnames
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        self.current+=self.minibatch_size
-        if self.current>=len(self.dataset):
-            self.current=-self.minibatch_size
-            raise StopIteration
-        if self.minibatch_size==1:
-            complete_example=self.dataset[self.current]
-        else:
-            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
-        if self.fieldnames:
-            return Example(self.fieldnames,list(complete_example))
-        else:
-            return complete_example
-
+        raise AbstractFunction()

 # we may want ArrayDataSet defined in another python file

 import numpy

+def as_array_dataset(dataset):
+    # Generally datasets can be efficient by making data fields overlap, but
+    # this function doesn't know which fields overlap.  So, it should check if
+    # dataset supports an as_array_dataset member function, and return that if
+    # possible.
+    if hasattr(dataset, 'as_array_dataset'):
+        return dataset.as_array_dataset()
+
+    raise NotImplementedError()
+
+    # Make ONE big minibatch with all the examples, to separate the fields.
+    n_examples = len(dataset)
+    batch = dataset.minibatches( minibatch_size = len(dataset)).next()
+
+    # Each field of the underlying dataset must be convertible to a numpy array of the same type
+    # currently just double, but should use the smallest compatible dtype
+    n_fields = len(batch)
+    fieldnames = batch.fields.keys()
+    total_width = 0
+    type = None
+    fields = LookupList()
+    for i in xrange(n_fields):
+        field = array(batch[i])
+        assert field.shape[0]==n_examples
+        width = field.shape[1]
+        start=total_width
+        total_width += width
+        fields[fieldnames[i]]=slice(start,total_width,1)
+    # many complicated things remain to be done:
+    #  - find common dtype
+    #  - decide what to do with extra dimensions if not the same in all fields
+    #  - try to see if we can avoid the copy?
+
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
@@ -246,43 +287,79 @@
     by the numpy.array(dataset) call.
     """

-    def __init__(self,dataset=None,data=None,fields={}):
+    class Iterator(object):
+        """An iterator over a finite dataset that implements wrap-around"""
+        def __init__(self, dataset, fieldnames, minibatch_size, next_max):
+            self.dataset=dataset
+            self.fieldnames = fieldnames
+            self.minibatch_size=minibatch_size
+            self.next_count = 0
+            self.next_max = next_max
+            self.current = -self.minibatch_size
+            assert minibatch_size > 0
+            if minibatch_size >= len(dataset):
+                raise NotImplementedError()
+
+        def __iter__(self):
+            #Why do we do this?  -JB
+            return self
+
+        @staticmethod
+        def matcat(a, b):
+            a0, a1 = a.shape
+            b0, b1 = b.shape
+            assert a1 == b1
+            assert a.dtype is b.dtype
+            rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
+            rval[:a0,:] = a
+            rval[a0:,:] = b
+            return rval
+
+        def next(self):
+
+            #check for end-of-loop
+            self.next_count += 1
+            if self.next_count == self.next_max:
+                raise StopIteration
+
+            #determine the first and last elements of the slice we'll return
+            self.current += self.minibatch_size
+            if self.current >= len(self.dataset):
+                self.current -= len(self.dataset)
+            upper = self.current + self.minibatch_size
+
+            if upper <= len(self.dataset):
+                #this is the easy case, we only need one slice
+                dataview = self.dataset.data[self.current:upper]
+            else:
+                # the minibatch wraps around the end of the dataset
+                dataview = self.dataset.data[self.current:]
+                upper -= len(self.dataset)
+                assert upper > 0
+                dataview = self.matcat(dataview, self.dataset.data[:upper])
+
+
+            rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames]
+
+            if self.fieldnames:
+                rval = Example(self.fieldnames, rval)
+
+            return rval
+
+
+    def __init__(self, data, fields=None):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
-        of the fields (dictionary of column slices indexed by field names).
+        of the fields (a LookupList of column slices indexed by field names).
         """
-        if dataset!=None:
-            assert data==None and fields=={}
-            # Make ONE big minibatch with all the examples, to separate the fields.
-            n_examples=len(dataset)
-            batch = dataset.minibatches(n_examples).next()
-            # Each field of the underlying dataset must be convertible to a numpy array of the same type
-            # currently just double, but should use the smallest compatible dtype
-            n_fields = len(batch)
-            fieldnames = batch.fields.keys()
-            total_width = 0
-            type = None
-            for i in xrange(n_fields):
-                field = array(batch[i])
-                assert field.shape[0]==n_examples
-                width = field.shape[1]
-                start=total_width
-                total_width += width
-                fields[fieldnames[i]]=slice(start,total_width,1)
-            # many complicated things remain to be done:
-            #  - find common dtype
-            #  - decide what to do with extra dimensions if not the same in all fields
-            #  - try to see if we can avoid the copy?
-            raise NotImplementedError
-        if data!=None:
-            assert dataset==None
-            self.data=data
-            self.fields=fields
-            self.width = data.shape[1]
-            for fieldname in fields:
-                fieldslice=fields[fieldname]
+        self.data=data
+        self.fields=fields
+        rows, cols = data.shape
+
+        if fields:
+            for fieldname,fieldslice in fields.items():
                 # make sure fieldslice.start and fieldslice.step are defined
                 start=fieldslice.start
                 step=fieldslice.step
@@ -293,7 +370,22 @@
                 if not fieldslice.start or not fieldslice.step:
                     fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                 # and coherent with the data array
-                assert fieldslice.start>=0 and fieldslice.stop<=self.width
+                assert fieldslice.start >= 0 and fieldslice.stop <= cols
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """
+        If fieldnames is None, it means that we want to see ALL the fields.
+
+        If n_batches is None, we want to see as many whole minibatches as
+        possible for the given minibatch_size.
+        """
+        # substitute the defaults:
+        if fieldnames is None: fieldnames = self.fieldNames()
+        if n_batches is None: n_batches = len(self) / minibatch_size
+        return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)

     def __getattr__(self,fieldname):
         """
@@ -312,10 +404,10 @@
         for field_slice in self.fields.values():
             min_col=min(min_col,field_slice.start)
             max_col=max(max_col,field_slice.stop)
-        new_fields={}
-        for field in self.fields:
-            new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
+        new_fields=LookupList()
+        for fieldname,fieldslice in self.fields.items():
+            new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
+        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)

     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
@@ -332,13 +424,13 @@
         """
         if self.fields:
             fieldnames,fieldslices=zip(*self.fields.items())
-            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
+            return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
         else:
             return self.data[i]

-    def __getslice__(self,*slice_args):
+    def __getslice__(self,*args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
+        return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)

     def __array__(self):
         """Return an view of this dataset which is an numpy.ndarray
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lookup_list.py	Wed Mar 26 21:05:14 2008 -0400
@@ -0,0 +1,69 @@
+
+class LookupList(object):
+    """
+    A LookupList is a sequence whose elements can be named (and unlike
+    a dictionary the order of the elements depends not on their key but
+    on the order given by the user through construction) so that
+    following syntactic constructions work as one would expect:
+       example = Example(['x','y','z'],[1,2,3])
+       example.x = [1, 2, 3] # set or change a field
+       x, y, z = example
+       x = example[0]
+       x = example["x"]
+       print example.keys() # returns ['x','y','z']
+       print example.values() # returns [[1,2,3],2,3]
+    """
+    def __init__(self,names=[],values=[]):
+        assert len(values)==len(names)
+        self.__dict__['_values']=values
+        self.__dict__['_name2index']={}
+        self.__dict__['_names']=names
+        for i in xrange(len(values)):
+            self._name2index[names[i]]=i
+
+    def keys(self):
+        return self._names
+
+    def values(self):
+        return self._values
+
+    def items(self):
+        return zip(self._names,self._values)
+
+    def __getitem__(self,key):
+        """
+        The key in example[key] can either be an integer to index the fields
+        or the name of the field.
+        """
+        if isinstance(key,int):
+            return self._values[key]
+        else: # if not an int, key must be a name
+            return self._values[self._name2index[key]]
+
+    def __setitem__(self,key,value):
+        if isinstance(key,int):
+            self._values[key]=value
+        else: # if not an int, key must be a name
+            if key in self._name2index:
+                self._values[self._name2index[key]]=value
+            else:
+                self._name2index[key]=len(self)
+                self._values.append(value)
+                self._names.append(key)
+
+    def __getattr__(self,name):
+        return self._values[self._name2index[name]]
+
+    def __setattr__(self,name,value):
+        if name in self._name2index:
+            self._values[self._name2index[name]]=value
+        else:
+            self._name2index[name]=len(self)
+            self._values.append(value)
+            self._names.append(name)
+
+    def __len__(self):
+        return len(self._values)
+
+    def __repr__(self):
+        return "{%s}" % ", ".join([str(k) + "=" + repr(v) for k,v in self.items()])