changeset 17:759d17112b23

more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 21:05:14 -0400
parents 813723310d75 (diff) ff4e551490f1 (current diff)
children 60b164a0d84a
files _test_dataset.py dataset.py lookup_list.py
diffstat 3 files changed, 375 insertions(+), 147 deletions(-) [+]
line wrap: on
line diff
--- a/_test_dataset.py	Wed Mar 26 18:21:57 2008 -0400
+++ b/_test_dataset.py	Wed Mar 26 21:05:14 2008 -0400
@@ -12,28 +12,67 @@
     def setUp(self):
         numpy.random.seed(123456)
 
-    def test0(self):
-        a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)})
-        s=0
-        for example in a:
-            s+=_sum_all(example.x)
-        #print s
-        self.failUnless(abs(s-7.25967597)<1e-6)
+
+    def test_ctor_len(self):
+        n = numpy.random.rand(8,3)
+        a=ArrayDataSet(n)
+        self.failUnless(a.data is n)
+        self.failUnless(a.fields is None)
+
+        self.failUnless(len(a) == n.shape[0])
+        self.failUnless(a[0].shape == (n.shape[1],))
+
+    def test_iter(self):
+        arr = numpy.random.rand(8,3)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
+        for i, example in enumerate(a):
+            self.failUnless(numpy.all( example.x == arr[i,:2]))
+            self.failUnless(numpy.all( example.y == arr[i,1:3]))
+
+    def test_zip(self):
+        arr = numpy.random.rand(8,3)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)})
+        for i, x in enumerate(a.zip("x")):
+            self.failUnless(numpy.all( x == arr[i,:2]))
+
+    def test_minibatch_basic(self):
+        arr = numpy.random.rand(10,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        for i, mb in enumerate(a.minibatches(minibatch_size=2)): #all fields
+            self.failUnless(numpy.all( mb.x == arr[i*2:i*2+2,0:2]))
+            self.failUnless(numpy.all( mb.y == arr[i*2:i*2+2,1:4]))
 
-    def test1(self):
-        a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)})
-        s=0
-        for mb in a.minibatches(2):
-            s+=_sum_all(numpy.array(mb))
-        s+=a[3:6].x[1,1]
-        for mb in ArrayDataSet(data=a.y).minibatches(2):
-            for e in mb:
-                s+=sum(e)
-        #print numpy.array(a)
-        #print a.y[4:9:2]
-        s+= _sum_all(a.y[4:9:2])
-        #print s
-        self.failUnless(abs(s-39.0334797)<1e-6)
+    def test_getattr(self):
+        arr = numpy.random.rand(10,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        a_y = a.y
+        self.failUnless(numpy.all( a_y == arr[:,1:4]))
+
+    def test_asarray(self):
+        arr = numpy.random.rand(3,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+        a_arr = numpy.asarray(a)
+        self.failUnless(a_arr.shape[1] == 2 + 3)
+
+    def test_minibatch_wraparound_even(self):
+        arr = numpy.random.rand(10,4)
+        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+
+        #print arr
+        for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)):
+            #print 'x' , x
+            self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2]))
+
+    def test_minibatch_wraparound_odd(self):
+        arr = numpy.random.rand(10,4)
+        arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
+
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
+
+        for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)):
+            self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2]))
         
 if __name__ == '__main__':
     unittest.main()
--- a/dataset.py	Wed Mar 26 18:21:57 2008 -0400
+++ b/dataset.py	Wed Mar 26 21:05:14 2008 -0400
@@ -1,62 +1,126 @@
 
 from lookup_list import LookupList
 Example = LookupList
+
+class AbstractFunction (Exception): """Derived class must override this function"""
         
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over Examples (or anything that
-    behaves like an Example). It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are instances of FiniteDataSet, a subclass
-    which supports indexing (dataset[i]) and slicing (dataset[1000:2000]).
-    To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...)
-    method which returns an iterator over only the desired fields.
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    If one iterates through minibatches of examples (with the minibatches() method
-    or with the minibatch_size argument of the zip() method), then the fields
-    returned by the iterator's next method should be iterators over the 
-    individual values within the minibatch (typically these will be arrays
-    with minibatch_size rows).
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways.  A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2,field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing), and which can profit
+    from the FiniteDataSetIterator, consider using base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+
     """
 
     def __init__(self):
         pass
     
     def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
+
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self.  Every field of "i" will give access to
+        the corresponding field of a single example.  Fields should be accessible via
+        i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
         """
-        Return an iterator, whose next() method returns the next example or the next 
-        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
-        whose fields should be something one can iterate on again in order to obtain
-        the individual examples.
+        for i in self.minibatches( minibatch_size = 1):
+            yield Example(i.keys(), [v[0] for v in i.values()])
+
+    def zip(self, *fieldnames):
         """
-        raise NotImplementedError
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
 
-    def zip(self,*fieldnames):
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
         """
-        Return an iterator which sees only the specified fields (each fieldname is a
-        field key, typically a string). The value returned at each iteration
-        is a tuple with one element per field. Hence it can be used like this:
-           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
-              ... use f1, f2, and f3
+        for i in self.minibatches(fieldnames, minibatch_size = 1):
+            yield [f[0] for f in i]
+
+    minibatches_fieldnames = None
+    minibatches_minibatch_size = 1
+    minibatches_n_batches = None
+    def minibatches(self,
+            fieldnames = minibatches_fieldnames,
+            minibatch_size = minibatches_minibatch_size,
+            n_batches = minibatches_n_batches):
         """
-        raise NotImplementedError
+        Supports two forms of syntax:
+
+            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
+
+            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        list-like container of the f1 field of the current batch of examples, i[1] is
+        a list-like container of the f2 field, etc.
 
-    def minibatches(self,minibatch_size,*fieldnames):
+        Using the second syntax, i1, i2, i3 will be list-like containers of the
+        f1, f2, and f3 fields of a batch of examples on each loop iteration.
+
+        PARAMETERS
+        - fieldnames (list of any type, default None):
+        The loop variables i1, i2, i3 (in the example above) should contain the
+        f1, f2, and f3 fields of the current batch of examples.  If None, the
+        derived class can choose a default, e.g. all fields.
+
+        - minibatch_size (integer, default 1)
+        On every iteration, the variables i1, i2, i3 will have
+        exactly minibatch_size elements. e.g. len(i1) == minibatch_size
+
+        - n_batches (integer, default None)
+        The iterator will loop exactly this many times, and then stop.  If None,
+        the derived class can choose a default.  If (-1), then the returned
+        iterator should support looping indefinitely.
+
+        Note: A list-like container is something like a tuple, list, numpy.ndarray or
+        any other object that supports integer indexing and slicing.
+
         """
-        Similar to zip but iterate over minibatches.
-        Return a minibatch iterator, whose next() method returns an 'example'
-        whose fields are iteratable objects (which can iterate over the individual
-        values of that field in the minibatch).
-        """
-        raise NotImplementedError
+        raise AbstractFunction()
     
     def fieldNames(self):
+        #Yoshua- 
+        # This list may not be finite; what would make sense in the use you have
+        # in mind?
+        # -JB
         """Return the list of field names in the examples of this dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
 
     def rename(*new_field_specifications):
+        #Yoshua- 
+        # Do you mean for this to be a virtual method?
+        # Wouldn't this functionality be easier to provide via a
+        # RenamingDataSet, such as the one I've written below?
+        # -JB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned 
         dataset). The minimal syntax that should be supported is the following:
@@ -66,7 +130,31 @@
         support additional indexing schemes within each field (e.g. column slice
         of a matrix-like field).
         """
-        raise NotImplementedError
+        raise AbstractFunction()
+
+class RenamingDataSet(DataSet):
+    """A DataSet that wraps another one, and makes it look like the field names
+    are different
+
+    Renaming is done by a dictionary that maps new names to the old ones used in
+    self.src.
+    """
+    def __init__(self, src, rename_dct):
+        DataSet.__init__(self)
+        self.src = src
+        self.rename_dct = copy.copy(rename_dct)
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        dct = self.rename_dct
+        new_fieldnames = [dct.get(f, f) for f in fieldnames]
+        return self.src.minibatches(new_fieldnames, minibatches_size, n_batches)
+
+    def fieldNames(self):
+        return [dct.get(f, f) for f in self.src.fieldNames()]
+
 
 class FiniteDataSet(DataSet):
     """
@@ -79,71 +167,113 @@
     a subset of examples (e.g. for splitting a dataset into training and test sets).
     """
 
+    class FiniteDataSetIterator(object):
+        """
+        If the fieldnames list is empty, it means that we want to see ALL the fields.
+        """
+        def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
+            self.dataset=dataset
+            self.minibatch_size=minibatch_size
+            assert minibatch_size>=1 and minibatch_size<=len(dataset)
+            self.current = -self.minibatch_size
+            self.fieldnames = fieldnames
+
+        def __iter__(self):
+            return self
+        
+        def next(self):
+            self.current+=self.minibatch_size
+            if self.current>=len(self.dataset):
+                self.current=-self.minibatch_size
+                raise StopIteration
+            if self.minibatch_size==1:
+                complete_example=self.dataset[self.current]
+            else:
+                complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+            if self.fieldnames:
+                return Example(self.fieldnames,list(complete_example))
+            else:
+                return complete_example
+
     def __init__(self):
         pass
 
-    def __iter__(self):
-        return FiniteDataSetIterator(self)
-    
-    def zip(self,*fieldnames):
-        return FiniteDataSetIterator(self,1,fieldnames)
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """
+        If fieldnames is None, it means that we want to see ALL the fields.
 
-    def minibatches(self,minibatch_size,*fieldnames):
-        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+        If n_batches is None, we want to see all the examples possible
+        for the given minibatch_size.
+        """
+        # substitute the defaults:
+        if fieldnames is None: fieldnames = self.fieldNames()
+        if n_batches is None: n_batches = len(self) / minibatch_size
+        return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
     def __getattr__(self,fieldname):
         """Return an that can iterate over the values of the field in this dataset."""
         return self(fieldname)
 
     def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
-        raise NotImplementedError
+        """Return a sub-dataset containing only the given fieldnames as fields.
+        
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
+        raise AbstractFunction()
 
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
     
     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
 
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        raise NotImplementedError
-
-class FiniteDataSetIterator(object):
-    """
-    If the fieldnames list is empty, it means that we want to see ALL the fields.
-    """
-    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
-        self.dataset=dataset
-        self.minibatch_size=minibatch_size
-        assert minibatch_size>=1 and minibatch_size<=len(dataset)
-        self.current = -self.minibatch_size
-        self.fieldnames = fieldnames
-
-    def __iter__(self):
-        return self
-    
-    def next(self):
-        self.current+=self.minibatch_size
-        if self.current>=len(self.dataset):
-            self.current=-self.minibatch_size
-            raise StopIteration
-        if self.minibatch_size==1:
-            complete_example=self.dataset[self.current]
-        else:
-            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
-        if self.fieldnames:
-            return Example(self.fieldnames,list(complete_example))
-        else:
-            return complete_example
-
+        raise AbstractFunction()
 
 # we may want ArrayDataSet defined in another python file
 
 import numpy
 
+def as_array_dataset(dataset):
+    # Generally datasets can be efficient by making data fields overlap, but
+    # this function doesn't know which fields overlap.  So, it should check if
+    # dataset supports an as_array_dataset member function, and return that if
+    # possible.
+    if hasattr(dataset, 'as_array_dataset'):
+        return dataset.as_array_dataset()
+
+    raise NotImplementedError()
+
+    # Make ONE big minibatch with all the examples, to separate the fields.
+    n_examples = len(dataset)
+    batch = dataset.minibatches( minibatch_size = len(dataset)).next()
+
+    # Each field of the underlying dataset must be convertible to a numpy array of the same type
+    # currently just double, but should use the smallest compatible dtype
+    n_fields = len(batch)
+    fieldnames = batch.fields.keys()
+    total_width = 0
+    type = None
+    fields = LookupList()
+    for i in xrange(n_fields):
+        field = array(batch[i])
+        assert field.shape[0]==n_examples
+        width = field.shape[1]
+        start=total_width
+        total_width += width
+        fields[fieldnames[i]]=slice(start,total_width,1)
+    # many complicated things remain to be done:
+    #  - find common dtype
+    #  - decide what to do with extra dimensions if not the same in all fields
+    #  - try to see if we can avoid the copy?
+
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
@@ -157,55 +287,105 @@
     by the numpy.array(dataset) call.
     """
 
-    def __init__(self,dataset=None,data=None,fields=None):
+    class Iterator(object):
+        """An iterator over a finite dataset that implements wrap-around"""
+        def __init__(self, dataset, fieldnames, minibatch_size, next_max):
+            self.dataset=dataset
+            self.fieldnames = fieldnames
+            self.minibatch_size=minibatch_size
+            self.next_count = 0
+            self.next_max = next_max
+            self.current = -self.minibatch_size
+            assert minibatch_size > 0
+            if minibatch_size >= len(dataset):
+                raise NotImplementedError()
+
+        def __iter__(self):
+            #Why do we do this?  -JB
+            return self
+
+        @staticmethod
+        def matcat(a, b):
+            a0, a1 = a.shape
+            b0, b1 = b.shape
+            assert a1 == b1
+            assert a.dtype is b.dtype
+            rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
+            rval[:a0,:] = a
+            rval[a0:,:] = b
+            return rval
+        
+        def next(self):
+
+            #check for end-of-loop
+            self.next_count += 1
+            if self.next_count == self.next_max:
+                raise StopIteration
+
+            #determine the first and last elements of the slice we'll return
+            self.current += self.minibatch_size
+            if self.current >= len(self.dataset):
+                self.current -= len(self.dataset)
+            upper = self.current + self.minibatch_size
+
+            if upper <= len(self.dataset):
+                #this is the easy case, we only need one slice
+                dataview = self.dataset.data[self.current:upper]
+            else:
+                # the minibatch wraps around the end of the dataset
+                dataview = self.dataset.data[self.current:]
+                upper -= len(self.dataset)
+                assert upper > 0
+                dataview = self.matcat(dataview, self.dataset.data[:upper])
+
+
+            rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames]
+
+            if self.fieldnames:
+                rval = Example(self.fieldnames, rval)
+
+            return rval
+
+
+    def __init__(self, data, fields=None):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
         of the fields (a LookupList of column slices indexed by field names).
         """
-        if dataset!=None:
-            assert data==None and fields==None
-            # Make ONE big minibatch with all the examples, to separate the fields.
-            n_examples=len(dataset)
-            batch = dataset.minibatches(n_examples).next()
-            # Each field of the underlying dataset must be convertible to a numpy array of the same type
-            # currently just double, but should use the smallest compatible dtype
-            n_fields = len(batch)
-            fieldnames = batch.fields.keys()
-            total_width = 0
-            type = None
-            fields = LookupList()
-            for i in xrange(n_fields):
-                field = array(batch[i])
-                assert field.shape[0]==n_examples
-                width = field.shape[1]
-                start=total_width
-                total_width += width
-                fields[fieldnames[i]]=slice(start,total_width,1)
-            # many complicated things remain to be done:
-            #  - find common dtype
-            #  - decide what to do with extra dimensions if not the same in all fields
-            #  - try to see if we can avoid the copy?
-            raise NotImplementedError
-        if data!=None:
-            assert dataset==None
-            self.data=data
-            self.fields=fields
-            self.width = data.shape[1]
-            if fields:
-                for fieldname,fieldslice in fields.items():
-                    # make sure fieldslice.start and fieldslice.step are defined
-                    start=fieldslice.start
-                    step=fieldslice.step
-                    if not start:
-                        start=0
-                    if not step:
-                        step=1
-                    if not fieldslice.start or not fieldslice.step:
-                        fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
-                    # and coherent with the data array
-                    assert fieldslice.start>=0 and fieldslice.stop<=self.width
+        self.data=data
+        self.fields=fields
+        rows, cols = data.shape
+
+        if fields:
+            for fieldname,fieldslice in fields.items():
+                # make sure fieldslice.start and fieldslice.step are defined
+                start=fieldslice.start
+                step=fieldslice.step
+                if not start:
+                    start=0
+                if not step:
+                    step=1
+                if not fieldslice.start or not fieldslice.step:
+                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
+                # and coherent with the data array
+                assert fieldslice.start >= 0 and fieldslice.stop <= cols
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """
+        If fieldnames is None, it means that we want to see ALL the fields.
+
+        If n_batches is None, we want to see all the examples possible
+        for the given minibatch_size.
+        """
+        # substitute the defaults:
+        if fieldnames is None: fieldnames = self.fieldNames()
+        if n_batches is None: n_batches = len(self) / minibatch_size
+        return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
     def __getattr__(self,fieldname):
         """
@@ -227,7 +407,7 @@
         new_fields=LookupList()
         for fieldname,fieldslice in self.fields.items():
             new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
+        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
@@ -248,11 +428,20 @@
         else:
             return self.data[i]
 
-    def __getslice__(self,*slice_args):
+    def __getslice__(self,*args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
+        return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
     def __array__(self):
+        """Return a view of this dataset which is a numpy.ndarray
+
+        Numpy uses this special function name to retrieve an ndarray view for
+        functions such as numpy.sum, numpy.dot, numpy.asarray, etc.
+
+        If this dataset has no fields, then we simply return self.data,
+        otherwise things are complicated. 
+        - why do we want this behaviour when there are fields? (JB)
+        """
         if not self.fields:
             return self.data
         # else, select subsets of columns mapped by the fields
--- a/lookup_list.py	Wed Mar 26 18:21:57 2008 -0400
+++ b/lookup_list.py	Wed Mar 26 21:05:14 2008 -0400
@@ -22,10 +22,10 @@
             self._name2index[names[i]]=i
 
     def keys(self):
-        return _names
+        return self._names
 
     def values(self):
-        return _values
+        return self._values
 
     def items(self):
         return zip(self._names,self._values)