diff dataset.py @ 17:759d17112b23

more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 21:05:14 -0400
parents 813723310d75 ff4e551490f1
children 57f4015e2e09
line wrap: on
line diff
--- a/dataset.py	Wed Mar 26 18:23:44 2008 -0400
+++ b/dataset.py	Wed Mar 26 21:05:14 2008 -0400
@@ -1,42 +1,9 @@
 
-class Example(object):
-    """
-    An example is something that is like a tuple but whose elements can be named, to that
-    following syntactic constructions work as one would expect:
-       example.x = [1, 2, 3] # set a field
-       x, y, z = example
-       x = example[0]
-       x = example["x"]
-    """
-    def __init__(self,names,values):
-        assert len(values)==len(names)
-        self.__dict__['values']=values
-        self.__dict__['fields']={}
-        for i in xrange(len(values)):
-            self.fields[names[i]]=i
-            
-    def __getitem__(self,i):
-        if isinstance(i,int):
-            return self.values[i]
-        else:
-            return self.values[self.fields[i]]
-    
-    def __setitem__(self,i,value):
-        if isinstance(i,int):
-            self.values[i]=value
-        else:
-            self.values[self.fields[i]]=value
+from lookup_list import LookupList
+Example = LookupList
 
-    def __getattr__(self,name):
-        return self.values[self.fields[name]]
-
-    def __setattr__(self,name,value):
-        self.values[self.fields[name]]=value
-
-    def __len__(self):
-        return len(self.values)
-
-    
+class AbstractFunction (NotImplementedError): """Raised by base-class methods that a derived class must override"""
+        
 class DataSet(object):
     """A virtual base class for datasets.
 
@@ -73,7 +40,8 @@
         i[identifier], but the derived class is free to accept any type of
         identifier, and add extra functionality to the iterator.
         """
-        raise NotImplementedError
+        for i in self.minibatches( minibatch_size = 1):
+            yield Example(i.keys(), [v[0] for v in i.values()])
 
     def zip(self, *fieldnames):
         """
@@ -93,55 +61,66 @@
         The derived class may accept fieldname arguments of any type.
 
         """
-        raise NotImplementedError
+        for i in self.minibatches(fieldnames, minibatch_size = 1):
+            yield [f[0] for f in i]
 
-    def minibatches(self,minibatch_size,*fieldnames):
+    minibatches_fieldnames = None
+    minibatches_minibatch_size = 1
+    minibatches_n_batches = None
+    def minibatches(self,
+            fieldnames = minibatches_fieldnames,
+            minibatch_size = minibatches_minibatch_size,
+            n_batches = minibatches_n_batches):
         """
         Supports two forms of syntax:
 
-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
         Using the first syntax, "i" will be an indexable object, such as a list,
-        tuple, or Example instance, such that on every iteration, i[0] is the f1
-        field of the current example, i[1] is the f2 field, and so on.
-
-        Using the second syntax, i1, i2, i3 will contain the the contents of the
-        f1, f2, and f3 fields of a single example on each loop iteration.
-
-        The derived class may accept fieldname arguments of any type.
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        list-like container of the f1 field of the current batch of examples, i[1] is
+        a list-like container of the f2 field, etc.
 
-        Return an iterator, whose next() method returns the next example or the next 
-        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
-        whose fields should be something one can iterate on again in order to obtain
-        the individual examples.
+        Using the second syntax, i1, i2, i3 will be list-like containers of the
+        f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
-        DataSet.zip returns an iterator over only the desired fields, and each field
-        of the iterator contains one example.
+        PARAMETERS
+        - fieldnames (list of any type, default None):
+        The loop variables i1, i2, i3 (in the example above) should contain the
+        f1, f2, and f3 fields of the current batch of examples.  If None, the
+        derived class can choose a default, e.g. all fields.
 
-        Return an iterator which sees only the specified fields (each fieldname is a
-        field key, typically a string). The value returned at each iteration
-        is a tuple with one element per field. Hence it can be used like this:
-           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
-              ... use f1, f2, and f3
-    If one iterates through minibatches of examples (with the minibatches() method
-    or with the minibatch_size argument of the zip() method), then the fields
-    returned by the iterator's next method should be iterators over the 
-    individual values within the minibatch (typically these will be arrays
-    with minibatch_size rows).
-        Similar to zip but iterate over minibatches.
-        Return a minibatch iterator, whose next() method returns an 'example'
-        whose fields are iteratable objects (which can iterate over the individual
-        values of that field in the minibatch).
+        - minibatch_size (integer, default 1)
+        On every iteration, the variables i1, i2, i3 will have
+        exactly minibatch_size elements. e.g. len(i1) == minibatch_size
+
+        - n_batches (integer, default None)
+        The iterator will loop exactly this many times, and then stop.  If None,
+        the derived class can choose a default.  If (-1), then the returned
+        iterator should support looping indefinitely.
+
+        Note: A list-like container is something like a tuple, list, numpy.ndarray or
+        any other object that supports integer indexing and slicing.
+
         """
-        raise NotImplementedError
+        raise AbstractFunction()
     
     def fieldNames(self):
+        #Yoshua- 
+        # This list may not be finite; what would make sense in the use you have
+        # in mind?
+        # -JB
         """Return the list of field names in the examples of this dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
 
     def rename(*new_field_specifications):
+        #Yoshua- 
+        # Do you mean for this to be a virtual method?
+        # Wouldn't this functionality be easier to provide via a
+        # RenamingDataSet, such as the one I've written below?
+        # -JB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned 
         dataset). The minimal syntax that should be supported is the following:
@@ -151,7 +130,31 @@
         support additional indexing schemes within each field (e.g. column slice
         of a matrix-like field).
         """
-        raise NotImplementedError
+        raise AbstractFunction()
+
+class RenamingDataSet(DataSet):
+    """A DataSet wrapping another one (self.src) under different field names.
+
+    rename_dct maps the new names to the old ones used in self.src; names
+    absent from it pass through unchanged.
+    """
+    def __init__(self, src, rename_dct):
+        DataSet.__init__(self)
+        self.src = src
+        self.rename_dct = dict(rename_dct)  # private shallow copy of the mapping
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        if fieldnames is None: fieldnames = self.fieldNames()  # default: all fields
+        old_names = [self.rename_dct.get(f, f) for f in fieldnames]  # new -> old
+        return self.src.minibatches(old_names, minibatch_size, n_batches)
+
+    def fieldNames(self):
+        new_of = dict((old, new) for new, old in self.rename_dct.items())  # old -> new
+        return [new_of.get(f, f) for f in self.src.fieldNames()]
+
 
 class FiniteDataSet(DataSet):
     """
@@ -164,17 +167,51 @@
     a subset of examples (e.g. for splitting a dataset into training and test sets).
     """
 
+    class FiniteDataSetIterator(object):
+        """Walk a finite dataset in consecutive steps of minibatch_size examples.
+        An empty fieldnames list means that ALL the fields are returned.
+        """
+        def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
+            self.dataset = dataset
+            self.minibatch_size = minibatch_size
+            assert 1 <= minibatch_size <= len(dataset)
+            self.current = -minibatch_size   # so that the first next() lands on 0
+            self.fieldnames = fieldnames
+
+        def __iter__(self):
+            return self
+
+        def next(self):
+            step = self.minibatch_size
+            self.current += step
+            if self.current >= len(self.dataset):
+                self.current = -step     # restore the initial position
+                raise StopIteration
+            start = self.current
+            # a single example is fetched by index, a larger minibatch by slice
+            if step == 1: batch = self.dataset[start]
+            else: batch = self.dataset[start:start+step]
+            if not self.fieldnames:
+                return batch
+            return Example(self.fieldnames,list(batch))
+
     def __init__(self):
         pass
 
-    def __iter__(self):
-        return FiniteDataSetIterator(self)
-    
-    def zip(self,*fieldnames):
-        return FiniteDataSetIterator(self,1,fieldnames)
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """
+        If fieldnames is None, it means that we want to see ALL the fields.
 
-    def minibatches(self,minibatch_size,*fieldnames):
-        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+        If n_batches is None, we want to see all the examples possible
+        for the given minibatch_size.
+        """
+        # substitute the defaults:
+        if fieldnames is None: fieldnames = self.fieldNames()
+        if n_batches is None: n_batches = len(self) / minibatch_size
+        return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)  # NOTE(review): no DataSet.Iterator is visible in this diff (only ArrayDataSet.Iterator) - confirm the intended class
 
     def __getattr__(self,fieldname):
         """Return an that can iterate over the values of the field in this dataset."""
@@ -186,53 +223,57 @@
         The return value's default iterator will iterate only over the given
         fields.
         """
-        raise NotImplementedError
+        raise AbstractFunction()
 
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
     
     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
-        raise NotImplementedError
+        raise AbstractFunction()
 
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        raise NotImplementedError
-
-class FiniteDataSetIterator(object):
-    """
-    If the fieldnames list is empty, it means that we want to see ALL the fields.
-    """
-    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
-        self.dataset=dataset
-        self.minibatch_size=minibatch_size
-        assert minibatch_size>=1 and minibatch_size<=len(dataset)
-        self.current = -self.minibatch_size
-        self.fieldnames = fieldnames
-
-    def __iter__(self):
-        return self
-    
-    def next(self):
-        self.current+=self.minibatch_size
-        if self.current>=len(self.dataset):
-            self.current=-self.minibatch_size
-            raise StopIteration
-        if self.minibatch_size==1:
-            complete_example=self.dataset[self.current]
-        else:
-            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
-        if self.fieldnames:
-            return Example(self.fieldnames,list(complete_example))
-        else:
-            return complete_example
-
+        raise AbstractFunction()
 
 # we may want ArrayDataSet defined in another python file
 
 import numpy
 
+def as_array_dataset(dataset):
+    """Return dataset.as_array_dataset() if the dataset provides that method.
+
+    Datasets can be efficient by making fields overlap, but only the dataset
+    knows which ones do.  Otherwise NotImplementedError is raised: the generic
+    conversion drafted after the raise is unfinished and never runs."""
+    if hasattr(dataset, 'as_array_dataset'):
+        return dataset.as_array_dataset()
+
+    raise NotImplementedError()
+
+    # (unreachable draft) Make ONE big minibatch with all the examples, to separate the fields.
+    n_examples = len(dataset)
+    batch = dataset.minibatches( minibatch_size = len(dataset)).next()
+
+    # Each field of the underlying dataset must be convertible to a numpy array of the same type
+    # currently just double, but should use the smallest compatible dtype
+    n_fields = len(batch)
+    fieldnames = batch.fields.keys()
+    total_width = 0
+    fields = LookupList()
+    for i in xrange(n_fields):
+        field = numpy.array(batch[i])
+        assert field.shape[0]==n_examples
+        width = field.shape[1]
+        start=total_width
+        total_width += width
+        fields[fieldnames[i]]=slice(start,total_width,1)
+    # many complicated things remain to be done:
+    #  - find common dtype
+    #  - decide what to do with extra dimensions if not the same in all fields
+    #  - try to see if we can avoid the copy?
+
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
@@ -246,43 +287,79 @@
     by the numpy.array(dataset) call.
     """
 
-    def __init__(self,dataset=None,data=None,fields={}):
+    class Iterator(object):
+        """Minibatch iterator over an ArrayDataSet that wraps around its end.
+
+        Stops after next_max batches; a next_max of -1 is never reached, so
+        the iterator then loops forever."""
+        def __init__(self, dataset, fieldnames, minibatch_size, next_max):
+            self.dataset=dataset
+            self.fieldnames = fieldnames
+            self.minibatch_size=minibatch_size
+            self.next_count = 0                  # batches returned so far
+            self.next_max = next_max
+            self.current = -self.minibatch_size  # first next() starts at row 0
+            assert minibatch_size > 0
+            if minibatch_size >= len(dataset):
+                raise NotImplementedError()
+
+        def __iter__(self):
+            # iterator protocol: lets "for batch in iterator" use this object
+            return self
+
+        @staticmethod
+        def matcat(a, b):
+            """Stack the rows of 2-d array b under those of a, in a new array"""
+            a0, a1 = a.shape
+            b0, b1 = b.shape
+            assert a1 == b1
+            assert a.dtype == b.dtype
+            rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
+            rval[:a0,:] = a
+            rval[a0:,:] = b
+            return rval
+
+        def next(self):
+            #check for end-of-loop first, so that exactly next_max batches come out
+            if self.next_count == self.next_max:
+                raise StopIteration
+            self.next_count += 1
+
+            #determine the first and last elements of the slice we'll return
+            self.current += self.minibatch_size
+            if self.current >= len(self.dataset):
+                self.current -= len(self.dataset)
+            upper = self.current + self.minibatch_size
+
+            if upper <= len(self.dataset):
+                #this is the easy case, we only need one slice
+                dataview = self.dataset.data[self.current:upper]
+            else:
+                # the minibatch wraps around the end of the dataset
+                dataview = self.dataset.data[self.current:]
+                upper -= len(self.dataset)
+                assert upper > 0
+                dataview = self.matcat(dataview, self.dataset.data[:upper])
+
+            rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames]
+            if self.fieldnames:
+                rval = Example(self.fieldnames, rval)
+            return rval
+
+
+    def __init__(self, data, fields=None):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
-        of the fields (dictionary of column slices indexed by field names).
+        of the fields (a LookupList of column slices indexed by field names).
         """
-        if dataset!=None:
-            assert data==None and fields=={}
-            # Make ONE big minibatch with all the examples, to separate the fields.
-            n_examples=len(dataset)
-            batch = dataset.minibatches(n_examples).next()
-            # Each field of the underlying dataset must be convertible to a numpy array of the same type
-            # currently just double, but should use the smallest compatible dtype
-            n_fields = len(batch)
-            fieldnames = batch.fields.keys()
-            total_width = 0
-            type = None
-            for i in xrange(n_fields):
-                field = array(batch[i])
-                assert field.shape[0]==n_examples
-                width = field.shape[1]
-                start=total_width
-                total_width += width
-                fields[fieldnames[i]]=slice(start,total_width,1)
-            # many complicated things remain to be done:
-            #  - find common dtype
-            #  - decide what to do with extra dimensions if not the same in all fields
-            #  - try to see if we can avoid the copy?
-            raise NotImplementedError
-        if data!=None:
-            assert dataset==None
-            self.data=data
-            self.fields=fields
-            self.width = data.shape[1]
-            for fieldname in fields:
-                fieldslice=fields[fieldname]
+        self.data=data
+        self.fields=fields
+        rows, cols = data.shape
+
+        if fields:
+            for fieldname,fieldslice in fields.items():
                 # make sure fieldslice.start and fieldslice.step are defined
                 start=fieldslice.start
                 step=fieldslice.step
@@ -293,7 +370,22 @@
                 if not fieldslice.start or not fieldslice.step:
                     fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                 # and coherent with the data array
-                assert fieldslice.start>=0 and fieldslice.stop<=self.width
+                assert fieldslice.start >= 0 and fieldslice.stop <= cols
+
+    def minibatches(self,
+            fieldnames = DataSet.minibatches_fieldnames,
+            minibatch_size = DataSet.minibatches_minibatch_size,
+            n_batches = DataSet.minibatches_n_batches):
+        """Return an ArrayDataSet.Iterator over minibatches of this dataset.
+
+        fieldnames=None selects every field; n_batches=None selects
+        len(self) / minibatch_size batches.
+        """
+        if fieldnames is None:
+            fieldnames = self.fieldNames()
+        if n_batches is None:
+            n_batches = len(self) / minibatch_size
+        return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
     def __getattr__(self,fieldname):
         """
@@ -312,10 +404,10 @@
         for field_slice in self.fields.values():
             min_col=min(min_col,field_slice.start)
             max_col=max(max_col,field_slice.stop)
-        new_fields={}
-        for field in self.fields:
-            new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
+        new_fields=LookupList()
+        for fieldname,fieldslice in self.fields.items():
+            new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
+        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
@@ -332,13 +424,13 @@
         """
         if self.fields:
             fieldnames,fieldslices=zip(*self.fields.items())
-            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
+            return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
         else:
             return self.data[i]
 
-    def __getslice__(self,*slice_args):
+    def __getslice__(self,*args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
-        return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
+        return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
     def __array__(self):
         """Return an view of this dataset which is an numpy.ndarray