diff dataset.py @ 16:813723310d75

commenting
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 18:23:44 -0400
parents 88168361a5ab be128b9127c8
children 759d17112b23
line wrap: on
line diff
--- a/dataset.py	Tue Mar 25 13:38:51 2008 -0400
+++ b/dataset.py	Wed Mar 26 18:23:44 2008 -0400
@@ -1,60 +1,193 @@
+
+class Example(object):
+    """
+    An example is something that is like a tuple but whose elements can be named, so that
+    the following syntactic constructions work as one would expect:
+       example.x = [1, 2, 3] # set a field
+       x, y, z = example
+       x = example[0]
+       x = example["x"]
+    """
+    def __init__(self,names,values):
+        assert len(values)==len(names)
+        self.__dict__['values']=values
+        self.__dict__['fields']={}
+        for i in xrange(len(values)):
+            self.fields[names[i]]=i
+            
+    def __getitem__(self,i):
+        if isinstance(i,int):
+            return self.values[i]
+        else:
+            return self.values[self.fields[i]]
+    
+    def __setitem__(self,i,value):
+        if isinstance(i,int):
+            self.values[i]=value
+        else:
+            self.values[self.fields[i]]=value
+
+    def __getattr__(self,name):
+        return self.values[self.fields[name]]
+
+    def __setattr__(self,name,value):
+        self.values[self.fields[name]]=value
+
+    def __len__(self):
+        return len(self.values)
 
     
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over examples. It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
-    Examples and datasets optionally have named fields. 
-    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
-    returns not a single example but an array of length minibatch_size, i.e., an indexable
-    object with minibatch_size examples in it.
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways.  A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2,field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing), and which can profit
+    from the FiniteDataSetIterator, consider using base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+
     """
 
-    def __init__(self,minibatch_size=1):
-        assert minibatch_size>0
-        self.minibatch_size=minibatch_size
+    def __init__(self):
+        pass
+    
+    def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
 
-    def __iter__(self):
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self.  Every field of "i" will give access to
+        the field of a single example.  Fields should be accessible via
+        i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
+        """
+        raise NotImplementedError
+
+    def zip(self, *fieldnames):
         """
-        Return an iterator, whose next() method returns the next example or the next 
-        minibatch in the dataset. A minibatch (of length > 1) should be something one 
-        can iterate on again in order to obtain the individual examples. If the dataset 
-        has fields, then the example or the minibatch must have the same fields
-        (typically this is implemented by returning another smaller dataset, when
-        there are fields).
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
         """
         raise NotImplementedError
 
-    def __getattr__(self,fieldname):
-        """Return a sub-dataset containing only the given fieldname as field."""
-        return self(fieldname)
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
+        Return an iterator, whose next() method returns the next example or the next 
+        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
+        whose fields should be something one can iterate on again in order to obtain
+        the individual examples.
 
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
+        DataSet.zip returns an iterator over only the desired fields, and each field
+        of the iterator contains one example.
+
+        Return an iterator which sees only the specified fields (each fieldname is a
+        field key, typically a string). The value returned at each iteration
+        is a tuple with one element per field. Hence it can be used like this:
+           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
+              ... use f1, f2, and f3
+    If one iterates through minibatches of examples (with the minibatches() method
+    or with the minibatch_size argument of the zip() method), then the fields
+    returned by the iterator's next method should be iterators over the 
+    individual values within the minibatch (typically these will be arrays
+    with minibatch_size rows).
+        Similar to zip but iterate over minibatches.
+        Return a minibatch iterator, whose next() method returns an 'example'
+        whose fields are iterable objects (which can iterate over the individual
+        values of that field in the minibatch).
+        """
+        raise NotImplementedError
+    
+    def fieldNames(self):
+        """Return the list of field names in the examples of this dataset."""
         raise NotImplementedError
 
-    def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+    def rename(*new_field_specifications):
+        """
+        Return a new dataset that maps old fields (of self) to new fields (of the returned 
+        dataset). The minimal syntax that should be supported is the following:
+           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
+           new_field_spec = ([old_field1, old_field2, ...], new_field)
+        In general both old_field and new_field should be strings, but some datasets may also
+        support additional indexing schemes within each field (e.g. column slice
+        of a matrix-like field).
+        """
         raise NotImplementedError
 
 class FiniteDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
     Examples are indexed by an integer between 0 and self.length()-1,
-    and a subdataset can be obtained by slicing.
+    and a subdataset can be obtained by slicing. This may not be appropriate in general
+    but only for datasets which can be thought of like ones that access rows AND fields
+    in an efficient random access way. Users are encouraged to expect only the generic dataset
+    interface in general. A FiniteDataSet is mainly useful when one has to obtain
+    a subset of examples (e.g. for splitting a dataset into training and test sets).
     """
 
-    def __init__(self,minibatch_size):
-        DataSet.__init__(self,minibatch_size)
+    def __init__(self):
+        pass
 
     def __iter__(self):
         return FiniteDataSetIterator(self)
     
+    def zip(self,*fieldnames):
+        return FiniteDataSetIterator(self,1,fieldnames)
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+
+    def __getattr__(self,fieldname):
+        """Return an object that can iterate over the values of the field in this dataset."""
+        return self(fieldname)
+
+    def __call__(self,*fieldnames):
+        """Return a sub-dataset containing only the given fieldnames as fields.
+        
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
+        raise NotImplementedError
+
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError
@@ -68,32 +201,32 @@
         raise NotImplementedError
 
 class FiniteDataSetIterator(object):
-    def __init__(self,dataset):
+    """
+    If the fieldnames list is empty, it means that we want to see ALL the fields.
+    """
+    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
         self.dataset=dataset
-        self.current = -self.dataset.minibatch_size
-        
+        self.minibatch_size=minibatch_size
+        assert minibatch_size>=1 and minibatch_size<=len(dataset)
+        self.current = -self.minibatch_size
+        self.fieldnames = fieldnames
+
+    def __iter__(self):
+        return self
+    
     def next(self):
-        """
-        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        may be any indexable object, such as a numpy array. Following the array semantics of indexing
-        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        self.current+=self.dataset.minibatch_size
+        self.current+=self.minibatch_size
         if self.current>=len(self.dataset):
-            self.current=-self.dataset.minibatch_size
+            self.current=-self.minibatch_size
             raise StopIteration
-        if self.dataset.minibatch_size==1:
-            return self.dataset[self.current]
+        if self.minibatch_size==1:
+            complete_example=self.dataset[self.current]
         else:
-            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+        if self.fieldnames:
+            return Example(self.fieldnames,list(complete_example))
+        else:
+            return complete_example
 
 
 # we may want ArrayDataSet defined in another python file
@@ -102,27 +235,46 @@
 
 class ArrayDataSet(FiniteDataSet):
     """
-    An ArrayDataSet behaves like a numpy array but adds the notion of fields
-    and minibatch_size from DataSet. It is a  fixed-length and fixed-width dataset 
+    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
+    from DataSet (and the ability to view multiple field values as an 'Example').
+    It is a fixed-length and fixed-width dataset
     in which each element is a numpy array or a number, hence the whole 
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
     Any dataset can also be converted to a numpy array (losing the notion of fields
-    and of minibatch_size) by the numpy.array(dataset) call.
+    in the process) by the numpy.array(dataset) call.
     """
 
-    def __init__(self,dataset=None,data=None,fields={},minibatch_size=1):
+    def __init__(self,dataset=None,data=None,fields={}):
         """
-	There are two ways to construct an ArrayDataSet: (1) from an
-	existing dataset (which may result in a copy of the data in a numpy array),
-	or (2) from a numpy.array (the data argument), along with an optional description
-	of the fields (dictionary of column slices indexed by field names).
+        There are two ways to construct an ArrayDataSet: (1) from an
+        existing dataset (which may result in a copy of the data in a numpy array),
+        or (2) from a numpy.array (the data argument), along with an optional description
+        of the fields (dictionary of column slices indexed by field names).
         """
-        FiniteDataSet.__init__(self,minibatch_size)
         if dataset!=None:
             assert data==None and fields=={}
-            # convert dataset to an ArrayDataSet
+            # Make ONE big minibatch with all the examples, to separate the fields.
+            n_examples=len(dataset)
+            batch = dataset.minibatches(n_examples).next()
+            # Each field of the underlying dataset must be convertible to a numpy array of the same type
+            # currently just double, but should use the smallest compatible dtype
+            n_fields = len(batch)
+            fieldnames = batch.fields.keys()
+            total_width = 0
+            type = None
+            for i in xrange(n_fields):
+                field = array(batch[i])
+                assert field.shape[0]==n_examples
+                width = field.shape[1]
+                start=total_width
+                total_width += width
+                fields[fieldnames[i]]=slice(start,total_width,1)
+            # many complicated things remain to be done:
+            #  - find common dtype
+            #  - decide what to do with extra dimensions if not the same in all fields
+            #  - try to see if we can avoid the copy?
             raise NotImplementedError
         if data!=None:
             assert dataset==None
@@ -142,13 +294,12 @@
                     fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                 # and coherent with the data array
                 assert fieldslice.start>=0 and fieldslice.stop<=self.width
-        assert minibatch_size<=len(self.data)
 
     def __getattr__(self,fieldname):
         """
         Return a numpy array with the content associated with the given field name.
         If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
-        than the dataset.data) is returned.
+        than the dataset itself) is returned.
         """
         if len(self.data)==1:
             return self.data[0,self.fields[fieldname]]
@@ -164,7 +315,7 @@
         new_fields={}
         for field in self.fields:
             new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size)
+        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
@@ -176,13 +327,12 @@
     
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields
-        then a one-example dataset is returned (to be able to handle example.field accesses).
+        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
+        the result is just a numpy array (for the i-th row of the dataset data matrix).
         """
         if self.fields:
-            if isinstance(i,slice):
-                return ArrayDataSet(data=data[slice],fields=self.fields)
-            return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
+            fieldnames,fieldslices=zip(*self.fields.items())
+            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
         else:
             return self.data[i]
 
@@ -241,3 +391,5 @@
             result[:,slice(c,slice_width)]=self.data[:,field_slice]
             c+=slice_width
         return result
+
+