changeset 16:813723310d75

commenting
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 18:23:44 -0400
parents 88168361a5ab (diff) be128b9127c8 (current diff)
children 759d17112b23
files dataset.py
diffstat 1 files changed, 89 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/dataset.py	Wed Mar 26 15:01:30 2008 -0400
+++ b/dataset.py	Wed Mar 26 18:23:44 2008 -0400
@@ -38,48 +38,98 @@
 
     
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over Examples (or anything that
-    behaves like an Example). It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are instances of FiniteDataSet, a subclass
-    which supports indexing (dataset[i]) and slicing (dataset[1000:2000]).
-    To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...)
-    method which returns an iterator over only the desired fields.
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    If one iterates through minibatches of examples (with the minibatches() method
-    or with the minibatch_size argument of the zip() method), then the fields
-    returned by the iterator's next method should be iterators over the 
-    individual values within the minibatch (typically these will be arrays
-    with minibatch_size rows).
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways.  A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2, field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing), and which can profit
+    from the FiniteDataSetIterator, consider using base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+
     """
 
     def __init__(self):
         pass
     
     def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
+
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self.  Every field of "i" will give access to
+    the field of a single example.  Fields should be accessible via
+        i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
         """
+        raise NotImplementedError
+
+    def zip(self, *fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
+        """
+        raise NotImplementedError
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.minibatches(N, f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.minibatches(N, f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        minibatch of the f1 field, i[1] a minibatch of the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will each contain a minibatch of the
+        f1, f2, and f3 fields (one minibatch per loop iteration).
+
+        The derived class may accept fieldname arguments of any type.
+
         Return an iterator, whose next() method returns the next example or the next 
         minibatch in the dataset. A minibatch (of length > 1) is also an example, but
         whose fields should be something one can iterate on again in order to obtain
         the individual examples.
-        """
-        raise NotImplementedError
 
-    def zip(self,*fieldnames):
-        """
+        DataSet.zip returns an iterator over only the desired fields, and each field
+        of the iterator contains one example.
+
         Return an iterator which sees only the specified fields (each fieldname is a
         field key, typically a string). The value returned at each iteration
         is a tuple with one element per field. Hence it can be used like this:
            for f1, f2, f3 in dataset.zip('field1','field2','field3'):
               ... use f1, f2, and f3
-        """
-        raise NotImplementedError
-
-    def minibatches(self,minibatch_size,*fieldnames):
-        """
+        If one iterates through minibatches of examples (with the minibatches() method
+        or with the minibatch_size argument of the zip() method), then the fields
+        returned by the iterator's next method should be iterators over the
+        individual values within the minibatch (typically these will be arrays
+        with minibatch_size rows).
         Similar to zip but iterate over minibatches.
         Return a minibatch iterator, whose next() method returns an 'example'
         whose fields are iteratable objects (which can iterate over the individual
@@ -131,7 +181,11 @@
         return self(fieldname)
 
     def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
+        """Return a sub-dataset containing only the given fieldnames as fields.
+        
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
         raise NotImplementedError
 
     def __len__(self):
@@ -287,6 +341,15 @@
         return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
 
     def __array__(self):
+        """Return a view of this dataset which is a numpy.ndarray
+
+        Numpy uses this special function name to retrieve an ndarray view for
+        functions such as numpy.sum, numpy.dot, numpy.asarray, etc.
+
+        If this dataset has no fields, then we simply return self.data,
+        otherwise things are complicated. 
+        - why do we want this behaviour when there are fields? (JB)
+        """
         if not self.fields:
             return self.data
         # else, select subsets of columns mapped by the fields