diff dataset.py @ 20:266c68cb6136

Minor edits, plus adding an untested ApplyFunctionDataSet for the GradientLearner in the works.
author bengioy@bengiomac.local
date Mon, 07 Apr 2008 09:48:39 -0400
parents 57f4015e2e09
children b6b36f65664f
line wrap: on
line diff
--- a/dataset.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/dataset.py	Mon Apr 07 09:48:39 2008 -0400
@@ -10,12 +10,13 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feed on-line learning.
+    feeds on-line learning.
 
     To iterate over examples, there are several possibilities:
-    - for i in dataset.zip(field1, field2,field3, ...)
-    - for i in dataset.minibatches(N, field1, field2, ...)
-    - for i in dataset
+    - for example in dataset.zip([field1, field2, field3, ...])
+    - for val1, val2, val3 in dataset.zip([field1, field2, field3])
+    - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N)
+    - for example in dataset
     Each of these is documented below.
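+
+    For instance (an illustrative sketch only, assuming a dataset whose fields
+    are named 'x' and 'y'):
+
+        for x, y in dataset.zip(['x', 'y']):
+            print x, y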
 
     Note: For a dataset of fixed and known length, which can implement item
@@ -36,19 +37,19 @@
 
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self.  Every field of "i" will give access to
-        a the field of a single example.  Fields should be accessible via
-        i[identifier], but the derived class is free to accept any type of
-        identifier, and add extra functionality to the iterator.
+        a field of a single example.  Fields should be accessible via
+        i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free
+        to accept any type of identifier, and add extra functionality to the iterator.
         """
-        raise AbstractFunction()
+        return self.zip(*self.fieldNames())
 
     def zip(self, *fieldnames):
         """
         Supports two forms of syntax:
 
-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.zip([f1, f2, f3]): ...
 
-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...
 
         Using the first syntax, "i" will be an indexable object, such as a list,
         tuple, or Example instance, such that on every iteration, i[0] is the f1
@@ -120,7 +121,27 @@
         # This list may not be finite; what would make sense in the use you have
         # in mind?
         # -JB
-        """Return the list of field names in the examples of this dataset."""
+        #James-
+        # You are right. I had put this to be able to iterate over the fields
+        # but maybe an iterator mechanism (over fields rather than examples)
+        # would be more appropriate. Fieldnames are needed in general
+        # by the iterators over examples or minibatches, to construct
+        # examples or minibatches with the corresponding names as attributes.
+        # -YB
+        """
+        Return an iterator (an object with an __iter__ method) that
+        iterates over the names of the fields. As a special cases,
+        a list or a tuple of field names can be returned.
+        """"
+        # Note that some datasets
+        # may have virtual fields and support a virtually infinite number
+        # of possible field names. In that case, fieldNames() should
+        # either raise an error or iterate over a particular set of
+        # names as appropriate. Another option would be to iterate
+        # over the sub-datasets comprising a single field at a time.
+        # I am not sure yet what is most appropriate.
+        #  -YB
+        """
         raise AbstractFunction()
 
     def rename(*new_field_specifications):
@@ -129,6 +150,9 @@
         # Wouldn't this functionality be easier to provide via a
         # RenamingDataSet, such as the one I've written below?
         # -JB
+        # You are right. Whichever implementation we choose, however, we need a generic way to
+        # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics.
+        # -YB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned 
         dataset). The minimal syntax that should be supported is the following:
@@ -140,6 +164,24 @@
         """
         raise AbstractFunction()
 
+
+    def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        """
+        Return a dataset that contains as fields the results of applying
+        the given function (example-wise) to the specified input_fields. The
+        function should return a sequence whose elements will be stored in
+        fields whose names are given in the output_fields list. If copy_inputs
+        is True then the resulting dataset will also contain the fields of self.
+        If accept_minibatches, then the function may be called
+        with minibatches as arguments (what is returned by the minibatches
+        iterator). In any case, the computations may be delayed until the examples
+        of the resulting dataset are requested. If cache is True, then
+        once the output fields for some examples have been computed, they
+        are cached (to avoid recomputation if the same examples are
+        requested again).
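+
+        An illustrative usage sketch (the field names 'x' and 'x2' are hypothetical):
+
+            squares = dataset.apply_function(lambda x: (x*x,), ['x'], ['x2'])
+            for example in squares:
+                print example['x2']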
+        """
+        return ApplyFunctionDataSet(self, function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+
 class RenamingDataSet(DataSet):
     """A DataSet that wraps another one, and makes it look like the field names
     are different
@@ -287,9 +329,9 @@
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
-    from DataSet (and the ability to view multiple field values as an 'Example').
+    from DataSet (and the ability to view the values of multiple fields as an 'Example').
     It is a  fixed-length and fixed-width dataset 
-    in which each element is a numpy array or a number, hence the whole 
+    in which each element is a fixed-dimension numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
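+
+    An illustrative construction sketch (here 'fields' is assumed to be a mapping
+    from field name to a slice of columns):
+
+        data = numpy.random.rand(100, 3)
+        dataset = ArrayDataSet(data, fields={'input': slice(0, 2), 'target': slice(2, 3)})
+        for example in dataset:
+            print example['input'], example['target']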
@@ -382,9 +424,6 @@
                 # and coherent with the data array
                 assert fieldslice.start >= 0 and fieldslice.stop <= cols
 
-    def __iter__(self):
-        return self.zip(*self.fieldNames())
-
     def minibatches(self,
             fieldnames = DataSet.minibatches_fieldnames,
             minibatch_size = DataSet.minibatches_minibatch_size,
@@ -446,7 +485,8 @@
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
     def __array__(self):
-        """Return an view of this dataset which is an numpy.ndarray
+        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
+        the identity and name of fields within the dataset).
 
         Numpy uses this special function name to retrieve an ndarray view for
         function such as numpy.sum, numpy.dot, numpy.asarray, etc.
@@ -454,6 +494,9 @@
         If this dataset has no fields, then we simply return self.data,
         otherwise things are complicated. 
         - why do we want this behaviour when there are fields? (JB)
+        - for convenience and completeness (but maybe it would make
+          more sense to implement this through a 'field-merging'
+          dataset). (YB)
         """
         if not self.fields:
             return self.data
@@ -497,4 +540,75 @@
             c+=slice_width
         return result
 
+class ApplyFunctionDataSet(DataSet):
+    """
+    A dataset that contains as fields the results of applying
+    a given function (example-wise) to specified input_fields of a source
+    dataset. The function should return a sequence whose elements will be stored in
+    fields whose names are given in the output_fields list. If copy_inputs
+    is True then the resulting dataset will also contain the fields of the source
+    dataset. If accept_minibatches, then the function expects
+    minibatches as arguments (what is returned by the minibatches
+    iterator). In any case, the computations may be delayed until the examples
+    of self are requested. If cache is True, then
+    once the output fields for some examples have been computed, they
+    are cached (to avoid recomputation if the same examples are requested again).
+    """
+    def __init__(self, src, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        DataSet.__init__(self)
+        self.src=src
+        self.function=function
+        self.input_fields=input_fields
+        self.output_fields=output_fields
+        self.copy_inputs=copy_inputs
+        self.accept_minibatches=accept_minibatches
+        src_fieldnames = src.fieldNames()
+        if copy_inputs:
+            for src_field in src_fieldnames:
+                assert src_field not in output_fields
+            self.fieldnames=src_fieldnames+output_fields
+        else:
+            self.fieldnames=output_fields
+        for input_field in input_fields:
+            assert input_field in src_fieldnames
+        self.cache=cache
+        if cache:
+            # maybe a fixed-size array kind of structure would be more efficient than a list
+            # in the case where src is FiniteDataSet. -YB
+            self.cached_examples = [] 
 
+    def fieldNames(self): return self.fieldnames
+    
+    def minibatches(self,
+                    fieldnames = DataSet.minibatches_fieldnames,
+                    minibatch_size = DataSet.minibatches_minibatch_size,
+                    n_batches = DataSet.minibatches_n_batches):
+
+        class Iterator(LookupList):
+
+            def __init__(self,dataset):
+                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                self.dataset=dataset
+                if dataset.copy_inputs:
+                    src_fields=dataset.src.fieldNames()
+                else:
+                    src_fields=dataset.input_fields
+                self.src_iterator=dataset.src.minibatches(src_fields,minibatch_size,n_batches)
+
+            def __iter__(self):
+                return self
+
+            def next(self):
+                src_examples = self.src_iterator.next()
+                if self.dataset.copy_inputs:
+                    function_inputs = src_examples
+                else:
+                    function_inputs = [src_examples[field_name]
+                                       for field_name in self.dataset.input_fields]
+                return self.dataset.function(*function_inputs)
+
+        for fieldname in fieldnames:
+            assert fieldname in self.fieldnames
+        return Iterator(self)
+
+
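+# An illustrative usage sketch for ApplyFunctionDataSet (this class is still
+# untested; the field names below are hypothetical):
+#
+#   sums = ApplyFunctionDataSet(base_dataset, lambda x, y: (x + y,),
+#                               input_fields=['x', 'y'], output_fields=['sum'])
+#   for minibatch in sums.minibatches(['sum'], minibatch_size=16):
+#       ...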