changeset 57:1aabd2e2bb5f

Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Tue, 29 Apr 2008 17:45:16 -0400
parents 1729ad44f175
children ac9aff8d5743
files dataset.py
diffstat 1 files changed, 37 insertions(+), 8 deletions(-)
--- a/dataset.py	Tue Apr 29 16:09:17 2008 -0400
+++ b/dataset.py	Tue Apr 29 17:45:16 2008 -0400
@@ -80,12 +80,14 @@
 
     * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-    * dataset['key'] returns a property associated with the given 'key' string.
-      If 'key' is a fieldname, then the VStacked field values (iterable over
-      field values) for that field is returned. Other keys may be supported
-      by different dataset subclasses. The following key names are should be supported:
+    * dataset[fieldname] returns an iterable over the values of the field fieldname
+      across the dataset (by default, the iterable is obtained by calling valuesVStack
+      over the values of the individual examples).
+
+    * dataset.<property> returns the value of a property associated with
+      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+          - 'fieldtypes': a list of types (one per field)
 
     Datasets can be concatenated either vertically (increasing the length) or
     horizontally (augmenting the set of fields), if they are compatible, using
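# A minimal usage sketch of the indexing and property conventions documented in
# the hunk above; the dataset instance and field name below are hypothetical and
# not part of this changeset.
for value in some_dataset['input']:      # iterate over one field across the dataset
    pass
subset = some_dataset[[1, 5, 7]]         # a new dataset holding examples 1, 5 and 7
print some_dataset.description           # textual description or name of the dataset
print some_dataset.fieldtypes            # a list of types, one per field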
@@ -125,12 +127,12 @@
       * __iter__
     """
 
-    def __init__(self,description=None,field_types=None):
+    def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
             description = type(self).__name__ + " ( " + " ".join([x.__name__ for x in type(self).__bases__]) + " )"
         self.description=description
-        self.field_types=field_types
+        self.fieldtypes=fieldtypes
     
     class MinibatchToSingleExampleIterator(object):
         """
@@ -944,7 +946,34 @@
                 return self.minibatch
 
         return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
-        
+
+
+class CachedDataSet(DataSet):
+  """
+  Wrap a dataset whose values are expensive to obtain (e.g. because they
+  require some computation or disk access), so that repeated accesses to the
+  same example are cheap, by caching every example value that has been
+  accessed at least once.
+
+  Optionally, for a finite-length dataset, all the values can be computed
+  (and cached) upon construction of the CachedDataSet, rather than at the
+  first access.
+  """
+
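# A minimal, self-contained sketch of the caching idea described in the
# docstring above; CachedDataSet itself is left unimplemented in this
# changeset, so the wrapper below is an illustration, not its actual API.
class _ExampleCache(object):
    def __init__(self, dataset):
        self.dataset = dataset
        self.cache = {}                      # example index -> cached value
    def __getitem__(self, i):
        if i not in self.cache:              # expensive computation / disk access
            self.cache[i] = self.dataset[i]  # happens only on the first access
        return self.cache[i]                 # later accesses hit the cache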
+class ApplyFunctionDataSet(DataSet):
+  """
+  A dataset that contains as fields the results of applying a given function
+  example-wise or minibatch-wise to all the fields of an input dataset.
+  The output of the function should be an iterable (e.g. a list or a LookupList)
+  over the resulting values. In minibatch mode, the function is expected
+  to work on minibatches (it takes a minibatch as input and returns a
+  minibatch as output).
+
+  The function is applied each time an example or a minibatch is accessed;
+  to avoid redoing the computation, wrap this dataset inside a CachedDataSet.
+  """
+  
+
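# A minimal sketch of the example-wise behaviour described in the docstring
# above; ApplyFunctionDataSet itself is left unimplemented in this changeset,
# so the generator below is an illustration, not its actual API.
def _apply_function_examplewise(dataset, f):
    # f maps the fields of one example to an iterable of output field values;
    # it is re-applied on every pass, which is why wrapping the result in a
    # CachedDataSet is suggested above.
    for example in dataset:
        yield f(example)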
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the