changeset 59:ac9aff8d5743

Automated merge with ssh://p-omega1@lgcm.iro.umontreal.ca/tlearn
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Thu, 01 May 2008 16:19:31 -0400
parents 1aabd2e2bb5f 17729d7104fa
children 9165d86855ab
diffstat 1 file changed, 60 insertions(+), 16 deletions(-)
--- a/dataset.py	Thu May 01 16:17:10 2008 -0400
+++ b/dataset.py	Thu May 01 16:19:31 2008 -0400
@@ -80,12 +80,14 @@
 
     * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-    * dataset['key'] returns a property associated with the given 'key' string.
-      If 'key' is a fieldname, then the VStacked field values (iterable over
-      field values) for that field is returned. Other keys may be supported
-      by different dataset subclasses. The following key names are should be supported:
+    * dataset[fieldname] returns an iterable over the values of the field
+      fieldname across the dataset (by default this iterable is obtained by
+      calling valuesVStack over the values for individual examples).
+
+    * dataset.<property> returns the value of a property associated with
+      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+          - 'fieldtypes': a list of types (one per field)
 
     Datasets can be concatenated either vertically (increasing the length) or
     horizontally (augmenting the set of fields), if they are compatible, using
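
A sketch of how the revised access patterns read at the call site, assuming a
concrete DataSet subclass instance ds with hypothetical fields 'input' and
'target':

    example = ds[3]            # a single Example
    subset  = ds[[1, 4, 7]]    # a dataset with examples 1, 4 and 7
    inputs  = ds['input']      # iterable over the 'input' field values
    print ds.description       # properties are now attributes, not keys
    print ds.fieldtypes        # a list of types, one per field
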
@@ -125,12 +127,12 @@
       * __iter__
     """
 
-    def __init__(self,description=None,field_types=None):
+    def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
             description = type(self).__name__ + " ( " + ",".join([x.__name__ for x in type(self).__bases__]) + " )"
         self.description=description
-        self.field_types=field_types
+        self.fieldtypes=fieldtypes
     
     class MinibatchToSingleExampleIterator(object):
         """
@@ -603,6 +605,7 @@
                 Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields)
         if self.hasFields(i):
             return self.fields[i]
+        assert i in self.__dict__ # else it means we are trying to access a nonexistent property
         return self.__dict__[i]
 
     def fieldNames(self):
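
The new assert makes a bad key fail loudly; a sketch of the three outcomes for
a hypothetical MinibatchDataSet ds:

    ds['input']        # a fieldname -> that field's values
    ds['description']  # not a field, but a property stored in ds.__dict__
    ds['no_such_key']  # neither -> AssertionError instead of a KeyError
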
@@ -874,13 +877,13 @@
     values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
     """
 
-    """
-    Construct an ArrayDataSet from the underlying numpy array (data) and
-    a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
-    using the standard arguments for indexing/slicing: integer for a column index,
-    slice for an interval of columns (with possible stride), or iterable of column indices.
-    """
     def __init__(self, data_array, fields_columns):
+        """
+        Construct an ArrayDataSet from the underlying numpy array (data) and
+        a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
+        using the standard arguments for indexing/slicing: integer for a column index,
+        slice for an interval of columns (with possible stride), or iterable of column indices.
+        """
         self.data=data_array
         self.fields_columns=fields_columns
 
@@ -906,8 +909,22 @@
     def __len__(self):
         return len(self.data)
 
-    #def __getitem__(self,i):
-    #    """More efficient implementation than the default"""
+    def __getitem__(self,i):
+        """More efficient implementation than the default __getitem__"""
+        fieldnames=self.fields_columns.keys()
+        if type(i) is int:
+            return Example(fieldnames,
+                           [self.data[i,self.fields_columns[f]] for f in fieldnames])
+        if type(i) in (slice,list):
+            return MinibatchDataSet(Example(fieldnames,
+                                            [self.data[i,self.fields_columns[f]] for f in fieldnames]))
+        # else check for a fieldname
+        if self.hasFields(i):
+            return Example([i],[self.data[:,self.fields_columns[i]]])
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a nonexistent property
+        return self.__dict__[i]
+
             
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         class ArrayDataSetIterator(object):
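
The new __getitem__ dispatches on the type of the key; a sketch of each
branch, reusing the hypothetical ds above:

    ex  = ds[2]           # int -> a single Example
    mb  = ds[0:4]         # slice -> a MinibatchDataSet
    mb2 = ds[[0, 2, 4]]   # list of indices -> a MinibatchDataSet
    col = ds['x0']        # fieldname -> an Example with that field's values
    arr = ds['data']      # anything else falls back to a dataset property
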
@@ -929,7 +946,34 @@
                 return self.minibatch
 
         return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
-        
+
+
+class CachedDataSet(DataSet):
+  """
+  Wrap a dataset whose values are computationally expensive to obtain
+  (e.g. because they involve some computation, or disk access),
+  so that repeated accesses to the same example are done cheaply,
+  by caching every example value that has been accessed at least once.
+
+  Optionally, for finite-length datasets, all the values can be computed
+  (and cached) upon construction of the CachedDataSet, rather than at the
+  first access.
+  """
+
+class ApplyFunctionDataSet(DataSet):
+  """
+  A dataset that contains as fields the results of applying a given function
+  example-wise or minibatch-wise to all the fields of an input dataset.
+  The output of the function should be an iterable (e.g. a list or a LookupList)
+  over the resulting values. In minibatch mode, the function is expected
+  to work on minibatches (it takes a minibatch as input and returns a
+  minibatch as output).
+
+  The function is applied each time an example or a minibatch is accessed.
+  To avoid redoing computation, wrap this dataset inside a CachedDataSet.
+  """
+
+
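
Neither class has an implementation yet; once they do, the intended
composition might read like this (a sketch only; the constructor signatures
are guesses based on the docstrings, and ds and f are hypothetical):

    # apply f to every example, then cache so f runs at most once per example
    transformed = ApplyFunctionDataSet(ds, f)
    cached = CachedDataSet(transformed)
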
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the