changeset 2:3fddb1c8f955

Rewrote DataSet interface and created FiniteDataSet interface.
author bengioy@bengiomac.local
date Sat, 22 Mar 2008 22:21:59 -0400
parents 2cd82666b9a7
children 378b68d5c4ad
files dataset.py
diffstat 1 files changed, 41 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- a/dataset.py	Fri Mar 14 11:28:08 2008 -0400
+++ b/dataset.py	Sat Mar 22 22:21:59 2008 -0400
@@ -1,53 +1,58 @@
 
     
 class DataSet(object):
-    """Base class for representing a fixed-size or variable-size (online learning)
-    data set. A DataSet is used in a Learner to represent a training set or a
-    validation set. It is an indexed collection of examples. An example
-    is expected to obey the syntax of dictionaries, i.e., it contains named
-    fields that can be accessed via the [fieldname] syntax.
-    If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname
-    operator selects a named 'field' or column. However, each of the entries in one of
-    these 'columns' can be any python object, not just a number. One can also
-    use the slicing notation to select a subset of example and the getFields
-    method to select a subset of the fields."""
+    """
+    This is a virtual base class or interface for datasets.
+    A dataset is basically an iterator over examples. It does not necessarily
+    have a fixed length (this is useful for 'streams' which feed on-line learning).
+    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
+    Examples and datasets have named fields. 
+    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
+    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+    The content of a field can be of any type, but often will be a numpy tensor.
+    """
 
-    __init__(self):
+    def __init__(self):
         pass
 
-    size(self):
-        """Return -1 for variable-size DataSets (for on-line learning), and
-        the actual size otherwise"""
-        return 0
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Return the next example in the dataset (abstract: subclasses must override)."""
+        raise NotImplementedError
+
+    def __getattr__(self, fieldname):
+        """Return a sub-dataset with only the field `fieldname`, by delegating to self(fieldname)."""
+        return self(fieldname)
+
+    def __call__(self, *fieldnames):
+        """Return a sub-dataset containing only the given fieldnames as fields (abstract here)."""
+        raise NotImplementedError
 
     fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
         raise NotImplementedError
 
-    __getitem__(self, i):
-        """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be
-        between 0 and size()-1. For on-line DataSets, the argument is ignored (and
-        should be -1 by convention to make it clear that it is not used), and
-        the next available example in the example stream is returned."""
-        return self.get_slice(i)
-        
+class FiniteDataSet(DataSet):
+    """
+    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
+    Examples are indexed by an integer between 0 and len(self)-1,
+    and a subdataset can be obtained by slicing.
+    """
 
-    __getslice__(self,*args):
-        """Return a DataSet that is a subset of self, by specifying either
-        an interval of indices or list of indices, in the standard slicing notation."""
-        return self.get_slice(slice(*args))
+    def __init__(self):
+        pass
 
-    get_slice(self,slice_or_index):
-        """This method should be redefined to do the actual work of slicing / getting an element."""
+    def __len__(self):
+        """len(dataset) returns the number of examples in the dataset (abstract: raises NotImplementedError here)."""
+        raise NotImplementedError
+    
+    def __getitem__(self,i):
+        """dataset[i] returns the (i+1)-th example of the dataset, i.e. i is 0-based (abstract here)."""
         raise NotImplementedError
 
-    __getattr__(self, attribute):
-        """Return a DataSet that only contains the requested attribute from the examples."""
+    def __getslice__(self,*slice_args):
+        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1 (abstract here)."""
         raise NotImplementedError
-
-    getFields(self,fields):
-        """Return an DataSet that only sees the fields named in the argument."""
-        raise NotImplementedError
-
     
-