diff dataset.py @ 1:2cd82666b9a7

Added statscollector and started writing dataset and learner.
author bengioy@esprit.iro.umontreal.ca
date Fri, 14 Mar 2008 11:28:08 -0400
parents 586dcaa4b2df
children 3fddb1c8f955
line wrap: on
line diff
--- a/dataset.py	Fri Mar 14 10:07:50 2008 -0400
+++ b/dataset.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,53 @@
+
+    
+class DataSet(object):
+    """Base class for representing a fixed-size or variable-size (online learning)
+    data set. A DataSet is used in a Learner to represent a training set or a
+    validation set. It is an indexed collection of examples. An example
+    is expected to obey the syntax of dictionaries, i.e., it contains named
+    fields that can be accessed via the [fieldname] syntax.
+    If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname
+    operator selects a named 'field' or column. However, each of the entries in one of
+    these 'columns' can be any python object, not just a number. One can also
+    use the slicing notation to select a subset of examples and the getFields
+    method to select a subset of the fields."""
+
+    def __init__(self):
+        pass
+
+    def size(self):
+        """Return -1 for variable-size DataSets (for on-line learning), and
+        the actual size otherwise. This base implementation returns 0."""
+        return 0
+
+    def fieldNames(self):
+        """Return the list of field names that are supported by getattr and getFields."""
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be
+        between 0 and size()-1. For on-line DataSets, the argument is ignored (and
+        should be -1 by convention to make it clear that it is not used), and
+        the next available example in the example stream is returned."""
+        return self.get_slice(i)
+
+    # Only Python 2 calls __getslice__; Python 3 passes slice objects to __getitem__.
+    def __getslice__(self, *args):
+        """Return a DataSet that is a subset of self, by specifying either
+        an interval of indices or list of indices, in the standard slicing notation."""
+        return self.get_slice(slice(*args))
+
+    def get_slice(self, slice_or_index):
+        """This method should be redefined to do the actual work of slicing / getting an element."""
+        raise NotImplementedError
+
+    def __getattr__(self, attribute):
+        """Return a DataSet that only contains the requested attribute from the examples."""
+        raise NotImplementedError
+
+    def getFields(self, fields):
+        """Return a DataSet that only sees the fields named in the argument."""
+        raise NotImplementedError
+
+    
+