changeset 1:2cd82666b9a7

Added statscollector and started writing dataset and learner.
author bengioy@esprit.iro.umontreal.ca
date Fri, 14 Mar 2008 11:28:08 -0400
parents 586dcaa4b2df
children 3fddb1c8f955
files dataset.py learner.py statscollector.py
diffstat 3 files changed, 153 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/dataset.py	Fri Mar 14 10:07:50 2008 -0400
+++ b/dataset.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,53 @@
+
+    
+class DataSet(object):
+    """Base class for representing a fixed-size or variable-size (online learning)
+    data set. A DataSet is used in a Learner to represent a training set or a
+    validation set. It is an indexed collection of examples. An example is expected
+    to obey the syntax of dictionaries, i.e., it contains named fields that can be
+    accessed via the [fieldname] syntax. If one views a DataSet as a matrix, the [i]
+    operator selects a row while the .fieldname operator selects a named 'field' or
+    column. However, each of the entries in one of these 'columns' can be any python
+    object, not just a number. One can also use the slicing notation to select a
+    subset of examples and the getFields method to select a subset of the fields."""
+
+    def __init__(self):
+        pass
+
+    def size(self):
+        """Return -1 for variable-size DataSets (for on-line learning), and the actual
+        size otherwise. This base implementation always returns 0."""
+        return 0
+
+    def fieldNames(self):
+        """Return the list of field names that are supported by getattr and getFields."""
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be
+        between 0 and size()-1. For on-line DataSets, the argument is ignored (and
+        should be -1 by convention to make it clear that it is not used), and
+        the next available example in the example stream is returned."""
+        return self.get_slice(i)
+
+    def __getslice__(self,*args):
+        """Return a DataSet that is a subset of self, by specifying either
+        an interval of indices or list of indices, in the standard slicing notation."""
+        return self.get_slice(slice(*args))
+
+    def get_slice(self,slice_or_index):
+        """This method should be redefined to do the actual work of slicing / getting an element."""
+        raise NotImplementedError
+
+    def __getattr__(self, attribute):
+        """Return a DataSet that only contains the requested attribute from the examples."""
+        if attribute.startswith('__') and attribute.endswith('__'):
+            raise AttributeError(attribute)  # protocol probes (hasattr, copy, pickle) expect this
+        raise NotImplementedError
+
+    def getFields(self,fields):
+        """Return a DataSet that only sees the fields named in the argument."""
+        raise NotImplementedError
+
+    
+    
--- a/learner.py	Fri Mar 14 10:07:50 2008 -0400
+++ b/learner.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,63 @@
+
+from dataset import *
+from statscollector import *
+    
+class Learner(object):
+    """Base class for learning algorithms, provides an interface
+    that allows various algorithms to be applicable to generic learning
+    algorithms.
+
+    A Learner often works in stages, and the user can control when
+    each stage is executed by calling train repeatedly (with a different
+    target_stage).
+    """
+
+    def __init__(self,training_set=None,verbosity=0):
+        self.stage=0
+        self.training_set=training_set
+        if training_set is not None:
+            assert isinstance(training_set,DataSet), "training_set should be a DataSet"
+        self.verbosity=verbosity
+
+
+    def train(self,target_stage=None,training_set=None,from_scratch=True,train_stats=None):
+        """The main method of a learner, to 'train' it. This modifies the Learner
+        so that its behavior upon a next call to use may be different.
+        The training set may have already been provided (by setting the training_set
+        attribute) or a new / different training set may be provided here (which will
+        set the training_set attribute). Training may occur in stages. The current
+        stage may be set by the user and all the stages until and including the target_stage
+        will be performed. If from_scratch then stage is set to 0 initially.
+        If the train_stats argument is provided, it should be a StatsCollector object.
+        In that case, performance statistics will be computed on the training set
+        and accumulated into the train_stats object. If from_scratch though, the
+        train_stats object will be cleared automatically. The same train_stats object
+        can thus be reused on consecutive calls to train (where all but the first
+        are from scratch).
+
+        Subclasses may call Learner.train to set stage and training_set according
+        to the above arguments. The actual training should then be performed
+        within the subclass train method. This base method ignores target_stage.
+        """
+        if from_scratch:
+            # Reset the attribute itself; a bare local 'stage' would be lost on return.
+            self.stage=0
+            if train_stats is not None:
+                train_stats.clear()
+        if training_set is not None:
+            assert isinstance(training_set,DataSet), "training_set should be a DataSet"
+            self.training_set=training_set
+
+        return
+
+
+    def use(self,input_dataset,test_stats=None,output_fields=None):
+        """Once a Learner has been trained by one or more calls to 'train', it can
+        be used with one or more calls to 'use'. The argument is a DataSet (possibly
+        containing a single example) and the result is a DataSet of the same size.
+        If output_fields is specified, it may be used to indicate which fields should
+        be constructed in the output DataSet (for example ['output','classification_error']).
+        If a test_stats object is provided, then performance statistics will be computed to
+        measure the performance of the learner on the given dataset, and accumulated into
+        the test_stats (hence it must be cleared before the call)."""
+        raise NotImplementedError
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/statscollector.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,37 @@
+
+from numpy import *
+
+class StatsCollector(object):
+    """A StatsCollector object is used to record performance statistics during training
+    or testing of a learner. It can be configured to measure different things and
+    accumulate the appropriate statistics. From these statistics it can be interrogated
+    to obtain performance measures of interest (such as maxima, minima, mean, standard
+    deviation, standard error, etc.). Optionally, the observations can be weighted
+    (yielding weighted mean, weighted variance, etc., where applicable). The statistics
+    that are desired can be specified among a list supported by the StatsCollector
+    class or subclass. When some statistics are requested, others become automatically
+    available (e.g., sum or mean)."""
+
+    # Statistics are identified by name, since __getattr__ receives the name as a string.
+    default_statistics = ['mean','standard_deviation','min','max']
+
+    def __init__(self,n_quantities_observed, statistics=default_statistics):
+        self.n_quantities_observed=n_quantities_observed
+        self.statistics=list(statistics)  # private copy: the default list is shared
+
+    def clear(self):
+        """Reset the accumulated statistics. Not implemented in this base class."""
+        raise NotImplementedError
+
+    def update(self,observations):
+        """The observations is a numpy vector of length n_quantities_observed. Some
+        entries can be 'missing' (with a NaN entry) and will not be counted in the
+        statistics."""
+        raise NotImplementedError
+
+    def __getattr__(self, statistic):
+        """Return a particular statistic, which may be inferred from the collected statistics.
+        The argument is a string naming that statistic. Not implemented in this base class."""
+        if statistic.startswith('__') and statistic.endswith('__'):
+            raise AttributeError(statistic)  # protocol probes (hasattr, copy, pickle) expect this
+        raise NotImplementedError