# HG changeset patch
# User bengioy@esprit.iro.umontreal.ca
# Date 1205508488 14400
# Node ID 2cd82666b9a7e428d05618a41cb4f1f3bc0b9483
# Parent  586dcaa4b2df64a9fde0e88f0af4f27b5a97f592
Added statscollector and started writing dataset and learner.

diff -r 586dcaa4b2df -r 2cd82666b9a7 dataset.py
--- a/dataset.py	Fri Mar 14 10:07:50 2008 -0400
+++ b/dataset.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,53 @@
+
+
+class DataSet(object):
+    """Base class for representing a fixed-size or variable-size (online learning)
+    data set. A DataSet is used in a Learner to represent a training set or a
+    validation set. It is an indexed collection of examples. An example
+    is expected to obey the syntax of dictionaries, i.e., it contains named
+    fields that can be accessed via the [fieldname] syntax.
+    If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname
+    operator selects a named 'field' or column. However, each of the entries in one of
+    these 'columns' can be any python object, not just a number. One can also
+    use the slicing notation to select a subset of examples and the getFields
+    method to select a subset of the fields."""
+
+    def __init__(self):
+        pass
+
+    def size(self):
+        """Return -1 for variable-size DataSets (for on-line learning), and
+        the actual size otherwise"""
+        return 0
+
+    def fieldNames(self):
+        """Return the list of field names that are supported by getattr and getFields."""
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be
+        between 0 and size()-1. For on-line DataSets, the argument is ignored (and
+        should be -1 by convention to make it clear that it is not used), and
+        the next available example in the example stream is returned."""
+        return self.get_slice(i)
+
+
+    def __getslice__(self,*args):
+        """Return a DataSet that is a subset of self, by specifying either
+        an interval of indices or list of indices, in the standard slicing notation."""
+        return self.get_slice(slice(*args))
+
+    def get_slice(self,slice_or_index):
+        """This method should be redefined to do the actual work of slicing / getting an element."""
+        raise NotImplementedError
+
+    def __getattr__(self, attribute):
+        """Return a DataSet that only contains the requested attribute from the examples."""
+        raise NotImplementedError
+
+    def getFields(self,fields):
+        """Return a DataSet that only sees the fields named in the argument."""
+        raise NotImplementedError
+
+
+
diff -r 586dcaa4b2df -r 2cd82666b9a7 learner.py
--- a/learner.py	Fri Mar 14 10:07:50 2008 -0400
+++ b/learner.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,63 @@
+
+from dataset import *
+from statscollector import *
+
+class Learner(object):
+    """Base class for learning algorithms, provides an interface
+    that allows various algorithms to be applicable to generic learning
+    algorithms.
+
+    A Learner often works in stages, and the user can control when
+    each stage is executed by calling train repeatedly (with a different
+    target_stage).
+    """
+
+    def __init__(self,training_set=None,verbosity=0):
+        self.stage=0
+        if training_set is not None:
+            assert isinstance(training_set,DataSet), "training_set should be a DataSet"
+        self.training_set=training_set
+        self.verbosity=verbosity
+
+
+    def train(self,target_stage=None,training_set=None,from_scratch=True,train_stats=None):
+        """The main method of a learner, to 'train' it. This modifies the Learner
+        so that its behavior upon a next call to use may be different.
+        The training set may have already been provided (by setting the training_set
+        attribute) or a new / different training set may be provided here (which will
+        set the training_set attribute). Training may occur in stages. The current
+        stage may be set by the user and all the stages until and including the target_stage
+        will be performed. If from_scratch then stage is set to 0 initially.
+        If the train_stats argument is provided, it should be a StatsCollector object.
+        In that case, performance statistics will be computed on the training set
+        and accumulated into the train_stats object. If from_scratch though, the
+        train_stats object will be cleared automatically. The same train_stats object
+        can thus be reused on consecutive calls to train (where all but the first
+        are from scratch).
+
+        Subclasses may call Learner.train to set stage and training_set according
+        to the above arguments. The actual training should then be performed
+        within the subclass train method.
+        """
+        if from_scratch:
+            self.stage=0
+            if train_stats is not None:
+                train_stats.clear()
+        if training_set is not None:
+            # Validate before storing so that a bad argument never replaces self.training_set.
+            assert isinstance(training_set,DataSet), "training_set should be a DataSet"
+            self.training_set=training_set
+
+        return
+
+
+    def use(self,input_dataset,test_stats=None,output_fields=None):
+        """Once a Learner has been trained by one or more call to 'train', it can
+        be used with one or more calls to 'use'. The argument is a DataSet (possibly
+        containing a single example) and the result is a DataSet of the same size.
+        If output_fields is specified, it may be used to indicate which fields should
+        be constructed in the output DataSet (for example ['output','classification_error']).
+        If a test_stats object is provided, then performance statistics will be computed to
+        measure the performance of the learner on the given dataset, and accumulated into
+        the test_stats (hence it must be cleared before the call)."""
+        raise NotImplementedError
diff -r 586dcaa4b2df -r 2cd82666b9a7 statscollector.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/statscollector.py	Fri Mar 14 11:28:08 2008 -0400
@@ -0,0 +1,37 @@
+
+from numpy import *
+
+class StatsCollector(object):
+    """A StatsCollector object is used to record performance statistics during training
+    or testing of a learner. It can be configured to measure different things and
+    accumulate the appropriate statistics. From these statistics it can be interrogated
+    to obtain performance measures of interest (such as maxima, minima, mean, standard
+    deviation, standard error, etc.). Optionally, the observations can be weighted
+    (yielding weighted mean, weighted variance, etc., where applicable). The statistics
+    that are desired can be specified among a list supported by the StatsCollector
+    class or subclass. When some statistics are requested, others become automatically
+    available (e.g., sum or mean)."""
+
+    # numpy exports the standard deviation as std; it defines no standard_deviation name.
+    default_statistics = [mean,std,min,max]
+
+    def __init__(self,n_quantities_observed, statistics=default_statistics):
+        self.n_quantities_observed=n_quantities_observed
+        self.statistics=list(statistics)  # copy: the default list is shared by the class
+
+    def clear(self):
+        raise NotImplementedError
+
+    def update(self,observations):
+        """The observations is a numpy vector of length n_quantities_observed. Some
+        entries can be 'missing' (with a NaN entry) and will not be counted in the
+        statistics."""
+        raise NotImplementedError
+
+    def __getattr__(self, statistic):
+        """Return a particular statistic, which may be inferred from the collected statistics.
+        The argument is a string naming that statistic."""
+        raise NotImplementedError
+
+
+