view statscollector.py @ 216:4b7e89b75e2b

Modified ArrayDataSet's handling of column fields. Previously, if a fieldname was associated with an integer column index (as opposed to a column range or slice), then it would be returned as an Nx1 matrix. Now, if a fieldname is associated with an integer column index, it will make a field which is a vector of length N. The old behaviour can still be achieved by associating a fieldname with slice(col, col+1).
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 22 May 2008 19:07:51 -0400
parents 50a8302addaf
children fe57b96f33d4
line wrap: on
line source


# Here is how I see stats collectors:

#    def my_stats((residue,nll),(regularizer)):
#            mse=examplewise_mean(square_norm(residue))
#            training_loss=regularizer+examplewise_sum(nll)
#            set_names(locals())
#            return ((residue,nll),(regularizer),(),(mse,training_loss))
#    my_stats_collector = make_stats_collector(my_stats)
#
# where make_stats_collector calls my_stats(examplewise_fields, attributes) to
# construct its update function, and figure out what are the input fields (here "residue"
# and "nll") and input attributes (here "regularizer") it needs, and the output
# attributes that it computes (here "mse" and "training_loss"). Remember that
# fields are examplewise quantities, but attributes are not, in my jargon.
# In the above example, I am highlighting that some operations done in my_stats
# are examplewise and some are not.  I am hoping that theano Ops can do these
# kinds of internal side-effect operations (and proper initialization of these hidden
# variables). I expect that a StatsCollector (returned by make_stats_collector)
# knows the following methods:
#     stats_collector.input_fieldnames
#     stats_collector.input_attribute_names
#     stats_collector.output_attribute_names
#     stats_collector.update(mini_dataset)
#     stats_collector['mse']
# where mini_dataset has the input_fieldnames() as fields and the input_attribute_names()
# as attributes, and in the resulting dataset the output_attribute_names() are set to the
# proper numeric values.



import theano
from theano import tensor as t
from Learner import Learner
from lookup_list import LookupList

class StatsCollectorModel(AttributesHolder):
    """Hold the current values of the outputs described by a StatsCollector.

    On construction, a theano function is compiled from the collector's
    symbolic input attributes and input fields to its symbolic outputs.
    Each call to update() evaluates that function on a dataset and stores
    the results both in self.outputs and as attributes of this object, one
    attribute per output name.
    """
    # NOTE(review): AttributesHolder is not imported in the visible part of
    # this file -- confirm where it is brought into scope.

    def __init__(self,stats_collector):
        """Compile the update function for stats_collector.

        stats_collector must provide input_attributes, input_fields and
        outputs (symbolic variables) as well as output_names.
        """
        self.stats_collector = stats_collector
        self.outputs = LookupList(stats_collector.output_names,[None for name in stats_collector.output_names])
        # the statistics get initialized here
        # The symbolic variables live on the collector; there are no such
        # names in this scope.
        self.update_function = theano.function(
            list(stats_collector.input_attributes)+list(stats_collector.input_fields),
            stats_collector.outputs,
            linker="c|py")
        self._publish_outputs()

    def _publish_outputs(self):
        """Mirror every (name, value) pair of self.outputs as an attribute of self."""
        for name,value in self.outputs.items():
            # setattr() is the supported spelling; plain Python objects have
            # no __setattribute__ method.
            setattr(self,name,value)

    def update(self,dataset):
        """Evaluate the update function on dataset and store the new output values.

        dataset must provide fields() and getAttributes(); the input field
        names and input attribute names of the collector are looked up in it.
        """
        input_fields = dataset.fields()(self.stats_collector.input_field_names)
        input_attributes = dataset.getAttributes(self.stats_collector.input_attribute_names)
        # The function was compiled with one formal input per variable, so
        # the values are passed positionally rather than as a single list.
        self.outputs._values = self.update_function(*(input_attributes+input_fields))
        self._publish_outputs()

    def __call__(self):
        """Return the LookupList holding the current output values."""
        return self.outputs

    def attributeNames(self):
        """Return the names of the outputs held by this model."""
        return self.outputs.keys()
    
class StatsCollector(AttributesHolder):
    """Symbolic description of a set of statistics.

    Stores the symbolic input attributes, input fields and outputs it is
    given, together with the name of each one (read from the variable's
    .name). Calling the collector builds a StatsCollectorModel from it.
    """
    # NOTE(review): AttributesHolder is not imported in the visible part of
    # this file -- confirm where it is brought into scope.

    def __init__(self,input_attributes, input_fields, outputs):
        """Record the symbolic variables and collect their names.

        input_attributes, input_fields, outputs: iterables of objects that
        each carry a .name attribute.
        """
        self.input_attributes = input_attributes
        self.input_fields = input_fields
        self.outputs = outputs
        self.input_attribute_names = [v.name for v in input_attributes]
        self.input_field_names = [v.name for v in input_fields]
        # The output names come from the `outputs` argument.
        self.output_names = [v.name for v in outputs]

    def __call__(self,dataset=None):
        """Return a new StatsCollectorModel, updated with dataset when one is given."""
        model = StatsCollectorModel(self)
        # Test for presence explicitly so that the decision does not depend
        # on the truth value (e.g. the length) of the dataset object.
        if dataset is not None:
            # update() is defined on the model, not on the collector.
            model.update(dataset)
        return model

if __name__ == '__main__':
    def my_statscollector():
        """Build an example StatsCollector.

        Inputs are one scalar attribute (regularizer) and two matrix fields
        (nll, class_error); outputs are total_loss, avg_nll and
        avg_class_error. Every variable is given its own identifier as name.
        """
        # NOTE(review): examplewise_sum / examplewise_mean are assumed to be
        # provided by theano.tensor -- confirm.
        regularizer = t.scalar()
        nll = t.matrix()
        class_error = t.matrix()
        total_loss = regularizer+t.examplewise_sum(nll)
        avg_nll = t.examplewise_mean(nll)
        avg_class_error = t.examplewise_mean(class_error)
        # Label each symbolic variable with the identifier it is bound to.
        labelled = {
            'regularizer': regularizer,
            'nll': nll,
            'class_error': class_error,
            'total_loss': total_loss,
            'avg_nll': avg_nll,
            'avg_class_error': avg_class_error,
        }
        for label in labelled:
            labelled[label].name = label
        attributes = [regularizer]
        fields = [nll,class_error]
        statistics = [total_loss,avg_nll,avg_class_error]
        return StatsCollector(attributes,fields,statistics)
    



# OLD DESIGN:
#
# class StatsCollector(object):
#     """A StatsCollector object is used to record performance statistics during training
#     or testing of a learner. It can be configured to measure different things and
#     accumulate the appropriate statistics. From these statistics it can be interrogated
#     to obtain performance measures of interest (such as maxima, minima, mean, standard
#     deviation, standard error, etc.). Optionally, the observations can be weighted
#     (yielding weighted mean, weighted variance, etc., where applicable). The statistics
#     that are desired can be specified among a list supported by the StatsCollector
#     class or subclass. When some statistics are requested, others become automatically
#     available (e.g., sum or mean)."""
#
#     default_statistics = [mean,standard_deviation,min,max]
#    
#     __init__(self,n_quantities_observed, statistics=default_statistics):
#         self.n_quantities_observed=n_quantities_observed
#
#     clear(self):
#         raise NotImplementedError
#
#     update(self,observations):
#         """The observations argument is a numpy vector of length n_quantities_observed. Some
#         entries can be 'missing' (with a NaN entry) and will not be counted in the
#         statistics."""
#         raise NotImplementedError
#
#     __getattr__(self, statistic)
#         """Return a particular statistic, which may be inferred from the collected statistics.
#         The argument is a string naming that statistic."""