view dataset.py @ 1:2cd82666b9a7

Added statscollector and started writing dataset and learner.
author bengioy@esprit.iro.umontreal.ca
date Fri, 14 Mar 2008 11:28:08 -0400
parents 586dcaa4b2df
children 3fddb1c8f955
line wrap: on
line source


    
class DataSet(object):
    """Base class for a fixed-size or variable-size (online learning) data set.

    A DataSet is used in a Learner to represent a training set or a
    validation set. It is an indexed collection of examples. An example
    is expected to obey the syntax of dictionaries, i.e., it contains named
    fields that can be accessed via the [fieldname] syntax.

    If one views a DataSet as a matrix, the [i] operator selects a row while
    the .fieldname operator selects a named 'field' or column. However, each
    of the entries in one of these 'columns' can be any python object, not
    just a number. One can also use the slicing notation to select a subset
    of examples and the getFields method to select a subset of the fields.

    This base class only defines the interface: indexing and slicing are
    both routed through get_slice, which subclasses must redefine together
    with fieldNames, __getattr__ and getFields.
    """

    def __init__(self):
        pass

    def size(self):
        """Return the number of examples, or -1 for variable-size DataSets.

        Variable-size DataSets are the ones used for on-line learning.
        This base implementation returns 0; subclasses are expected to
        override it.
        """
        return 0

    def fieldNames(self):
        """Return the list of field names supported by getattr and getFields.

        Raises NotImplementedError in this base class.
        """
        raise NotImplementedError

    def __getitem__(self, i):
        """Return the i-th example of the DataSet (dataset[i]).

        For fixed-size DataSets i should be between 0 and size()-1. For
        on-line DataSets, the argument is ignored (and should be -1 by
        convention to make it clear that it is not used), and the next
        available example in the example stream is returned.

        The argument is forwarded unchanged to get_slice, so a slice object
        passed through the [] operator reaches get_slice as well.
        """
        return self.get_slice(i)

    def __getslice__(self, *args):
        """Return a DataSet that is a subset of self.

        The subset is specified as an interval of indices in the standard
        slicing notation; the bounds are packed into a slice object and
        handed to get_slice.
        """
        return self.get_slice(slice(*args))

    def get_slice(self, slice_or_index):
        """Do the actual work of slicing / getting an element.

        This method should be redefined by subclasses; it receives either
        an index or a slice object. Raises NotImplementedError here.
        """
        raise NotImplementedError

    def __getattr__(self, attribute):
        """Return a DataSet that only contains the requested attribute.

        Only invoked when normal attribute lookup fails. Raises
        NotImplementedError in this base class for field-style names, and
        AttributeError for special (double-underscore) names.
        """
        # Special-method probes (hasattr, copy, pickle, ...) rely on
        # AttributeError to detect that an optional hook is absent; any
        # other exception type would escape and break those protocols.
        if attribute.startswith("__") and attribute.endswith("__"):
            raise AttributeError(attribute)
        raise NotImplementedError

    def getFields(self, fields):
        """Return a DataSet that only sees the fields named in the argument.

        Raises NotImplementedError in this base class.
        """
        raise NotImplementedError