# HG changeset patch # User bengioy@bengiomac.local # Date 1206238919 14400 # Node ID 3fddb1c8f9552dfa9d5eaa978b839d0b3231f057 # Parent 2cd82666b9a7e428d05618a41cb4f1f3bc0b9483 Rewrote DataSet interface and created FiniteDataSet interface. diff -r 2cd82666b9a7 -r 3fddb1c8f955 dataset.py --- a/dataset.py Fri Mar 14 11:28:08 2008 -0400 +++ b/dataset.py Sat Mar 22 22:21:59 2008 -0400 @@ -1,53 +1,58 @@ class DataSet(object): - """Base class for representing a fixed-size or variable-size (online learning) - data set. A DataSet is used in a Learner to represent a training set or a - validation set. It is an indexed collection of examples. An example - is expected to obey the syntax of dictionaries, i.e., it contains named - fields that can be accessed via the [fieldname] syntax. - If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname - operator selects a named 'field' or column. However, each of the entries in one of - these 'columns' can be any python object, not just a number. One can also - use the slicing notation to select a subset of example and the getFields - method to select a subset of the fields.""" + """ + This is a virtual base class or interface for datasets. + A dataset is basically an iterator over examples. It does not necessarily + have a fixed length (this is useful for 'streams' which feed on-line learning). + Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet. + Examples and datasets have named fields. + One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...). + Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. + The content of a field can be of any type, but often will be a numpy tensor. 
+ """ - __init__(self): + def __init__(self): pass - size(self): - """Return -1 for variable-size DataSets (for on-line learning), and - the actual size otherwise""" - return 0 + def __iter__(self): + return self + + def next(self): + """Return the next example in the dataset.""" + raise NotImplementedError + + def __getattr__(self, fieldname): + """Return a sub-dataset containing only the given fieldname as field.""" + return self(fieldname) + + def __call__(self, *fieldnames): + """Return a sub-dataset containing only the given fieldnames as fields.""" + raise NotImplementedError - fieldNames(self): + def fieldNames(self): """Return the list of field names that are supported by getattr and getFields.""" raise NotImplementedError - __getitem__(self, i): - """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be - between 0 and size()-1. For on-line DataSets, the argument is ignored (and - should be -1 by convention to make it clear that it is not used), and - the next available example in the example stream is returned.""" - return self.get_slice(i) - +class FiniteDataSet(DataSet): + """ + Virtual interface, a subclass of DataSet for datasets which have a finite, known length. + Examples are indexed by an integer between 0 and len(self)-1, + and a subdataset can be obtained by slicing. 
+ """ - __getslice__(self,*args): - """Return a DataSet that is a subset of self, by specifying either - an interval of indices or list of indices, in the standard slicing notation.""" - return self.get_slice(slice(*args)) + def __init__(self): + pass - get_slice(self,slice_or_index): - """This method should be redefined to do the actual work of slicing / getting an element.""" + def __len__(self): + """len(dataset) returns the number of examples in the dataset.""" + raise NotImplementedError + + def __getitem__(self,i): + """dataset[i] returns the (i+1)-th example of the dataset.""" raise NotImplementedError - __getattr__(self, attribute): - """Return a DataSet that only contains the requested attribute from the examples.""" + def __getslice__(self,*slice_args): + """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" raise NotImplementedError - - getFields(self,fields): - """Return an DataSet that only sees the fields named in the argument.""" - raise NotImplementedError - -