comparison dataset.py @ 2:3fddb1c8f955

Rewrote DataSet interface and created FiniteDataSet interface.
author bengioy@bengiomac.local
date Sat, 22 Mar 2008 22:21:59 -0400
parents 2cd82666b9a7
children 378b68d5c4ad
comparison
equal deleted inserted replaced
1:2cd82666b9a7 2:3fddb1c8f955
1 1
2 2
3 class DataSet(object): 3 class DataSet(object):
4 """Base class for representing a fixed-size or variable-size (online learning) 4 """
5 data set. A DataSet is used in a Learner to represent a training set or a 5 This is a virtual base class or interface for datasets.
6 validation set. It is an indexed collection of examples. An example 6 A dataset is basically an iterator over examples. It does not necessarily
7 is expected to obey the syntax of dictionaries, i.e., it contains named 7 have a fixed length (this is useful for 'streams' which feed on-line learning).
8 fields that can be accessed via the [fieldname] syntax. 8 Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
9 If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname 9 Examples and datasets have named fields.
10 operator selects a named 'field' or column. However, each of the entries in one of 10 One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
11 these 'columns' can be any python object, not just a number. One can also 11 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
12 use the slicing notation to select a subset of examples and the getFields 12 The content of a field can be of any type, but often will be a numpy tensor.
13 method to select a subset of the fields.""" 13 """
14 14
15 __init__(self): 15 def __init__(self):
16 pass 16 pass
17 17
18 size(self): 18 def __iter__():
19 """Return -1 for variable-size DataSets (for on-line learning), and 19 return self
20 the actual size otherwise""" 20
21 return 0 21 def next():
22 """Return the next example in the dataset."""
23 raise NotImplementedError
24
25 def __getattr__(fieldname):
26 """Return a sub-dataset containing only the given fieldname as field."""
27 return self(fieldname)
28
29 def __call__(*fieldnames):
30 """Return a sub-dataset containing only the given fieldnames as fields."""
31 raise NotImplementedError
22 32
23 fieldNames(self): 33 fieldNames(self):
24 """Return the list of field names that are supported by getattr and getFields.""" 34 """Return the list of field names that are supported by getattr and getFields."""
25 raise NotImplementedError 35 raise NotImplementedError
26 36
27 __getitem__(self, i): 37 class FiniteDataSet(DataSet):
28 """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be 38 """
29 between 0 and size()-1. For on-line DataSets, the argument is ignored (and 39 Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
30 should be -1 by convention to make it clear that it is not used), and 40 Examples are indexed by an integer between 0 and len(self)-1,
31 the next available example in the example stream is returned.""" 41 and a subdataset can be obtained by slicing.
32 return self.get_slice(i) 42 """
33
34 43
35 __getslice__(self,*args): 44 def __init__(self):
36 """Return a DataSet that is a subset of self, by specifying either 45 pass
37 an interval of indices or list of indices, in the standard slicing notation."""
38 return self.get_slice(slice(*args))
39 46
40 get_slice(self,slice_or_index): 47 def __len__(self):
41 """This method should be redefined to do the actual work of slicing / getting an element.""" 48 """len(dataset) returns the number of examples in the dataset."""
49 raise NotImplementedError
50
51 def __getitem__(self,i):
52 """dataset[i] returns the (i+1)-th example of the dataset."""
42 raise NotImplementedError 53 raise NotImplementedError
43 54
44 __getattr__(self, attribute): 55 def __getslice__(self,*slice_args):
45 """Return a DataSet that only contains the requested attribute from the examples.""" 56 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
46 raise NotImplementedError 57 raise NotImplementedError
47
48 getFields(self,fields):
49 """Return a DataSet that only sees the fields named in the argument."""
50 raise NotImplementedError
51
52 58
53