Mercurial > pylearn
comparison dataset.py @ 2:3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
author | bengioy@bengiomac.local |
---|---|
date | Sat, 22 Mar 2008 22:21:59 -0400 |
parents | 2cd82666b9a7 |
children | 378b68d5c4ad |
comparison
equal
deleted
inserted
replaced
1:2cd82666b9a7 | 2:3fddb1c8f955 |
---|---|
1 | 1 |
2 | 2 |
3 class DataSet(object): | 3 class DataSet(object): |
4 """Base class for representing a fixed-size or variable-size (online learning) | 4 """ |
5 data set. A DataSet is used in a Learner to represent a training set or a | 5 This is a virtual base class or interface for datasets. |
6 validation set. It is an indexed collection of examples. An example | 6 A dataset is basically an iterator over examples. It does not necessarily |
7 is expected to obey the syntax of dictionaries, i.e., it contains named | 7 have a fixed length (this is useful for 'streams' which feed on-line learning). |
8 fields that can be accessed via the [fieldname] syntax. | 8 Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet. |
9 If one views a DataSet as a matrix, the [i] operator selects a row while the .fieldname | 9 Examples and datasets have named fields. |
10 operator selects a named 'field' or column. However, each of the entries in one of | 10 One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...). |
11 these 'columns' can be any python object, not just a number. One can also | 11 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. |
12 use the slicing notation to select a subset of examples and the getFields | 12 The content of a field can be of any type, but often will be a numpy tensor. |
13 method to select a subset of the fields.""" | 13 """ |
14 | 14 |
15 __init__(self): | 15 def __init__(self): |
16 pass | 16 pass |
17 | 17 |
18 size(self): | 18 def __iter__(): |
19 """Return -1 for variable-size DataSets (for on-line learning), and | 19 return self |
20 the actual size otherwise""" | 20 |
21 return 0 | 21 def next(): |
22 """Return the next example in the dataset.""" | |
23 raise NotImplementedError | |
24 | |
25 def __getattr__(fieldname): | |
26 """Return a sub-dataset containing only the given fieldname as field.""" | |
27 return self(fieldname) | |
28 | |
29 def __call__(*fieldnames): | |
30 """Return a sub-dataset containing only the given fieldnames as fields.""" | |
31 raise NotImplementedError | |
22 | 32 |
23 fieldNames(self): | 33 fieldNames(self): |
24 """Return the list of field names that are supported by getattr and getFields.""" | 34 """Return the list of field names that are supported by getattr and getFields.""" |
25 raise NotImplementedError | 35 raise NotImplementedError |
26 | 36 |
27 __getitem__(self, i): | 37 class FiniteDataSet(DataSet): |
28 """dataset[i] returns i-th example from DataSet. For fixed-size DataSets i should be | 38 """ |
29 between 0 and size()-1. For on-line DataSets, the argument is ignored (and | 39 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. |
30 should be -1 by convention to make it clear that it is not used), and | 40 Examples are indexed by an integer between 0 and self.length()-1, |
31 the next available example in the example stream is returned.""" | 41 and a subdataset can be obtained by slicing. |
32 return self.get_slice(i) | 42 """ |
33 | |
34 | 43 |
35 __getslice__(self,*args): | 44 def __init__(self): |
36 """Return a DataSet that is a subset of self, by specifying either | 45 pass |
37 an interval of indices or list of indices, in the standard slicing notation.""" | |
38 return self.get_slice(slice(*args)) | |
39 | 46 |
40 get_slice(self,slice_or_index): | 47 def __len__(self): |
41 """This method should be redefined to do the actual work of slicing / getting an element.""" | 48 """len(dataset) returns the number of examples in the dataset.""" |
49 raise NotImplementedError | |
50 | |
51 def __getitem__(self,i): | |
52 """dataset[i] returns the (i+1)-th example of the dataset.""" | |
42 raise NotImplementedError | 53 raise NotImplementedError |
43 | 54 |
44 __getattr__(self, attribute): | 55 def __getslice__(self,*slice_args): |
45 """Return a DataSet that only contains the requested attribute from the examples.""" | 56 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
46 raise NotImplementedError | 57 raise NotImplementedError |
47 | |
48 getFields(self,fields): | |
49 """Return a DataSet that only sees the fields named in the argument.""" | |
50 raise NotImplementedError | |
51 | |
52 | 58 |
53 |