pylearn: comparison of dataset.py @ 11:be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
author | bengioy@esprit.iro.umontreal.ca
date | Wed, 26 Mar 2008 15:01:30 -0400
parents | de616c423dbd
children | ff4e551490f1 813723310d75
10:80bf5492e571 | 11:be128b9127c8
1 | |
2 class Example(object): | |
3 """ | |
4 An example is something that is like a tuple but whose elements can be named, so that | |
5 the following syntactic constructions work as one would expect: | |
6 example.x = [1, 2, 3] # set a field | |
7 x, y, z = example | |
8 x = example[0] | |
9 x = example["x"] | |
10 """ | |
11 def __init__(self,names,values): | |
12 assert len(values)==len(names) | |
13 self.__dict__['values']=values | |
14 self.__dict__['fields']={} | |
15 for i in xrange(len(values)): | |
16 self.fields[names[i]]=i | |
17 | |
18 def __getitem__(self,i): | |
19 if isinstance(i,int): | |
20 return self.values[i] | |
21 else: | |
22 return self.values[self.fields[i]] | |
23 | |
24 def __setitem__(self,i,value): | |
25 if isinstance(i,int): | |
26 self.values[i]=value | |
27 else: | |
28 self.values[self.fields[i]]=value | |
29 | |
30 def __getattr__(self,name): | |
31 return self.values[self.fields[name]] | |
32 | |
33 def __setattr__(self,name,value): | |
34 self.values[self.fields[name]]=value | |
35 | |
36 def __len__(self): | |
37 return len(self.values) | |
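A minimal usage sketch of the new Example class above (runnable as-is against this revision's Python 2 code):

    e = Example(['x', 'y'], [[1, 2, 3], 42])
    assert e[0] == e['x'] == e.x == [1, 2, 3]  # three equivalent reads of a field
    e.y = 43                                   # set a field by name
    x, y = e                                   # tuple-style unpacking via __getitem__/__len__
    assert y == 43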
1 | 38 |
2 | 39 |
3 class DataSet(object): | 40 class DataSet(object): |
4 """ | 41 """ |
5 This is a virtual base class or interface for datasets. | 42 This is a virtual base class or interface for datasets. |
6 A dataset is basically an iterator over examples. It does not necessarily | 43 A dataset is basically an iterator over Examples (or anything that |
44 behaves like an Example). It does not necessarily | |
7 have a fixed length (this is useful for 'streams' which feed on-line learning). | 45 have a fixed length (this is useful for 'streams' which feed on-line learning). |
8 Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet. | 46 Datasets with fixed and known length are instances of FiniteDataSet, a subclass of DataSet |
9 Examples and datasets optionally have named fields. | 47 which supports indexing (dataset[i]) and slicing (dataset[1000:2000]). |
10 One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...). | 48 To iterate over a subset of the fields, one should use the dataset.zip(field1, field2, field3, ...) |
49 method which returns an iterator over only the desired fields. | |
11 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. | 50 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. |
12 The content of a field can be of any type, but often will be a numpy array. | 51 The content of a field can be of any type, but often will be a numpy array. |
13 The minibatch_size attribute, if different than 1, means that the iterator (next() method) | 52 If one iterates through minibatches of examples (with the minibatches() method |
14 returns not a single example but an array of length minibatch_size, i.e., an indexable | 53 or with the minibatch_size argument of the zip() method), then the fields |
15 object with minibatch_size examples in it. | 54 returned by the iterator's next method should be iterators over the |
16 """ | 55 individual values within the minibatch (typically these will be arrays |
17 | 56 with minibatch_size rows). |
18 def __init__(self,minibatch_size=1): | 57 """ |
19 assert minibatch_size>0 | 58 |
20 self.minibatch_size=minibatch_size | 59 def __init__(self): |
21 | 60 pass |
61 | |
22 def __iter__(self): | 62 def __iter__(self): |
23 """ | 63 """ |
24 Return an iterator, whose next() method returns the next example or the next | 64 Return an iterator, whose next() method returns the next example or the next |
25 minibatch in the dataset. A minibatch (of length > 1) should be something one | 65 minibatch in the dataset. A minibatch (of length > 1) is also an example, but |
26 can iterate on again in order to obtain the individual examples. If the dataset | 66 one whose fields can themselves be iterated over in order to obtain |
27 has fields, then the example or the minibatch must have the same fields | 67 the individual examples. |
28 (typically this is implemented by returning another smaller dataset, when | 68 """ |
29 there are fields). | 69 raise NotImplementedError |
30 """ | 70 |
31 raise NotImplementedError | 71 def zip(self,*fieldnames): |
72 """ | |
73 Return an iterator which sees only the specified fields (each fieldname is a | |
74 field key, typically a string). The value returned at each iteration | |
75 is a tuple with one element per field. Hence it can be used like this: | |
76 for f1, f2, f3 in dataset.zip('field1','field2','field3'): | |
77 ... use f1, f2, and f3 | |
78 """ | |
79 raise NotImplementedError | |
80 | |
81 def minibatches(self,minibatch_size,*fieldnames): | |
82 """ | |
83 Similar to zip but iterates over minibatches. | |
84 Return a minibatch iterator, whose next() method returns an 'example' | |
85 whose fields are iterable objects (which can iterate over the individual | |
86 values of that field in the minibatch). | |
87 """ | |
88 raise NotImplementedError | |
89 | |
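The three access patterns above are meant to read as follows (sketch only: train_set stands for any concrete DataSet with fields 'input' and 'target', and use() is a placeholder):

    for example in train_set:                          # one Example at a time
        use(example.input, example.target)
    for x, y in train_set.zip('input', 'target'):      # only the named fields
        use(x, y)
    for xs, ys in train_set.minibatches(32, 'input', 'target'):
        use(xs, ys)  # xs and ys iterate over the 32 per-example values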
90 def fieldNames(self): | |
91 """Return the list of field names in the examples of this dataset.""" | |
92 raise NotImplementedError | |
93 | |
94 def rename(self,*new_field_specifications): | |
95 """ | |
96 Return a new dataset that maps old fields (of self) to new fields (of the returned | |
97 dataset). The minimal syntax that should be supported is the following: | |
98 new_field_specifications = [new_field_spec1, new_field_spec2, ...] | |
99 new_field_spec = ([old_field1, old_field2, ...], new_field) | |
100 In general both old_field and new_field should be strings, but some datasets may also | |
101 support additional indexing schemes within each field (e.g. column slice | |
102 of a matrix-like field). | |
103 """ | |
104 raise NotImplementedError | |
105 | |
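The minimal spec format from the rename docstring, written out as concrete data (rename is abstract at this point, so the final call is only a shape sketch):

    specs = [(['x0', 'x1'], 'inputs'), (['y'], 'target')]
    # intended call shape: renamed_dataset = dataset.rename(*specs)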
106 class FiniteDataSet(DataSet): | |
107 """ | |
108 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | |
109 Examples are indexed by an integer between 0 and self.length()-1, | |
110 and a subdataset can be obtained by slicing. This is appropriate only for | |
111 datasets which support efficient random access to both rows and fields. | |
112 Users are encouraged to rely only on the generic dataset | |
113 interface in general. A FiniteDataSet is mainly useful when one has to obtain | |
114 a subset of examples (e.g. for splitting a dataset into training and test sets). | |
115 """ | |
116 | |
117 def __init__(self): | |
118 pass | |
119 | |
120 def __iter__(self): | |
121 return FiniteDataSetIterator(self) | |
122 | |
123 def zip(self,*fieldnames): | |
124 return FiniteDataSetIterator(self,1,fieldnames) | |
125 | |
126 def minibatches(self,minibatch_size,*fieldnames): | |
127 return FiniteDataSetIterator(self,minibatch_size,fieldnames) | |
32 | 128 |
33 def __getattr__(self,fieldname): | 129 def __getattr__(self,fieldname): |
34 """Return a sub-dataset containing only the given fieldname as field.""" | 130 """Return an that can iterate over the values of the field in this dataset.""" |
35 return self(fieldname) | 131 return self(fieldname) |
36 | 132 |
37 def __call__(self,*fieldnames): | 133 def __call__(self,*fieldnames): |
38 """Return a sub-dataset containing only the given fieldnames as fields.""" | 134 """Return a sub-dataset containing only the given fieldnames as fields.""" |
39 raise NotImplementedError | 135 raise NotImplementedError |
40 | 136 |
41 def fieldNames(self): | |
42 """Return the list of field names that are supported by getattr and getFields.""" | |
43 raise NotImplementedError | |
44 | |
45 class FiniteDataSet(DataSet): | |
46 """ | |
47 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | |
48 Examples are indexed by an integer between 0 and self.length()-1, | |
49 and a subdataset can be obtained by slicing. | |
50 """ | |
51 | |
52 def __init__(self,minibatch_size): | |
53 DataSet.__init__(self,minibatch_size) | |
54 | |
55 def __iter__(self): | |
56 return FiniteDataSetIterator(self) | |
57 | |
58 def __len__(self): | 137 def __len__(self): |
59 """len(dataset) returns the number of examples in the dataset.""" | 138 """len(dataset) returns the number of examples in the dataset.""" |
60 raise NotImplementedError | 139 raise NotImplementedError |
61 | 140 |
62 def __getitem__(self,i): | 141 def __getitem__(self,i): |
66 def __getslice__(self,*slice_args): | 145 def __getslice__(self,*slice_args): |
67 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 146 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
68 raise NotImplementedError | 147 raise NotImplementedError |
69 | 148 |
70 class FiniteDataSetIterator(object): | 149 class FiniteDataSetIterator(object): |
71 def __init__(self,dataset): | 150 """ |
151 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
152 """ | |
153 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
72 self.dataset=dataset | 154 self.dataset=dataset |
73 self.current = -self.dataset.minibatch_size | 155 self.minibatch_size=minibatch_size |
74 | 156 assert minibatch_size>=1 and minibatch_size<=len(dataset) |
157 self.current = -self.minibatch_size | |
158 self.fieldnames = fieldnames | |
159 | |
160 def __iter__(self): | |
161 return self | |
162 | |
75 def next(self): | 163 def next(self): |
76 """ | 164 self.current+=self.minibatch_size |
77 Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that | |
78 many examples. If the dataset has fields, the example or the minibatch of examples | |
79 is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), | |
80 but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate | |
81 example-wise on it. On the other hand, if the dataset has no fields (e.g. because | |
82 it is already the field of a bigger dataset), then the returned example or minibatch | |
83 may be any indexable object, such as a numpy array. Following the array semantics of indexing | |
84 and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array | |
85 with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding | |
86 to a row. Again, if the minibatch_size is >1, one can iterate on the result to | |
87 obtain individual examples (as rows). | |
88 """ | |
89 self.current+=self.dataset.minibatch_size | |
90 if self.current>=len(self.dataset): | 165 if self.current>=len(self.dataset): |
91 self.current=-self.dataset.minibatch_size | 166 self.current=-self.minibatch_size |
92 raise StopIteration | 167 raise StopIteration |
93 if self.dataset.minibatch_size==1: | 168 if self.minibatch_size==1: |
94 return self.dataset[self.current] | 169 complete_example=self.dataset[self.current] |
95 else: | 170 else: |
96 return self.dataset[self.current:self.current+self.dataset.minibatch_size] | 171 complete_example=self.dataset[self.current:self.current+self.minibatch_size] |
172 if self.fieldnames: | |
173 return Example(self.fieldnames,[getattr(complete_example,name) for name in self.fieldnames]) | |
174 else: | |
175 return complete_example | |
97 | 176 |
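One consequence of resetting self.current before raising StopIteration is that the same iterator object can be re-used for a fresh pass (finite_set below stands for any concrete FiniteDataSet; sketch):

    it = iter(finite_set)          # a FiniteDataSetIterator
    first = [e for e in it]        # consumes the iterator; current is reset
    second = [e for e in it]       # a second full pass over the same data
    assert len(first) == len(second) == len(finite_set)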
98 | 177 |
99 # we may want ArrayDataSet defined in another python file | 178 # we may want ArrayDataSet defined in another python file |
100 | 179 |
101 import numpy | 180 import numpy |
102 | 181 |
103 class ArrayDataSet(FiniteDataSet): | 182 class ArrayDataSet(FiniteDataSet): |
104 """ | 183 """ |
105 An ArrayDataSet behaves like a numpy array but adds the notion of fields | 184 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
106 and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset | 185 from DataSet (and the ability to view multiple field values as an 'Example'). |
186 It is a fixed-length and fixed-width dataset | |
107 in which each element is a numpy array or a number, hence the whole | 187 in which each element is a numpy array or a number, hence the whole |
108 dataset corresponds to a numpy array. Fields | 188 dataset corresponds to a numpy array. Fields |
109 must correspond to a slice of array columns. If the dataset has fields, | 189 must correspond to a slice of array columns. If the dataset has fields, |
110 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 190 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
111 Any dataset can also be converted to a numpy array (losing the notion of fields | 191 Any dataset can also be converted to a numpy array (losing the notion of fields |
112 and of minibatch_size) by the numpy.array(dataset) call. | 192 ) by the numpy.array(dataset) call. |
113 """ | 193 """ |
114 | 194 |
115 def __init__(self,dataset=None,data=None,fields={},minibatch_size=1): | 195 def __init__(self,dataset=None,data=None,fields={}): |
116 """ | 196 """ |
117 There are two ways to construct an ArrayDataSet: (1) from an | 197 There are two ways to construct an ArrayDataSet: (1) from an |
118 existing dataset (which may result in a copy of the data in a numpy array), | 198 existing dataset (which may result in a copy of the data in a numpy array), |
119 or (2) from a numpy.array (the data argument), along with an optional description | 199 or (2) from a numpy.array (the data argument), along with an optional description |
120 of the fields (dictionary of column slices indexed by field names). | 200 of the fields (dictionary of column slices indexed by field names). |
121 """ | 201 """ |
122 FiniteDataSet.__init__(self,minibatch_size) | |
123 if dataset is not None: | 202 if dataset is not None: |
124 assert data is None and fields=={} | 203 assert data is None and fields=={} |
125 # convert dataset to an ArrayDataSet | 204 # Make ONE big minibatch with all the examples, to separate the fields. |
205 n_examples=len(dataset) | |
206 batch = dataset.minibatches(n_examples).next() | |
207 # Each field of the underlying dataset must be convertible to a numpy array of the same type | |
208 # currently just double, but should use the smallest compatible dtype | |
209 n_fields = len(batch) | |
210 fieldnames = batch.fields.keys() | |
211 total_width = 0 | |
212 type = None | |
213 for i in xrange(n_fields): | |
214 field = numpy.array(batch[i]) | |
215 assert field.shape[0]==n_examples | |
216 width = field.shape[1] | |
217 start=total_width | |
218 total_width += width | |
219 fields[fieldnames[i]]=slice(start,total_width,1) | |
220 # many complicated things remain to be done: | |
221 # - find common dtype | |
222 # - decide what to do with extra dimensions if not the same in all fields | |
223 # - try to see if we can avoid the copy? | |
126 raise NotImplementedError | 224 raise NotImplementedError |
127 if data is not None: | 225 if data is not None: |
128 assert dataset is None | 226 assert dataset is None |
129 self.data=data | 227 self.data=data |
130 self.fields=fields | 228 self.fields=fields |
140 step=1 | 238 step=1 |
141 if not fieldslice.start or not fieldslice.step: | 239 if not fieldslice.start or not fieldslice.step: |
142 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 240 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
143 # and coherent with the data array | 241 # and coherent with the data array |
144 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 242 assert fieldslice.start>=0 and fieldslice.stop<=self.width |
145 assert minibatch_size<=len(self.data) | |
146 | 243 |
147 def __getattr__(self,fieldname): | 244 def __getattr__(self,fieldname): |
148 """ | 245 """ |
149 Return a numpy array with the content associated with the given field name. | 246 Return a numpy array with the content associated with the given field name. |
150 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension | 247 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
151 than the dataset.data) is returned. | 248 than the dataset itself) is returned. |
152 """ | 249 """ |
153 if len(self.data)==1: | 250 if len(self.data)==1: |
154 return self.data[0,self.fields[fieldname]] | 251 return self.data[0,self.fields[fieldname]] |
155 return self.data[:,self.fields[fieldname]] | 252 return self.data[:,self.fields[fieldname]] |
156 | 253 |
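A construction sketch for path (2) above, from a numpy array plus a dictionary of column slices (this assumes Python 2 and 2008-era numpy, and that the elided slice-normalization lines only fill in missing start/step defaults):

    data = numpy.array([[0., 1., 2.],
                        [3., 4., 5.]])
    d = ArrayDataSet(data=data,
                     fields={'input': slice(0, 2, 1), 'target': slice(2, 3, 1)})
    print d.input      # the two 'input' columns, a 2x2 array
    print d[1].target  # field access on a single example: array([ 5.])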
162 min_col=min(min_col,field_slice.start) | 259 min_col=min(min_col,field_slice.start) |
163 max_col=max(max_col,field_slice.stop) | 260 max_col=max(max_col,field_slice.stop) |
164 new_fields={} | 261 new_fields={} |
165 for fieldname,field_slice in self.fields.items(): | 262 for fieldname,field_slice in self.fields.items(): |
166 new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step) | 263 new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step) |
167 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size) | 264 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) |
168 | 265 |
169 def fieldNames(self): | 266 def fieldNames(self): |
170 """Return the list of field names that are supported by getattr and getFields.""" | 267 """Return the list of field names that are supported by getattr and getFields.""" |
171 return self.fields.keys() | 268 return self.fields.keys() |
172 | 269 |
174 """len(dataset) returns the number of examples in the dataset.""" | 271 """len(dataset) returns the number of examples in the dataset.""" |
175 return len(self.data) | 272 return len(self.data) |
176 | 273 |
177 def __getitem__(self,i): | 274 def __getitem__(self,i): |
178 """ | 275 """ |
179 dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields | 276 dataset[i] returns the i-th example (counting from 0) as an Example object. If there are |
180 then a one-example dataset is returned (to be able to handle example.field accesses). | 277 no fields, the result is just a numpy array (row i of the dataset's data matrix). |
181 """ | 278 """ |
182 if self.fields: | 279 if self.fields: |
183 if isinstance(i,slice): | 280 fieldnames,fieldslices=zip(*self.fields.items()) |
184 return ArrayDataSet(data=data[slice],fields=self.fields) | 281 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) |
185 return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) | |
186 else: | 282 else: |
187 return self.data[i] | 283 return self.data[i] |
188 | 284 |
189 def __getslice__(self,*slice_args): | 285 def __getslice__(self,*slice_args): |
190 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 286 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
230 slice_width=(field_slice.stop-field_slice.start)/field_slice.step | 326 slice_width=(field_slice.stop-field_slice.start)/field_slice.step |
231 # copy the field here | 327 # copy the field here |
232 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] | 328 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] |
233 c+=slice_width | 329 c+=slice_width |
234 return result | 330 return result |
331 | |
332 | |