Mercurial > pylearn
comparison dataset.py @ 12:ff4e551490f1
Added LookupList type in lookup_list.py and used it to keep order
of field names in Example in ArrayDataSet. Example is now just an alias: Example = LookupList.
author | bengioy@esprit.iro.umontreal.ca |
---|---|
date | Wed, 26 Mar 2008 18:21:57 -0400 |
parents | be128b9127c8 |
children | 759d17112b23 |
comparison
equal
deleted
inserted
replaced
11:be128b9127c8 | 12:ff4e551490f1 |
---|---|
1 | 1 |
2 class Example(object): | 2 from lookup_list import LookupList |
3 """ | 3 Example = LookupList |
4 An example is something that is like a tuple but whose elements can be named, so that | 4 |
5 following syntactic constructions work as one would expect: | |
6 example.x = [1, 2, 3] # set a field | |
7 x, y, z = example | |
8 x = example[0] | |
9 x = example["x"] | |
10 """ | |
11 def __init__(self,names,values): | |
12 assert len(values)==len(names) | |
13 self.__dict__['values']=values | |
14 self.__dict__['fields']={} | |
15 for i in xrange(len(values)): | |
16 self.fields[names[i]]=i | |
17 | |
18 def __getitem__(self,i): | |
19 if isinstance(i,int): | |
20 return self.values[i] | |
21 else: | |
22 return self.values[self.fields[i]] | |
23 | |
24 def __setitem__(self,i,value): | |
25 if isinstance(i,int): | |
26 self.values[i]=value | |
27 else: | |
28 self.values[self.fields[i]]=value | |
29 | |
30 def __getattr__(self,name): | |
31 return self.values[self.fields[name]] | |
32 | |
33 def __setattr__(self,name,value): | |
34 self.values[self.fields[name]]=value | |
35 | |
36 def __len__(self): | |
37 return len(self.values) | |
38 | |
39 | |
40 class DataSet(object): | 5 class DataSet(object): |
41 """ | 6 """ |
42 This is a virtual base class or interface for datasets. | 7 This is a virtual base class or interface for datasets. |
43 A dataset is basically an iterator over Examples (or anything that | 8 A dataset is basically an iterator over Examples (or anything that |
44 behaves like an Example). It does not necessarily | 9 behaves like an Example). It does not necessarily |
190 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 155 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
191 Any dataset can also be converted to a numpy array (losing the notion of fields) | 156 Any dataset can also be converted to a numpy array (losing the notion of fields) |
192 by the numpy.array(dataset) call. | 157 by the numpy.array(dataset) call. |
193 """ | 158 """ |
194 | 159 |
195 def __init__(self,dataset=None,data=None,fields={}): | 160 def __init__(self,dataset=None,data=None,fields=None): |
196 """ | 161 """ |
197 There are two ways to construct an ArrayDataSet: (1) from an | 162 There are two ways to construct an ArrayDataSet: (1) from an |
198 existing dataset (which may result in a copy of the data in a numpy array), | 163 existing dataset (which may result in a copy of the data in a numpy array), |
199 or (2) from a numpy.array (the data argument), along with an optional description | 164 or (2) from a numpy.array (the data argument), along with an optional description |
200 of the fields (dictionary of column slices indexed by field names). | 165 of the fields (a LookupList of column slices indexed by field names). |
201 """ | 166 """ |
202 if dataset!=None: | 167 if dataset!=None: |
203 assert data==None and fields=={} | 168 assert data==None and fields==None |
204 # Make ONE big minibatch with all the examples, to separate the fields. | 169 # Make ONE big minibatch with all the examples, to separate the fields. |
205 n_examples=len(dataset) | 170 n_examples=len(dataset) |
206 batch = dataset.minibatches(n_examples).next() | 171 batch = dataset.minibatches(n_examples).next() |
207 # Each field of the underlying dataset must be convertible to a numpy array of the same type | 172 # Each field of the underlying dataset must be convertible to a numpy array of the same type |
208 # currently just double, but should use the smallest compatible dtype | 173 # currently just double, but should use the smallest compatible dtype |
209 n_fields = len(batch) | 174 n_fields = len(batch) |
210 fieldnames = batch.fields.keys() | 175 fieldnames = batch.fields.keys() |
211 total_width = 0 | 176 total_width = 0 |
212 type = None | 177 type = None |
178 fields = LookupList() | |
213 for i in xrange(n_fields): | 179 for i in xrange(n_fields): |
214 field = array(batch[i]) | 180 field = array(batch[i]) |
215 assert field.shape[0]==n_examples | 181 assert field.shape[0]==n_examples |
216 width = field.shape[1] | 182 width = field.shape[1] |
217 start=total_width | 183 start=total_width |
225 if data!=None: | 191 if data!=None: |
226 assert dataset==None | 192 assert dataset==None |
227 self.data=data | 193 self.data=data |
228 self.fields=fields | 194 self.fields=fields |
229 self.width = data.shape[1] | 195 self.width = data.shape[1] |
230 for fieldname in fields: | 196 if fields: |
231 fieldslice=fields[fieldname] | 197 for fieldname,fieldslice in fields.items(): |
232 # make sure fieldslice.start and fieldslice.step are defined | 198 # make sure fieldslice.start and fieldslice.step are defined |
233 start=fieldslice.start | 199 start=fieldslice.start |
234 step=fieldslice.step | 200 step=fieldslice.step |
235 if not start: | 201 if not start: |
236 start=0 | 202 start=0 |
237 if not step: | 203 if not step: |
238 step=1 | 204 step=1 |
239 if not fieldslice.start or not fieldslice.step: | 205 if not fieldslice.start or not fieldslice.step: |
240 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 206 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
241 # and coherent with the data array | 207 # and coherent with the data array |
242 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 208 assert fieldslice.start>=0 and fieldslice.stop<=self.width |
243 | 209 |
244 def __getattr__(self,fieldname): | 210 def __getattr__(self,fieldname): |
245 """ | 211 """ |
246 Return a numpy array with the content associated with the given field name. | 212 Return a numpy array with the content associated with the given field name. |
247 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension | 213 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
256 min_col=self.data.shape[1] | 222 min_col=self.data.shape[1] |
257 max_col=0 | 223 max_col=0 |
258 for field_slice in self.fields.values(): | 224 for field_slice in self.fields.values(): |
259 min_col=min(min_col,field_slice.start) | 225 min_col=min(min_col,field_slice.start) |
260 max_col=max(max_col,field_slice.stop) | 226 max_col=max(max_col,field_slice.stop) |
261 new_fields={} | 227 new_fields=LookupList() |
262 for field in self.fields: | 228 for fieldname,fieldslice in self.fields.items(): |
263 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) | 229 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) |
264 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) | 230 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) |
265 | 231 |
266 def fieldNames(self): | 232 def fieldNames(self): |
267 """Return the list of field names that are supported by getattr and getFields.""" | 233 """Return the list of field names that are supported by getattr and getFields.""" |
268 return self.fields.keys() | 234 return self.fields.keys() |
276 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields | 242 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields |
277 the result is just a numpy array (for the i-th row of the dataset data matrix). | 243 the result is just a numpy array (for the i-th row of the dataset data matrix). |
278 """ | 244 """ |
279 if self.fields: | 245 if self.fields: |
280 fieldnames,fieldslices=zip(*self.fields.items()) | 246 fieldnames,fieldslices=zip(*self.fields.items()) |
281 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) | 247 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) |
282 else: | 248 else: |
283 return self.data[i] | 249 return self.data[i] |
284 | 250 |
285 def __getslice__(self,*slice_args): | 251 def __getslice__(self,*slice_args): |
286 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 252 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |