comparison dataset.py @ 17:759d17112b23

more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 21:05:14 -0400
parents 813723310d75 ff4e551490f1
children 57f4015e2e09
comparison: 16:813723310d75 (old) vs 17:759d17112b23 (new)
unchanged lines show both line numbers (old, then new); "removed" blocks exist
only in the old revision, "added" blocks only in the new; gaps in the line
numbering are unchanged lines folded out of view
1 1
removed (old lines 2-39):
2 class Example(object):
3 """
4 An example is something that is like a tuple but whose elements can be named, so that
5 the following syntactic constructions work as one would expect:
6 example.x = [1, 2, 3] # set a field
7 x, y, z = example
8 x = example[0]
9 x = example["x"]
10 """
11 def __init__(self,names,values):
12 assert len(values)==len(names)
13 self.__dict__['values']=values
14 self.__dict__['fields']={}
15 for i in xrange(len(values)):
16 self.fields[names[i]]=i
17
18 def __getitem__(self,i):
19 if isinstance(i,int):
20 return self.values[i]
21 else:
22 return self.values[self.fields[i]]
23
24 def __setitem__(self,i,value):
25 if isinstance(i,int):
26 self.values[i]=value
27 else:
28 self.values[self.fields[i]]=value
29
30 def __getattr__(self,name):
31 return self.values[self.fields[name]]
32
33 def __setattr__(self,name,value):
34 self.values[self.fields[name]]=value
35
36 def __len__(self):
37 return len(self.values)
38
39
added (new lines 2-6):
2 from lookup_list import LookupList
3 Example = LookupList
4
5 class AbstractFunction (Exception): """Derived class must override this function"""
6
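To make the removed docstring's semantics concrete, here is a usage sketch
(hypothetical values; it assumes LookupList preserves the old Example
behaviour):

    e = Example(['x', 'y'], [[1, 2, 3], 42])
    assert e[0] == e['x'] == e.x    # index by position or by name
    x, y = e                        # unpacks like a tuple
    e.y = 43                        # set a field by name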
40 7 class DataSet(object):
41 8 """A virtual base class for datasets.
42 9
43 10 A DataSet is a generator of iterators; these iterators can run through the
44 11 examples in a variety of ways. A DataSet need not necessarily have a finite
71 38 all the fields of DataSet self. Every field of "i" will give access to
72 39 the field of a single example. Fields should be accessible via
73 40 i[identifier], but the derived class is free to accept any type of
74 41 identifier, and add extra functionality to the iterator.
75 42 """
removed: 76 raise NotImplementedError
added (new lines 43-44):
43 for i in self.minibatches( minibatch_size = 1):
44 yield Example(i.keys(), [v[0] for v in i.values()])
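With this default iterator in place, any DataSet can be walked example by
example; a sketch, assuming the dataset has fields named 'x' and 'y':

    for example in dataset:
        print example['x'], example['y']    # one Example per iteration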
77 45
78 46 def zip(self, *fieldnames):
79 47 """
80 48 Supports two forms of syntax:
81 49
91 59 f1, f2, and f3 fields of a single example on each loop iteration.
92 60
93 61 The derived class may accept fieldname arguments of any type.
94 62
95 63 """
removed (old lines 96-98):
96 raise NotImplementedError
97
98 def minibatches(self,minibatch_size,*fieldnames):
added (new lines 64-73):
64 for i in self.minibatches(fieldnames, minibatch_size = 1):
65 yield [f[0] for f in i]
66
67 minibatches_fieldnames = None
68 minibatches_minibatch_size = 1
69 minibatches_n_batches = None
70 def minibatches(self,
71 fieldnames = minibatches_fieldnames,
72 minibatch_size = minibatches_minibatch_size,
73 n_batches = minibatches_n_batches):
99 """ 74 """
100 Supports two forms of syntax: 75 Supports two forms of syntax:
101 76
102 for i in dataset.zip(f1, f2, f3): ... 77 for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
103 78
104 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... 79 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
105 80
106 Using the first syntax, "i" will be an indexable object, such as a list, 81 Using the first syntax, "i" will be an indexable object, such as a list,
107 tuple, or Example instance, such that on every iteration, i[0] is the f1 82 tuple, or Example instance, such that on every iteration, i[0] is a
108 field of the current example, i[1] is the f2 field, and so on. 83 list-like container of the f1 field of a batch current examples, i[1] is
109 84 a list-like container of the f2 field, etc.
110 Using the second syntax, i1, i2, i3 will contain the the contents of the 85
111 f1, f2, and f3 fields of a single example on each loop iteration. 86 Using the second syntax, i1, i2, i3 will be list-like containers of the
112 87 f1, f2, and f3 fields of a batch of examples on each loop iteration.
113 The derived class may accept fieldname arguments of any type. 88
114 89 PARAMETERS
115 Return an iterator, whose next() method returns the next example or the next 90 - fieldnames (list of any type, default None):
116 minibatch in the dataset. A minibatch (of length > 1) is also an example, but 91 The loop variables i1, i2, i3 (in the example above) should contain the
117 whose fields should be something one can iterate on again in order to obtain 92 f1, f2, and f3 fields of the current batch of examples. If None, the
118 the individual examples. 93 derived class can choose a default, e.g. all fields.
119 94
120 DataSet.zip returns an iterator over only the desired fields, and each field 95 - minibatch_size (integer, default 1)
121 of the iterator contains one example. 96 On every iteration, the variables i1, i2, i3 will have
122 97 exactly minibatch_size elements. e.g. len(i1) == minibatch_size
123 Return an iterator which sees only the specified fields (each fieldname is a 98
124 field key, typically a string). The value returned at each iteration 99 - n_batches (integer, default None)
125 is a tuple with one element per field. Hence it can be used like this: 100 The iterator will loop exactly this many times, and then stop. If None,
126 for f1, f2, f3 in dataset.zip('field1','field2','field3'): 101 the derived class can choose a default. If (-1), then the returned
127 ... use f1, f2, and f3 102 iterator should support looping indefinitely.
128 If one iterates through minibatches of examples (with the minibatches() method 103
129 or with the minibatch_size argument of the zip() method), then the fields 104 Note: A list-like container is something like a tuple, list, numpy.ndarray or
130 returned by the iterator's next method should be iterators over the 105 any other object that supports integer indexing and slicing.
131 individual values within the minibatch (typically these will be arrays 106
132 with minibatch_size rows). 107 """
133 Similar to zip but iterate over minibatches. 108 raise AbstractFunction()
134 Return a minibatch iterator, whose next() method returns an 'example'
135 whose fields are iteratable objects (which can iterate over the individual
136 values of that field in the minibatch).
137 """
138 raise NotImplementedError
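To illustrate the contract above (a sketch; 'input' and 'target' are
hypothetical field names and the sizes are arbitrary):

    # inputs and targets are each list-like with exactly 16 elements
    for inputs, targets in dataset.minibatches(['input', 'target'],
                                               minibatch_size = 16,
                                               n_batches = 100):
        process(inputs, targets)    # process() is a hypothetical helper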
139 109
140 110 def fieldNames(self):
added (new lines 111-114):
111 #Yoshua-
112 # This list may not be finite; what would make sense in the use you have
113 # in mind?
114 # -JB
141 """Return the list of field names in the examples of this dataset.""" 115 """Return the list of field names in the examples of this dataset."""
142 raise NotImplementedError 116 raise AbstractFunction()
143 117
144 118 def rename(*new_field_specifications):
added (new lines 119-123):
119 #Yoshua-
120 # Do you mean for this to be a virtual method?
121 # Wouldn't this functionality be easier to provide via a
122 # RenamingDataSet, such as the one I've written below?
123 # -JB
145 """ 124 """
146 Return a new dataset that maps old fields (of self) to new fields (of the returned 125 Return a new dataset that maps old fields (of self) to new fields (of the returned
147 dataset). The minimal syntax that should be supported is the following: 126 dataset). The minimal syntax that should be supported is the following:
148 new_field_specifications = [new_field_spec1, new_field_spec2, ...] 127 new_field_specifications = [new_field_spec1, new_field_spec2, ...]
149 new_field_spec = ([old_field1, old_field2, ...], new_field) 128 new_field_spec = ([old_field1, old_field2, ...], new_field)
150 In general both old_field and new_field should be strings, but some datasets may also 129 In general both old_field and new_field should be strings, but some datasets may also
151 support additional indexing schemes within each field (e.g. column slice 130 support additional indexing schemes within each field (e.g. column slice
152 of a matrix-like field). 131 of a matrix-like field).
153 """ 132 """
removed: 154 raise NotImplementedError
added: 133 raise AbstractFunction()
added (new lines 134-157):
134
135 class RenamingDataSet(DataSet):
136 """A DataSet that wraps another one, and makes it look like the field names
137 are different
138
139 Renaming is done by a dictionary that maps new names to the old ones used in
140 self.src.
141 """
142 def __init__(self, src, rename_dct):
143 DataSet.__init__(self)
144 self.src = src
145 self.rename_dct = dict(rename_dct) # shallow copy; avoids the copy module, which is not imported in this revision
146
147 def minibatches(self,
148 fieldnames = DataSet.minibatches_fieldnames,
149 minibatch_size = DataSet.minibatches_minibatch_size,
150 n_batches = DataSet.minibatches_n_batches):
151 dct = self.rename_dct
152 new_fieldnames = [dct.get(f, f) for f in fieldnames]
153 return self.src.minibatches(new_fieldnames, minibatch_size, n_batches)
154
155 def fieldNames(self):
156 return [self.rename_dct.get(f, f) for f in self.src.fieldNames()]
157
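A usage sketch for RenamingDataSet, with hypothetical field names: the
dictionary maps each new name to the old name used in the wrapped dataset.

    # expose base_dataset's 'x' and 'y' fields as 'input' and 'target'
    renamed = RenamingDataSet(base_dataset, {'input': 'x', 'target': 'y'})
    for inputs, targets in renamed.minibatches(['input', 'target']):
        pass    # served from base_dataset's 'x' and 'y' fields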
155 158
156 159 class FiniteDataSet(DataSet):
157 160 """
158 161 Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
159 162 Examples are indexed by an integer between 0 and self.length()-1,
162 165 in an efficient random access way. Users are encouraged to expect only the generic dataset
163 166 interface in general. A FiniteDataSet is mainly useful when one has to obtain
164 167 a subset of examples (e.g. for splitting a dataset into training and test sets).
165 168 """
166 169
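For example, the train/test split mentioned above reduces to slicing (a
sketch; the 80/20 proportion is arbitrary):

    n_train = int(0.8 * len(dataset))
    train_set, test_set = dataset[:n_train], dataset[n_train:]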
added (new lines 170-197):
170 class FiniteDataSetIterator(object):
171 """
172 If the fieldnames list is empty, it means that we want to see ALL the fields.
173 """
174 def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
175 self.dataset=dataset
176 self.minibatch_size=minibatch_size
177 assert minibatch_size>=1 and minibatch_size<=len(dataset)
178 self.current = -self.minibatch_size
179 self.fieldnames = fieldnames
180
181 def __iter__(self):
182 return self
183
184 def next(self):
185 self.current+=self.minibatch_size
186 if self.current>=len(self.dataset):
187 self.current=-self.minibatch_size
188 raise StopIteration
189 if self.minibatch_size==1:
190 complete_example=self.dataset[self.current]
191 else:
192 complete_example=self.dataset[self.current:self.current+self.minibatch_size]
193 if self.fieldnames:
194 return Example(self.fieldnames,list(complete_example))
195 else:
196 return complete_example
197
167 198 def __init__(self):
168 199 pass
169 200
removed (old lines 170-177):
170 def __iter__(self):
171 return FiniteDataSetIterator(self)
172
173 def zip(self,*fieldnames):
174 return FiniteDataSetIterator(self,1,fieldnames)
175
176 def minibatches(self,minibatch_size,*fieldnames):
177 return FiniteDataSetIterator(self,minibatch_size,fieldnames)
added (new lines 201-214):
201 def minibatches(self,
202 fieldnames = DataSet.minibatches_fieldnames,
203 minibatch_size = DataSet.minibatches_minibatch_size,
204 n_batches = DataSet.minibatches_n_batches):
205 """
206 If fieldnames is None, it means that we want to see ALL the fields.
207
208 If n_batches is None, we want to see all the examples possible
209 for the given minibatch_size.
210 """
211 # substitute the defaults:
212 if fieldnames is None: fieldnames = self.fieldNames()
213 if n_batches is None: n_batches = len(self) / minibatch_size
214 return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
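As a worked example of these defaults: a 100-example dataset iterated with
minibatch_size = 10 and n_batches = None yields 100 / 10 = 10 batches, each
covering all fields.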
178 215
179 216 def __getattr__(self,fieldname):
180 217 """Return an iterator that can iterate over the values of the field in this dataset."""
181 218 return self(fieldname)
182 219
184 """Return a sub-dataset containing only the given fieldnames as fields. 221 """Return a sub-dataset containing only the given fieldnames as fields.
185 222
186 The return value's default iterator will iterate only over the given 223 The return value's default iterator will iterate only over the given
187 fields. 224 fields.
188 """ 225 """
189 raise NotImplementedError 226 raise AbstractFunction()
190 227
191 228 def __len__(self):
192 229 """len(dataset) returns the number of examples in the dataset."""
removed: 193 raise NotImplementedError
added: 230 raise AbstractFunction()
194 231
195 232 def __getitem__(self,i):
196 233 """dataset[i] returns the (i+1)-th example of the dataset."""
removed: 197 raise NotImplementedError
added: 234 raise AbstractFunction()
198 235
199 236 def __getslice__(self,*slice_args):
200 237 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
removed: 201 raise NotImplementedError
added: 238 raise AbstractFunction()
removed (old lines 202-230):
202
203 class FiniteDataSetIterator(object):
204 """
205 If the fieldnames list is empty, it means that we want to see ALL the fields.
206 """
207 def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
208 self.dataset=dataset
209 self.minibatch_size=minibatch_size
210 assert minibatch_size>=1 and minibatch_size<=len(dataset)
211 self.current = -self.minibatch_size
212 self.fieldnames = fieldnames
213
214 def __iter__(self):
215 return self
216
217 def next(self):
218 self.current+=self.minibatch_size
219 if self.current>=len(self.dataset):
220 self.current=-self.minibatch_size
221 raise StopIteration
222 if self.minibatch_size==1:
223 complete_example=self.dataset[self.current]
224 else:
225 complete_example=self.dataset[self.current:self.current+self.minibatch_size]
226 if self.fieldnames:
227 return Example(self.fieldnames,list(complete_example))
228 else:
229 return complete_example
230
231 239
232 240 # we may want ArrayDataSet defined in another python file
233 241
234 242 import numpy
added (new lines 243-275):
243
244 def as_array_dataset(dataset):
245 # Generally datasets can be efficient by making data fields overlap, but
246 # this function doesn't know which fields overlap. So, it should check if
247 # dataset supports an as_array_dataset member function, and return that if
248 # possible.
249 if hasattr(dataset, 'as_array_dataset'):
250 return dataset.as_array_dataset()
251
252 raise NotImplementedError()
253
254 # Make ONE big minibatch with all the examples, to separate the fields.
255 n_examples = len(dataset)
256 batch = dataset.minibatches( minibatch_size = len(dataset)).next()
257
258 # Each field of the underlying dataset must be convertible to a numpy array of the same type
259 # currently just double, but should use the smallest compatible dtype
260 n_fields = len(batch)
261 fieldnames = batch.fields.keys()
262 total_width = 0
263 type = None
264 fields = LookupList()
265 for i in xrange(n_fields):
266 field = numpy.array(batch[i])
267 assert field.shape[0]==n_examples
268 width = field.shape[1]
269 start=total_width
270 total_width += width
271 fields[fieldnames[i]]=slice(start,total_width,1)
272 # many complicated things remain to be done:
273 # - find common dtype
274 # - decide what to do with extra dimensions if not the same in all fields
275 # - try to see if we can avoid the copy?
235 276
236 277 class ArrayDataSet(FiniteDataSet):
237 278 """
238 279 An ArrayDataSet behaves like a numpy array but adds the notion of named fields
239 280 from DataSet (and the ability to view multiple field values as an 'Example').
244 285 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
245 286 Any dataset can also be converted to a numpy array (losing the notion of fields)
246 287 by the numpy.array(dataset) call.
247 288 """
248 289
removed: 249 def __init__(self,dataset=None,data=None,fields={}):
added (new lines 290-349):
290 class Iterator(object):
291 """An iterator over a finite dataset that implements wrap-around"""
292 def __init__(self, dataset, fieldnames, minibatch_size, next_max):
293 self.dataset=dataset
294 self.fieldnames = fieldnames
295 self.minibatch_size=minibatch_size
296 self.next_count = 0
297 self.next_max = next_max
298 self.current = -self.minibatch_size
299 assert minibatch_size > 0
300 if minibatch_size >= len(dataset):
301 raise NotImplementedError()
302
303 def __iter__(self):
304 #Why do we do this? -JB
305 return self
306
307 @staticmethod
308 def matcat(a, b):
309 a0, a1 = a.shape
310 b0, b1 = b.shape
311 assert a1 == b1
312 assert a.dtype is b.dtype
313 rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
314 rval[:a0,:] = a
315 rval[a0:,:] = b
316 return rval
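# For instance (hypothetical shapes): matcat(numpy.ones((2,3)),
# numpy.zeros((1,3))) gives a (3,3) array, ones on top, zeros below;
# next() below uses this to stitch a wrap-around minibatch together.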
317
318 def next(self):
319
320 #check for end-of-loop
321 self.next_count += 1
322 if self.next_count == self.next_max:
323 raise StopIteration
324
325 #determine the first and last elements of the slice we'll return
326 self.current += self.minibatch_size
327 if self.current >= len(self.dataset):
328 self.current -= len(self.dataset)
329 upper = self.current + self.minibatch_size
330
331 if upper <= len(self.dataset):
332 #this is the easy case, we only need one slice
333 dataview = self.dataset.data[self.current:upper]
334 else:
335 # the minibatch wraps around the end of the dataset
336 dataview = self.dataset.data[self.current:]
337 upper -= len(self.dataset)
338 assert upper > 0
339 dataview = self.matcat(dataview, self.dataset.data[:upper])
340
341
342 rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames]
343
344 if self.fieldnames:
345 rval = Example(self.fieldnames, rval)
346
347 return rval
348
349
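A worked example of the wrap-around behaviour: with len(dataset) == 5 and
minibatch_size == 2, successive next() calls return row ranges [0:2], [2:4],
then rows [4:5] and [0:1] stitched together by matcat, then [1:3], and so on,
so every minibatch has exactly 2 rows.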
added: 350 def __init__(self, data, fields=None):
250 """ 351 """
251 There are two ways to construct an ArrayDataSet: (1) from an 352 There are two ways to construct an ArrayDataSet: (1) from an
252 existing dataset (which may result in a copy of the data in a numpy array), 353 existing dataset (which may result in a copy of the data in a numpy array),
253 or (2) from a numpy.array (the data argument), along with an optional description 354 or (2) from a numpy.array (the data argument), along with an optional description
254 of the fields (dictionary of column slices indexed by field names). 355 of the fields (a LookupList of column slices indexed by field names).
255 """ 356 """
removed (old lines 256-285):
256 if dataset!=None:
257 assert data==None and fields=={}
258 # Make ONE big minibatch with all the examples, to separate the fields.
259 n_examples=len(dataset)
260 batch = dataset.minibatches(n_examples).next()
261 # Each field of the underlying dataset must be convertible to a numpy array of the same type
262 # currently just double, but should use the smallest compatible dtype
263 n_fields = len(batch)
264 fieldnames = batch.fields.keys()
265 total_width = 0
266 type = None
267 for i in xrange(n_fields):
268 field = array(batch[i])
269 assert field.shape[0]==n_examples
270 width = field.shape[1]
271 start=total_width
272 total_width += width
273 fields[fieldnames[i]]=slice(start,total_width,1)
274 # many complicated things remain to be done:
275 # - find common dtype
276 # - decide what to do with extra dimensions if not the same in all fields
277 # - try to see if we can avoid the copy?
278 raise NotImplementedError
279 if data!=None:
280 assert dataset==None
281 self.data=data
282 self.fields=fields
283 self.width = data.shape[1]
284 for fieldname in fields:
285 fieldslice=fields[fieldname]
added (new lines 357-362):
357 self.data=data
358 self.fields=fields
359 rows, cols = data.shape
360
361 if fields:
362 for fieldname,fieldslice in fields.items():
286 363 # make sure fieldslice.start and fieldslice.step are defined
287 364 start=fieldslice.start
288 365 step=fieldslice.step
289 366 if not start:
290 367 start=0
291 368 if not step:
292 369 step=1
293 370 if not fieldslice.start or not fieldslice.step:
294 371 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
295 372 # and coherent with the data array
removed: 296 assert fieldslice.start>=0 and fieldslice.stop<=self.width
added: 373 assert fieldslice.start >= 0 and fieldslice.stop <= cols
added (new lines 374-388):
374
375 def minibatches(self,
376 fieldnames = DataSet.minibatches_fieldnames,
377 minibatch_size = DataSet.minibatches_minibatch_size,
378 n_batches = DataSet.minibatches_n_batches):
379 """
380 If fieldnames is None, it means that we want to see ALL the fields.
381
382 If n_batches is None, we want to see all the examples possible
383 for the given minibatch_size.
384 """
385 # substitute the defaults:
386 if fieldnames is None: fieldnames = self.fieldNames()
387 if n_batches is None: n_batches = len(self) / minibatch_size
388 return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
297 389
298 390 def __getattr__(self,fieldname):
299 391 """
300 392 Return a numpy array with the content associated with the given field name.
301 393 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
310 402 min_col=self.data.shape[1]
311 403 max_col=0
312 404 for field_slice in self.fields.values():
313 405 min_col=min(min_col,field_slice.start)
314 406 max_col=max(max_col,field_slice.stop)
removed (old lines 315-318):
315 new_fields={}
316 for field in self.fields:
317 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
318 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
added (new lines 407-410):
407 new_fields=LookupList()
408 for fieldname,fieldslice in self.fields.items():
409 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
410 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
319 411
320 412 def fieldNames(self):
321 413 """Return the list of field names that are supported by getattr and getFields."""
322 414 return self.fields.keys()
323 415
330 422 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
331 423 the result is just a numpy array (for the i-th row of the dataset data matrix).
332 424 """
333 425 if self.fields:
334 426 fieldnames,fieldslices=zip(*self.fields.items())
removed: 335 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
added: 427 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
336 428 else:
337 429 return self.data[i]
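For instance, with the ArrayDataSet 'd' sketched after the constructor
docstring above, d[1] is an Example such that d[1]['input'] equals
d.data[1, 0:2], while the attribute access d.input returns the whole
(4, 2) 'input' column block as a numpy array.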
338 430
removed: 339 def __getslice__(self,*slice_args):
added: 431 def __getslice__(self,*args):
340 432 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
removed: 341 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
added: 433 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
342 434
343 435 def __array__(self):
344 436 """Return a view of this dataset which is a numpy.ndarray
345 437
346 438 Numpy uses this special function name to retrieve an ndarray view for