Mercurial > pylearn

comparison pylearn/old_dataset/dataset.py @ 537:b054271b2504
new file structure layout, factories, etc.

author   James Bergstra <bergstrj@iro.umontreal.ca>
date     Wed, 12 Nov 2008 21:57:54 -0500
parents  dataset.py@fb62f0e4bcfe
children (none)
comparing 518:4aa7f74ea93f with 537:b054271b2504

from lookup_list import LookupList as Example
from common.misc import unique_elements_list_intersection
from string import join
from sys import maxint
import numpy, copy

from exceptions import *

class AttributesHolder(object):
    def __init__(self): pass

    def attributeNames(self):
        raise AbstractFunction()

    def setAttributes(self,attribute_names,attribute_values,make_copies=False):
        """
        Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1.
        """
        if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ):
            attribute_values = [attribute_values]
        if make_copies:
            for name,value in zip(attribute_names,attribute_values):
                self.__setattr__(name,copy.deepcopy(value))
        else:
            for name,value in zip(attribute_names,attribute_values):
                self.__setattr__(name,value)

    def getAttributes(self,attribute_names=None, return_copy=False):
        """
        Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes.
        """
        if attribute_names is None:
            attribute_names = self.attributeNames()
        if return_copy:
            return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
        else:
            return [self.__getattribute__(name) for name in attribute_names]

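# Illustrative usage of AttributesHolder (a sketch only, not part of the
# original file; 'holder' stands for any AttributesHolder subclass instance):
#   holder.setAttributes(['description'], 'my dataset')  # a single value is OK for one name
#   holder.getAttributes(['description'])                # -> ['my dataset']
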
class DataSet(AttributesHolder):
    """A virtual base class for datasets.

    A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
    with learning algorithms (for training and testing them): rows/records are called examples, and
    columns/attributes are called fields. The field value for a particular example can be an arbitrary
    python object, which depends on the particular dataset.

    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
    should return sys.maxint).

    A DataSet is a generator of iterators; these iterators can run through the
    examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning (however, as noted below, some operations are not
    feasible or not recommended on streams).

    To iterate over examples, there are several possibilities:
      - for example in dataset:
      - for val1,val2,... in dataset:
      - for example in dataset(field1, field2, field3, ...):
      - for val1,val2,val3 in dataset(field1, field2, field3):
      - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N):
      - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
    Each of these is documented below. All of these iterators are expected
    to provide, in addition to the usual 'next()' method, a 'next_index()' method
    which returns a non-negative integer pointing to the position of the next
    example that will be returned by 'next()' (or of the first example in the
    next minibatch returned). This is important because these iterators
    can wrap around the dataset in order to do multiple passes through it,
    in possibly irregular ways if the minibatch size is not a divisor of the
    dataset length.

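    For instance, with a dataset that has fields 'input' and 'target'
    (the names are illustrative only)::

        for input, target in dataset('input', 'target'):
            ...
        for minibatch in dataset.minibatches(['input','target'], minibatch_size=10):
            inputs = minibatch['input']   # a container of 10 input values
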
    To iterate over fields, one can do
      - for field in dataset.fields():
            for field_value in field: # iterate over the values associated to that field for all the dataset examples
      - for field in dataset(field1,field2,...).fields() to select a subset of fields
      - for field in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
      - for field_examples in dataset.fields():
            for example_value in field_examples:
                ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a L{DataSetFields} object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method::

        dataset2 = dataset1.fields().examples()

    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type. Field values can also be 'missing'
    (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
    fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value;
    for non-numeric values, None plays that role.

    Dataset elements can be indexed and sub-datasets (with a subset
    of examples) can be extracted. These operations are not supported
    by default in the case of streams.

      - dataset[:n] returns an Example with the n first examples.

      - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...,i2-s.

      - dataset[i] returns an Example.

      - dataset[[i1,i2,...,in]] returns an Example with examples i1,i2,...,in.

    A similar command gives you a DataSet instead of Examples:

      - dataset.subset[:n] returns a DataSet with the n first examples.

      - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...,i2-s.

      - dataset.subset[i] returns a DataSet.

      - dataset.subset[[i1,i2,...,in]] returns a DataSet with examples i1,i2,...,in.

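    For instance (a sketch only, 'ds' being any finite dataset)::

        example = ds[3]           # an Example (LookupList) of field values
        first10 = ds.subset[:10]  # a DataSet holding the first 10 examples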

      - dataset.<property> returns the value of a property associated with
        the name <property>. The following properties should be supported:
          - 'description': a textual description or name for the dataset
          - 'fieldtypes': a list of types (one per field)

    A DataSet may have other attributes that it makes visible to other objects. These are
    used to store information that is not example-wise but global to the dataset.
    The list of names of these attributes is given by the attributeNames() method.

    Datasets can be concatenated either vertically (increasing the length) or
    horizontally (augmenting the set of fields), if they are compatible, using
    the following operations (with the same basic semantics as numpy.hstack
    and numpy.vstack):

      - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])

        creates a new dataset whose list of fields is the concatenation of the lists of
        fields of the argument datasets. This only works if they all have the same length.

      - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])

        creates a new dataset that concatenates the examples from the argument datasets
        (and whose length is the sum of the lengths of the argument datasets). This only
        works if they all have the same fields.

    According to the same logic, and viewing a DataSetFields object associated to
    a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
    a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
    examples.

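    For instance (a sketch only, ds1 and ds2 being compatible datasets)::

        wider  = ds1 | ds2   # same length, union of the fields
        longer = ds1 & ds2   # same fields, examples of ds1 then of ds2
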
    A dataset can hold arbitrary key-value pairs that may be used to access meta-data
    or other properties of the dataset or associated with the dataset or the result
    of a computation stored in a dataset. These can be accessed through the [key] syntax
    when key is a string (or more specifically, neither an integer, a slice, nor a list).

    A DataSet sub-class should always redefine the following methods:
      - __len__ if it is not a stream
      - fieldNames
      - minibatches_nowrap (called by DataSet.minibatches())
    For efficiency of implementation, a sub-class might also want to redefine
      - valuesHStack
      - valuesVStack
      - hasFields
      - __getitem__ may not be feasible with some streams
      - __iter__
    A sub-class should also append attributes to self._attribute_names
    (the default value returned by attributeNames()).
    By convention, attributes not in attributeNames() should have a name
    starting with an underscore.
    @todo enforce/test that convention!
    """

    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)

    def __init__(self, description=None, fieldnames=None, fieldtypes=None):
        """
        @type fieldnames: list of strings
        @type fieldtypes: list of python types, same length as fieldnames
        @type description: string
        @param description: description/name for this dataset
        """
        def default_desc():
            return type(self).__name__ \
                    + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"

        #self.fieldnames = fieldnames

        self.fieldtypes = fieldtypes if fieldtypes is not None \
                else [None]*1 #len(fieldnames)

        self.description = default_desc() if description is None \
                else description
        self._attribute_names = ["description"]

    def attributeNames(self):
        return copy.copy(self._attribute_names)

    def __contains__(self, fieldname):
        return (fieldname in self.fieldNames()) \
                or (fieldname in self.attributeNames())

    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self. Every field of "i" will give access to
        a field of a single example. Fields should be accessible via
        i["fieldname"] or i[3] (in the order defined by the elements of the
        Example returned by this iterator), but the derived class is free
        to accept any type of identifier, and add extra functionality to the iterator.

        The default implementation calls the minibatches iterator and extracts the first example of each field.
        """
        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))

    def __len__(self):
        """
        len(dataset) returns the number of examples in the dataset.
        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
        Sub-classes which implement finite-length datasets should redefine this method.
        Some methods only make sense for finite-length datasets.
        """
        from sys import maxint
        return maxint


    class MinibatchToSingleExampleIterator(object):
        """
        Converts the result of a minibatch iterator with minibatch_size==1 into
        single-example values in the result. Therefore the result of
        iterating on the dataset itself gives a sequence of single examples
        (whereas the result of iterating over minibatches gives in each
        Example field an iterable object over the individual examples in
        the minibatch).
        """
        def __init__(self, minibatch_iterator):
            self.minibatch_iterator = minibatch_iterator
            self.minibatch = None
        def __iter__(self): #makes for loop work
            return self
        def next(self):
            size1_minibatch = self.minibatch_iterator.next()
            if not self.minibatch:
                names = size1_minibatch.keys()
                # the next lines are a hack, but there was a problem when we were getting [array(327)] for instance
                try:
                    values = [value[0] for value in size1_minibatch.values()]
                except:
                    values = [value for value in size1_minibatch.values()]
                self.minibatch = Example(names,values)
            else:
                self.minibatch._values = [value[0] for value in size1_minibatch.values()]
            return self.minibatch

        def next_index(self):
            return self.minibatch_iterator.next_index()

    class MinibatchWrapAroundIterator(object):
        """
        An iterator for minibatches that handles the case where we need to wrap around the
        dataset because n_batches*minibatch_size > len(dataset). It is constructed from
        a dataset that provides a minibatch iterator that does not need to handle that problem.
        This class is a utility for dataset subclass writers, so that they do not have to handle
        this issue multiple times, nor check that fieldnames are valid, nor handle the
        empty fieldnames (meaning 'use all the fields').
        """
        def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
            self.dataset=dataset
            self.fieldnames=fieldnames
            self.minibatch_size=minibatch_size
            self.n_batches=n_batches
            self.n_batches_done=0
            self.next_row=offset
            self.L=len(dataset)
            self.offset=offset % self.L
            ds_nbatches = (self.L-self.next_row)/self.minibatch_size
            if n_batches is not None:
                ds_nbatches = min(n_batches,ds_nbatches)
            if fieldnames:
                assert dataset.hasFields(*fieldnames)
            else:
                self.fieldnames=dataset.fieldNames()
            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,ds_nbatches,self.next_row)

        def __iter__(self):
            return self

        def next_index(self):
            return self.next_row

        def next(self):
            if self.n_batches and self.n_batches_done==self.n_batches:
                raise StopIteration
            elif not self.n_batches and self.next_row==self.L:
                raise StopIteration
            upper = self.next_row+self.minibatch_size
            if upper <= self.L:
                minibatch = self.iterator.next()
            else:
                if not self.n_batches:
                    upper = min(upper, self.L)
                    # if there is not a fixed number of batches, we continue to the end of the dataset;
                    # this can create a minibatch that is smaller than the minibatch_size
                    assert (self.L-self.next_row)<=self.minibatch_size
                    minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                else:
                    # we must concatenate (vstack) the bottom and top parts of our minibatch
                    # first get the beginning of our minibatch (top of dataset)
                    first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                    second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
                    minibatch = Example(self.fieldnames,
                                        [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
                                         for name in self.fieldnames])
            self.next_row=upper
            self.n_batches_done+=1
            if upper >= self.L and self.n_batches:
                self.next_row -= self.L
                ds_nbatches = (self.L-self.next_row)/self.minibatch_size
                if self.n_batches is not None:
                    ds_nbatches = min(self.n_batches,ds_nbatches)
                self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
                                                                ds_nbatches,self.next_row)
            return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
                                                  self.dataset.valuesHStack),
                                 minibatch.keys())


    minibatches_fieldnames = None
    minibatches_minibatch_size = 1
    minibatches_n_batches = None
    def minibatches(self,
                    fieldnames = minibatches_fieldnames,
                    minibatch_size = minibatches_minibatch_size,
                    n_batches = minibatches_n_batches,
                    offset = 0):
        """
        Return an iterator that supports three forms of syntax:

            for i in dataset.minibatches(None,**kwargs): ...

            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

        Using the first two syntaxes, "i" will be an indexable object, such as a list,
        tuple, or Example instance. In both cases, i[k] is a list-like container
        of a batch of current examples. In the second case, i[0] is a
        list-like container of the f1 field of a batch of current examples, i[1] is
        a list-like container of the f2 field, etc.

        Using the first syntax, all the fields will be returned in "i".
        Using the third syntax, i1, i2, i3 will be list-like containers of the
        f1, f2, and f3 fields of a batch of examples on each loop iteration.

        The minibatches iterator is expected to return upon each call to next()
        a DataSetFields object, which is an Example (indexed by the field names) whose
        elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
        a sub-dataset that can be used to iterate over the individual examples
        in the minibatch. Hence a minibatch can be converted back to a regular
        dataset or its fields can be looked at individually (and possibly iterated over).

        PARAMETERS
        - fieldnames (list of any type, default None):
          The loop variables i1, i2, i3 (in the example above) should contain the
          f1, f2, and f3 fields of the current batch of examples. If None, the
          derived class can choose a default, e.g. all fields.

        - minibatch_size (integer, default 1)
          On every iteration, the variables i1, i2, i3 will have
          exactly minibatch_size elements. e.g. len(i1) == minibatch_size

        @DEPRECATED n_batches : not used anywhere
        - n_batches (integer, default None)
          The iterator will loop exactly this many times, and then stop. If None,
          the derived class can choose a default. If (-1), then the returned
          iterator should support looping indefinitely.

        - offset (integer, default 0)
          The iterator will start at example 'offset' in the dataset, rather than the default.

        Note: A list-like container is something like a tuple, list, numpy.ndarray or
        any other object that supports integer indexing and slicing.

        @ATTENTION: minibatches now delegates to minibatches_nowrap, which is supposed to
        return complete batches only, and to raise StopIteration otherwise.
        @ATTENTION: minibatches returns a LookupList; we can't iterate over examples on it.

        """
        #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset)
        assert offset >= 0
        assert offset < len(self)
        assert offset + minibatch_size -1 < len(self)
        if fieldnames == None:
            fieldnames = self.fieldNames()
        return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        This is the minibatches iterator generator that sub-classes must define.
        It does not need to worry about wrapping around multiple times across the dataset,
        as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called.
        The next() method of the returned iterator does not even need to worry about
        the termination condition (as StopIteration will be raised by DataSet.minibatches
        before an improper call to minibatches_nowrap's next() is made).
        That next() method can assert that its next row will always be within [0,len(dataset)).
        The iterator returned by minibatches_nowrap does not need to implement
        a next_index() method either, as this will be provided by MinibatchWrapAroundIterator.
        """
        raise AbstractFunction()

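    # A minimal sketch of what a subclass's minibatches_nowrap could look like,
    # for a dataset backed by a hypothetical dict '_data' of equal-length lists
    # (illustrative only, not part of the original file):
    #
    #   def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    #       while offset+minibatch_size <= len(self):   # complete batches only
    #           yield Example(fieldnames,
    #                         [self._data[f][offset:offset+minibatch_size]
    #                          for f in fieldnames])
    #           offset += minibatch_size
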
    def is_unbounded(self):
        """
        Tests whether a dataset is unbounded (e.g. a stream).
        """
        return len(self)==maxint

    def hasFields(self,*fieldnames):
        """
        Return true if the given field name (or field names, if multiple arguments are
        given) is recognized by the DataSet (i.e. can be used as a field name in one
        of the iterators).

        The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames()
        method. Many datasets may store their field names in a dictionary, which would allow more efficiency.
        """
        # all the given names must be known to this dataset
        return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))==len(fieldnames)

    def fieldNames(self):
        """
        Return the list of field names that are supported by the iterators,
        and for which hasFields(fieldname) would return True.
        """
        raise AbstractFunction()

    def __call__(self,*fieldnames):
        """
        Return a dataset that sees only the fields whose names are specified.
        """
        assert self.hasFields(*fieldnames)
        #return self.fields(*fieldnames).examples()
        fieldnames_list = list(fieldnames)
        return FieldsSubsetDataSet(self,fieldnames_list)

    def cached_fields_subset(self,*fieldnames):
        """
        Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached.
        @see: dataset.__call__
        """
        assert self.hasFields(*fieldnames)
        return self.fields(*fieldnames).examples()

    def fields(self,*fieldnames):
        """
        Return a DataSetFields object associated with this dataset.
        """
        return DataSetFields(self,fieldnames)

    def getitem_key(self, fieldname):
        """A not-so-well thought-out place to put code that used to be in
        getitem.
        """
        #removing as per discussion June 4. --JSB

        i = fieldname
        # else check for a fieldname
        if self.hasFields(i):
            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
        # else we are trying to access a property of the dataset
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def __getitem__(self,i):
        """
        @rtype: Example
        @returns: single or multiple examples

        @type i: integer or slice or <iterable> of integers
        @param i:
            dataset[i] returns the (i+1)-th example of the dataset.
            dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
            dataset[i:j:s] returns a LookupList with examples i,i+s,i+2*s,...,j-s.
            dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.

        @note:
        Some stream datasets may be unable to implement random access, i.e.
        arbitrary slicing/indexing, because they can only iterate through
        examples one or a minibatch at a time and do not actually store or keep
        past (or future) examples.

        The default implementation of getitem uses the minibatches iterator
        to obtain one example, one slice, or a list of examples. It may not
        always be the most efficient way to obtain the result, especially if
        the data are actually stored in a memory array.
        """

        if type(i) is int:
            assert i >= 0 # TBM: see if someone complains and wants negative i
            if i >= len(self):
                raise IndexError
            i_batch = self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=1, n_batches=1, offset=i)
            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()

        #if i is a contiguous slice
        if type(i) is slice and (i.step in (None, 1)):
            offset = 0 if i.start is None else i.start
            upper_bound = len(self) if i.stop is None else i.stop
            upper_bound = min(len(self), upper_bound)
            #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
            #        minibatch_size=upper_bound - offset,
            #        n_batches=1,
            #        offset=offset).next())
            # now returns a LookupList
            return self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=upper_bound - offset,
                    n_batches=1,
                    offset=offset).next()

        # if slice has a step param, convert it to list and handle it with the
        # list code
        if type(i) is slice:
            offset = 0 if i.start is None else i.start
            upper_bound = len(self) if i.stop is None else i.stop
            upper_bound = min(len(self), upper_bound)
            i = list(range(offset, upper_bound, i.step))

        # handle tuples, arrays, lists
        if hasattr(i, '__getitem__'):
            for idx in i:
                #dis-allow nested slices
                if not isinstance(idx, int):
                    raise TypeError(idx)
                if idx >= len(self):
                    raise IndexError
            # call back into self.__getitem__
            examples = [self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=1, n_batches=1, offset=ii).next()
                    for ii in i]
            # re-index the fields in each example by field instead of by example
            field_values = [[] for blah in self.fieldNames()]
            for e in examples:
                for f,v in zip(field_values, e):
                    f.append(v)
            #build them into a LookupList (a.k.a. Example)
            zz = zip(self.fieldNames(),field_values)
            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
            example = Example(self.fieldNames(), vst)
            #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
            # now returns a LookupList
            return example

        # what in the world is i?
        raise TypeError(i, type(i))

    """
    Enables the call dataset.subset[a:b:c] that will return a DataSet
    around the examples returned by __getitem__(slice(a,b,c))

    @SEE DataSet.__getsubset(self)
    """
    subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet")

    def __getsubset(self):
        """
        Enables the call data.subset[a:b:c], returns a DataSet.
        Default implementation is a simple wrap around __getitem__() using MinibatchDataSet.

        @RETURN DataSet
        @SEE DataSet.subset = property(lambda s : s.__getsubset())
        """
        _self = self
        class GetSliceReturnsDataSet(object):
            def __getitem__(self,slice):
                return MinibatchDataSet(_self.__getitem__(slice))
        return GetSliceReturnsDataSet()


    def valuesHStack(self,fieldnames,fieldvalues):
        """
        Return a value that corresponds to concatenating (horizontally) several field values.
        This can be useful to merge some fields. The implementation of this operation is likely
        to involve a copy of the original values. When the values are numpy arrays, the
        result should be numpy.hstack(values). If it makes sense, this operation should
        work as well when each value corresponds to multiple examples in a minibatch,
        e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
        then the result should be a Lx(N1+N2+...) matrix equal to numpy.hstack(values).
        The default is to use numpy.hstack for numpy.ndarray values, and a list
        pointing to the original values for other data types.
        """
        all_numpy=True
        for value in fieldvalues:
            if not type(value) is numpy.ndarray:
                all_numpy=False
        if all_numpy:
            return numpy.hstack(fieldvalues)
        # the default implementation of horizontal stacking is to put values in a list
        return fieldvalues

    def valuesVStack(self,fieldname,values):
        """
        @param fieldname: the name of the field from which the values were taken
        @type fieldname: any type

        @param values: bits near the beginning or end of the dataset
        @type values: list of minibatches (returned by minibatches_nowrap)

        @return: the concatenation (stacking) of the values
        @rtype: something suitable as a minibatch field
        """
        rval = []
        for v in values:
            rval.extend(v)
        return rval

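    # With this default valuesVStack (illustrative sketch only):
    #   valuesVStack('x', [[1,2],[3,4]])  # -> [1,2,3,4]
    # ArrayFieldsDataSet overrides it with numpy.vstack, so that two
    # minibatch matrices are stacked into a taller matrix instead.
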
    def __or__(self,other):
        """
        dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the lists of
        fields of the argument datasets. This only works if they all have the same length.
        """
        return HStackedDataSet([self,other])

    def __and__(self,other):
        """
        dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
        (and whose length is the sum of the lengths of the argument datasets). This only
        works if they all have the same fields.
        """
        return VStackedDataSet([self,other])

def hstack(datasets):
    """
    hstack([dataset1,dataset2,...]) returns dataset1 | dataset2 | ...
    which is a dataset whose fields list is the concatenation of the fields
    of the individual datasets.
    """
    assert len(datasets)>0
    if len(datasets)==1:
        return datasets[0]
    return HStackedDataSet(datasets)

def vstack(datasets):
    """
    vstack([dataset1,dataset2,...]) returns dataset1 & dataset2 & ...
    which is a dataset which iterates first over the examples of dataset1, then
    over those of dataset2, etc.
    """
    assert len(datasets)>0
    if len(datasets)==1:
        return datasets[0]
    return VStackedDataSet(datasets)
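
# Illustrative usage of hstack/vstack (a sketch only, not part of the original
# file; the ds_* names stand for hypothetical compatible datasets):
#   wider  = hstack([ds_inputs, ds_targets])  # same length, more fields
#   longer = vstack([ds_train1, ds_train2])   # same fields, more examples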

class FieldsSubsetDataSet(DataSet):
    """
    A sub-class of L{DataSet} that selects a subset of the fields.
    """
    def __init__(self,src,fieldnames):
        self.src=src
        self.fieldnames=fieldnames
        assert src.hasFields(*fieldnames)
        self.valuesHStack = src.valuesHStack
        self.valuesVStack = src.valuesVStack

    def __len__(self): return len(self.src)

    def fieldNames(self):
        return self.fieldnames

    def __iter__(self):
        class FieldsSubsetIterator(object):
            def __init__(self,ds):
                self.ds=ds
                self.src_iter=ds.src.__iter__()
                self.example=None
            def __iter__(self): return self
            def next(self):
                complete_example = self.src_iter.next()
                if self.example:
                    self.example._values=[complete_example[field]
                                          for field in self.ds.fieldnames]
                else:
                    self.example=Example(self.ds.fieldnames,
                                         [complete_example[field] for field in self.ds.fieldnames])
                return self.example
        return FieldsSubsetIterator(self)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        assert self.hasFields(*fieldnames)
        return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
    def dontuse__getitem__(self,i):
        return FieldsSubsetDataSet(self.src[i],self.fieldnames)

class RenamedFieldsDataSet(DataSet):
    """
    A sub-class of L{DataSet} that selects and renames a subset of the fields.
    """
    def __init__(self,src,src_fieldnames,new_fieldnames):
        self.src=src
        self.src_fieldnames=src_fieldnames
        self.new_fieldnames=new_fieldnames
        assert src.hasFields(*src_fieldnames)
        assert len(src_fieldnames)==len(new_fieldnames)
        self.valuesHStack = src.valuesHStack
        self.valuesVStack = src.valuesVStack
        self.lookup_fields = Example(new_fieldnames,src_fieldnames)

    def __len__(self): return len(self.src)

    def fieldNames(self):
        return self.new_fieldnames

    def __iter__(self):
        class FieldsSubsetIterator(object):
            def __init__(self,ds):
                self.ds=ds
                self.src_iter=ds.src.__iter__()
                self.example=None
            def __iter__(self): return self
            def next(self):
                complete_example = self.src_iter.next()
                if self.example:
                    self.example._values=[complete_example[field]
                                          for field in self.ds.src_fieldnames]
                else:
                    self.example=Example(self.ds.new_fieldnames,
                                         [complete_example[field]
                                          for field in self.ds.src_fieldnames])
                return self.example
        return FieldsSubsetIterator(self)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        assert self.hasFields(*fieldnames)
        cursor = Example(fieldnames,[0]*len(fieldnames))
        for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset):
            cursor._values=batch._values
            yield cursor

    def __getitem__(self,i):
        #return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
        complete_example = self.src[i]
        return Example(self.new_fieldnames,
                       [complete_example[field]
                        for field in self.src_fieldnames])
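
    # Illustrative usage (a sketch only, not part of the original file;
    # 'ds' stands for a dataset with fields 'x' and 'y'):
    #   renamed = RenamedFieldsDataSet(ds, ['x','y'], ['input','target'])
    #   renamed.fieldNames()   # -> ['input','target']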


class DataSetFields(Example):
    """
    Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
    DataSetFields iterates over fields (like columns of a matrix), and can be understood
    as a transpose of the associated dataset.

    To iterate over fields, one can do
      * for fields in dataset.fields()
      * for fields in dataset(field1,field2,...).fields() to select a subset of fields
      * for fields in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
      * for field_examples in dataset.fields():
            for example_value in field_examples:
                ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a DataSetFields object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method::

        dataset2 = dataset1.fields().examples()

    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    DataSetFields can be concatenated vertically or horizontally. To be consistent with
    the syntax used for DataSets, the | concatenates the examples and the & concatenates
    the fields.
    """
    def __init__(self,dataset,fieldnames):
        original_dataset=dataset
        if not fieldnames:
            fieldnames=dataset.fieldNames()
        elif not list(fieldnames)==list(dataset.fieldNames()):
            #we must cast to list, otherwise ('x','y')!=['x','y']
            dataset = FieldsSubsetDataSet(dataset,fieldnames)
        assert dataset.hasFields(*fieldnames)
        self.dataset=dataset

        if isinstance(dataset,MinibatchDataSet):
            Example.__init__(self,fieldnames,list(dataset._fields))
        elif isinstance(original_dataset,MinibatchDataSet):
            Example.__init__(self,fieldnames,
                             [original_dataset._fields[field]
                              for field in fieldnames])
        else:
            minibatch_iterator = dataset.minibatches(fieldnames,
                                                     minibatch_size=len(dataset),
                                                     n_batches=1)
            minibatch=minibatch_iterator.next()
            Example.__init__(self,fieldnames,minibatch)

    def examples(self):
        return self.dataset

    def __or__(self,other):
        """
        fields1 | fields2 is a DataSetFields whose list of examples is the concatenation
        of the lists of examples of DataSetFields fields1 and fields2.
        """
        # concatenating examples corresponds to '&' (vstack) on the underlying datasets
        return (self.examples() & other.examples()).fields()

    def __and__(self,other):
        """
        fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
        of the fields of DataSetFields fields1 and fields2.
        """
        return (self.examples() | other.examples()).fields()


class MinibatchDataSet(DataSet):
    """
    Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset.
    Each element of the lookup-list should be an iterable and sliceable, all of the same length.
    """
    def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
                 values_hstack=DataSet().valuesHStack):
        """
        The user can (and generally should) also provide a values_vstack(fieldname,fieldvalues)
        and a values_hstack(fieldnames,fieldvalues) function behaving with the same
        semantics as the DataSet methods of the same name (but without the self argument).
        """

        self._fields=fields_lookuplist
        assert len(fields_lookuplist)>0
        self.length=len(fields_lookuplist[0])
        for field in fields_lookuplist[1:]:
            if self.length != len(field):
                print 'self.length =',self.length
                print 'len(field) =',len(field)
                print 'self._fields.keys() =',self._fields.keys()
                print 'field =',field
                print 'fields_lookuplist =',fields_lookuplist
            assert self.length==len(field)
        self.valuesVStack=values_vstack
        self.valuesHStack=values_hstack

    def __len__(self):
        return self.length

    def dontuse__getitem__(self,i):
        if type(i) in (slice,list):
            return DataSetFields(MinibatchDataSet(
                Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
        if type(i) is int:
            return Example(self._fields.keys(),[field[i] for field in self._fields])
        if self.hasFields(i):
            return self._fields[i]
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def fieldNames(self):
        return self._fields.keys()

    def hasFields(self,*fieldnames):
        for fieldname in fieldnames:
            if fieldname not in self._fields.keys():
                return False
        return True

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        #@TODO bug somewhere here, fieldnames doesn't seem to be well handled
        class Iterator(object):
            def __init__(self,ds,fieldnames):
                # tbm: added the next two lines to handle fieldnames
                if fieldnames is None: fieldnames = ds._fields.keys()
                self.fieldnames = fieldnames

                self.ds=ds
                self.next_example=offset
                assert minibatch_size >= 0
                if offset+minibatch_size > ds.length:
                    raise NotImplementedError()
            def __iter__(self):
                return self
            def next(self):
                upper = self.next_example+minibatch_size
                if upper > len(self.ds):
                    raise StopIteration()
                assert upper<=len(self.ds) # instead of self.ds.length
                #minibatch = Example(self.ds._fields.keys(),
                #                    [field[self.next_example:upper]
                #                     for field in self.ds._fields])
                # tbm: modif to use fieldnames
                values = []
                for f in self.fieldnames:
                    #print 'we have field',f,'in fieldnames'
                    values.append( self.ds._fields[f][self.next_example:upper] )
                minibatch = Example(self.fieldnames,values)
                #print minibatch
                self.next_example+=minibatch_size
                return minibatch

        # tbm: added fieldnames to handle subset of fieldnames
        return Iterator(self,fieldnames)
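
    # Illustrative usage (a sketch only, not part of the original file):
    #   mbds = MinibatchDataSet(Example(['x','y'], [[1,2,3],[4,5,6]]))
    #   len(mbds)           # -> 3
    #   mbds.fieldNames()   # -> ['x','y']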

class HStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
    i.e. whose list of fields is the concatenation of their lists of fields.

    If a field name is found in more than one of the datasets, then either an error is
    raised or the fields are renamed (either by prefixing the __name__ attribute
    of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).

    @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
    """
    def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
        DataSet.__init__(self,description,field_types)
        self.datasets=datasets
        self.accept_nonunique_names=accept_nonunique_names
        self.fieldname2dataset={}

        def rename_field(fieldname,dataset,i):
            if hasattr(dataset,"__name__"):
                return dataset.__name__ + "." + fieldname
            return fieldname+"."+str(i)

        # make sure all datasets have the same length and unique field names
        self.length=None
        names_to_change=[]
        for i in xrange(len(datasets)):
            dataset = datasets[i]
            length=len(dataset)
            if self.length:
                assert self.length==length
            else:
                self.length=length
            for fieldname in dataset.fieldNames():
                if fieldname in self.fieldname2dataset: # name conflict!
                    if accept_nonunique_names:
                        fieldname=rename_field(fieldname,dataset,i)
                        names_to_change.append((fieldname,i))
                    else:
                        raise ValueError("Incompatible datasets: non-unique field name = "+fieldname)
                self.fieldname2dataset[fieldname]=i
        for fieldname,i in names_to_change:
            del self.fieldname2dataset[fieldname]
            self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i

    def __len__(self):
        return len(self.datasets[0])

    def hasFields(self,*fieldnames):
        for fieldname in fieldnames:
            if not fieldname in self.fieldname2dataset:
                return False
        return True

    def fieldNames(self):
        return self.fieldname2dataset.keys()

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):

        class HStackedIterator(object):
            def __init__(self,hsds,iterators):
                self.hsds=hsds
                self.iterators=iterators
            def __iter__(self):
                return self
            def next(self):
                # concatenate all the fields of the minibatches
                l=Example()
                for iter in self.iterators:
                    l.append_lookuplist(iter.next())
                return l

        assert self.hasFields(*fieldnames)
        # find out which underlying datasets are necessary to service the required fields
        # and construct corresponding minibatch iterators
        if fieldnames and fieldnames!=self.fieldNames():
            datasets=set([])
            fields_in_dataset=dict([(dataset,[]) for dataset in self.datasets])
            for fieldname in fieldnames:
                dataset=self.datasets[self.fieldname2dataset[fieldname]]
                datasets.add(dataset)
                fields_in_dataset[dataset].append(fieldname)
            datasets=list(datasets)
            iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset)
                       for dataset in datasets]
        else:
            datasets=self.datasets
            iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets]
        return HStackedIterator(self,iterators)


    def untested_valuesVStack(self,fieldname,fieldvalues):
        return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)

    def untested_valuesHStack(self,fieldnames,fieldvalues):
        """
        We will use the sub-dataset associated with the first fieldname in the fieldnames list
        to do the work, hoping that it can cope with the other values (i.e. won't care
        about the incompatible fieldnames). Hence this heuristic will always work if
        all the fieldnames are of the same sub-dataset.
        """
        return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)

class VStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        self.datasets=datasets
        self.length=0
        self.index2dataset={}
        assert len(datasets)>0
        fieldnames = datasets[-1].fieldNames()
        self.datasets_start_row=[]
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        for k,dataset in enumerate(datasets[0:-1]):
            # all VStacked datasets (except possibly the last) must be bounded (have a length)
            assert not dataset.is_unbounded()
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
            assert dataset.fieldNames()==fieldnames
        self.datasets_start_row.append(self.length)
        self.length+=len(datasets[-1])
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print "A more efficient mechanism for index2dataset should be implemented"

    def __len__(self):
        return self.length

    def fieldNames(self):
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """Return (dataset_index, row_within_dataset) for global row number"""
        dataset_index = self.index2dataset[row]
        row_within_dataset = row - self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):

        class VStackedIterator(object):
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset
                self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset)
                self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                  self.next_iterator(vsds.datasets[self.next_dataset_index],self.next_dataset_row,n_batches)

            def next_iterator(self,dataset,starting_offset,batches_left):
                L=len(dataset)
                ds_nbatches = (L-starting_offset)/minibatch_size
                if batches_left is not None:
                    ds_nbatches = min(batches_left,ds_nbatches)
                if minibatch_size>L:
                    ds_minibatch_size=L
                    n_left_in_mb=minibatch_size-L
                    ds_nbatches=1
                else:
                    ds_minibatch_size=minibatch_size
                    n_left_in_mb=0
                return dataset.minibatches(fieldnames,ds_minibatch_size,ds_nbatches,starting_offset), \
                       L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb

            def move_to_next_dataset(self):
                if self.n_left_at_the_end_of_ds>0:
                    self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                      self.next_iterator(self.vsds.datasets[self.next_dataset_index],
                                         self.n_left_at_the_end_of_ds,1)
                else:
                    self.next_dataset_index +=1
                    if self.next_dataset_index==len(self.vsds.datasets):
                        self.next_dataset_index = 0
                    self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                      self.next_iterator(self.vsds.datasets[self.next_dataset_index],0,n_batches)

            def __iter__(self):
                return self

            def next(self):
                dataset=self.vsds.datasets[self.next_dataset_index]
                mb = self.current_iterator.next()
                if self.n_left_in_mb:
                    extra_mb = []
                    while self.n_left_in_mb>0:
                        self.move_to_next_dataset()
                        extra_mb.append(self.current_iterator.next())
                    mb = Example(fieldnames,
                                 [dataset.valuesVStack(name,
                                                       [mb[name]]+[b[name] for b in extra_mb])
                                  for name in fieldnames])

                self.next_row+=minibatch_size
                self.next_dataset_row+=minibatch_size
                if self.next_dataset_row+minibatch_size>len(dataset):
                    self.move_to_next_dataset()
                return mb
        return VStackedIterator(self)

class ArrayFieldsDataSet(DataSet):
    """
    Virtual super-class of datasets whose field values are numpy arrays,
    thus defining valuesHStack and valuesVStack for sub-classes.
    """
    def __init__(self,description=None,field_types=None):
        DataSet.__init__(self,description,field_types)
    def untested_valuesHStack(self,fieldnames,fieldvalues):
        """Concatenate field values horizontally, e.g. two vectors
        become a longer vector, two matrices become a wider matrix, etc."""
        return numpy.hstack(fieldvalues)
    def untested_valuesVStack(self,fieldname,values):
        """Concatenate field values vertically, e.g. two vectors
        become a two-row matrix, two matrices become a taller matrix, etc."""
        return numpy.vstack(values)


class NArraysDataSet(ArrayFieldsDataSet):
    """
    An NArraysDataSet stores fields that are numpy tensors, whose first axis
    iterates over examples. It's a generalization of ArrayDataSet.
    """
    #@TODO not completely implemented yet
    def __init__(self, data_arrays, fieldnames, **kwargs):
        """
        Construct an NArraysDataSet from a list of numpy tensors (data_arrays) and a list
        of fieldnames. The number of arrays must be the same as the number of
        fieldnames. Each tensor must have the same first dimension (first
        axis), corresponding to the number of examples.

        Every tensor is treated as a numpy array (using numpy.asarray).
        """
        ArrayFieldsDataSet.__init__(self,**kwargs)
        assert len(data_arrays) == len(fieldnames)
        assert len(fieldnames) > 0
        ndarrays = [numpy.asarray(a) for a in data_arrays]
        lens = [a.shape[0] for a in ndarrays]
        num_examples = lens[0] #they must all be equal anyway
        self._fieldnames = fieldnames
        for k in ndarrays:
            assert k.shape[0] == num_examples
        self._datas = ndarrays
        # create dict mapping fieldname -> position in self._datas
        self.map_field_idx = dict()
        for k in range(len(fieldnames)):
            self.map_field_idx[fieldnames[k]] = k


    def __len__(self):
        """
        Length of the dataset is based on the first array = data_arrays[0], using its shape
        """
        return self._datas[0].shape[0]

    def fieldNames(self):
        """
        Returns the fieldnames as set in self.__init__
        """
        return self._fieldnames

    def field_pos(self,fieldname):
        """
        Returns the index of a given fieldname. The fieldname must exist! see fieldNames().
        """
        return self.map_field_idx[fieldname]

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
        cursor = Example(fieldnames,[0]*len(fieldnames))
        if n_batches == None:
            n_batches = (len(self) - offset) / minibatch_size
        for n in xrange(n_batches):
            if offset == len(self):
                break
            for f in range(len(cursor._names)):
                idx = self.field_pos(cursor._names[f])
                sub_data = self._datas[idx][offset : offset+minibatch_size]
                cursor._values[f] = sub_data
            offset += len(sub_data) #can be less than minibatch_size at end
            yield cursor

        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
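
    # Illustrative usage (a sketch only, not part of the original file):
    #   x = numpy.random.rand(100,5)
    #   y = numpy.random.rand(100)
    #   ds = NArraysDataSet([x,y], ['input','target'])
    #   len(ds)   # -> 100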


class ArrayDataSet(ArrayFieldsDataSet):
    """
    An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
    whose first axis iterates over examples, second axis determines fields.
    If the underlying array is N-dimensional (has N axes), then the field
    values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
    """

    def __init__(self, data_array, fields_columns, **kwargs):
        """
        Construct an ArrayDataSet from the underlying numpy array (data) and
        a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
        using the standard arguments for indexing/slicing: integer for a column index,
        slice for an interval of columns (with possible stride), or iterable of column indices.
        """
        ArrayFieldsDataSet.__init__(self, **kwargs)
        self.data=data_array
        self.fields_columns=fields_columns

        # check consistency and complete slices definitions
        for fieldname, fieldcolumns in self.fields_columns.items():
            if type(fieldcolumns) is int:
                assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
                if 1:
                    #I changed this because it didn't make sense to me,
                    # and it made it more difficult to write my learner.
                    # If it breaks stuff, let's talk about it.
                    # - James 22/05/2008
                    self.fields_columns[fieldname]=[fieldcolumns]
                else:
                    self.fields_columns[fieldname]=fieldcolumns
            elif type(fieldcolumns) is slice:
                start,step=fieldcolumns.start,fieldcolumns.step
                if not start:
                    start=0
                if not step:
                    step=1
                self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step)
            elif hasattr(fieldcolumns,"__iter__"): # something like a list
                for i in fieldcolumns:
                    assert i>=0 and i<data_array.shape[1]

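    # Illustrative usage (a sketch only, not part of the original file):
    #   data = numpy.random.rand(100,4)
    #   ds = ArrayDataSet(data, {'input':slice(0,3), 'target':3})
    #   ds[0]['input']   # columns 0..2 of the first example
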
1240 def fieldNames(self): | |
1241 return self.fields_columns.keys() | |
1242 | |
1243 def __len__(self): | |
1244 return len(self.data) | |
1245 | |
    def __getitem__(self, key):
        """More efficient implementation than the default __getitem__"""
        fieldnames = self.fields_columns.keys()
        values = self.fields_columns.values()
        if type(key) is int or type(key) is slice:
            return Example(fieldnames,
                           [self.data[key, col] for col in values])
        if type(key) is list:
            # translate fieldnames in the key into their column specification,
            # without mutating the caller's list
            key = [self.fields_columns[k] if self.hasFields(k) else k
                   for k in key]
            return Example(fieldnames,
                           # list columns must be indexed in two steps:
                           # numpy's self.data[[i1,...],[i2,...]] selects
                           # elements pointwise rather than the cross-product
                           # of rows and columns
                           [self.data[key,:][:,col]
                            if isinstance(col, list) else
                            self.data[key, col] for col in values])

        # else check for a fieldname
        if self.hasFields(key):
            return self.data[:, self.fields_columns[key]]
        # else we are trying to access a property of the dataset
        assert key in self.__dict__  # else the property does not exist
        return self.__dict__[key]

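    # A sketch of the key types accepted above (not part of the original
    # file), reusing the hypothetical 'ds' built in the constructor example:
    #
    #   ds[0]          # int key: Example holding the fields of row 0
    #   ds[0:2]        # slice key: Example whose field values are 2-D arrays
    #   ds[[0, 1]]     # list key: rows 0 and 1, gathered field by field
    #   ds['target']   # fieldname key: the raw column(s) of that field
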
    def dontuse__iter__(self):
        class ArrayDataSetIteratorIter(object):
            def __init__(self, dataset, fieldnames):
                if fieldnames is None: fieldnames = dataset.fieldNames()
                # store the resulting minibatch in a lookup-list of values
                self.minibatch = Example(fieldnames, [0]*len(fieldnames))
                self.dataset = dataset
                self.current = 0
                self.columns = [self.dataset.fields_columns[f]
                                for f in self.minibatch._names]
                self.l = self.dataset.data.shape[0]
            def __iter__(self):
                return self
            def next(self):
                #@todo: we assume we only need to stop when minibatch_size == 1;
                # otherwise, MinibatchWrapAroundIterator does it.
                if self.current >= self.l:
                    raise StopIteration
                sub_data = self.dataset.data[self.current]
                self.minibatch._values = [sub_data[c] for c in self.columns]

                self.current += 1
                return self.minibatch

        return ArrayDataSetIteratorIter(self, self.fieldNames())

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
        cursor = Example(fieldnames, [0]*len(fieldnames))
        if n_batches is None:
            n_batches = (len(self) - offset) / minibatch_size
        for n in xrange(n_batches):
            if offset == len(self):
                break
            sub_data = self.data[offset : offset+minibatch_size]
            offset += len(sub_data)  # can be less than minibatch_size at the end
            cursor._values = [sub_data[:, self.fields_columns[f]] for f in cursor._names]
            yield cursor

        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)

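# A usage sketch (not part of the original file): iterating over minibatches
# of an ArrayDataSet; each yielded value is an Example (a LookupList) holding
# one sub-array per requested field. Reuses the hypothetical 'ds' from above.
#
#   for batch in ds.minibatches(['input', 'target'], minibatch_size=2):
#       x, y = batch['input'], batch['target']
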
class CachedDataSet(DataSet):
    """
    Wrap a L{DataSet} whose values are computationally expensive to obtain
    (e.g. because they involve some computation, or disk access),
    so that repeated accesses to the same example are cheap:
    every example value that has been accessed at least once is cached.

    Optionally, for finite-length datasets, all the values can be computed
    (and cached) upon construction of the CachedDataSet, rather than at
    first access.

    @todo: when cache_all_upon_construction, create mini-batches that are as
    large as possible but not so large as to fill up memory.

    @todo: add disk-buffering capability, so that when the cache becomes too
    big for memory, we cache things on disk, trying to keep in memory only
    the records most likely to be accessed next.
    """
    def __init__(self, source_dataset, cache_all_upon_construction=False):
        self.source_dataset = source_dataset
        self.cache_all_upon_construction = cache_all_upon_construction
        self.cached_examples = []
        if cache_all_upon_construction:
            # this potentially brings all the source examples
            # into memory at once, which may be too much;
            # the work could possibly be done by minibatches
            # that are as large as possible but no more than what memory allows.
            #
            # fields_values is supposed to be a DataSetFields, which inherits from LookupList
            #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
            fields_values = DataSetFields(source_dataset, None)
            assert all([len(self) == len(field_values) for field_values in fields_values])
            for example in fields_values.examples():
                self.cached_examples.append(copy.copy(example))

        self.fieldNames = source_dataset.fieldNames
        self.hasFields = source_dataset.hasFields
        self.valuesHStack = source_dataset.valuesHStack
        self.valuesVStack = source_dataset.valuesVStack

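    # A usage sketch (not part of the original file); 'expensive_ds' stands
    # for any hypothetical DataSet whose example values are costly to compute:
    #
    #   cached = CachedDataSet(expensive_ds)
    #   for example in cached: pass   # values computed and cached on this pass
    #   for example in cached: pass   # ...and read back from the cache here
    #
    #   # or pay the whole cost up front:
    #   eager = CachedDataSet(expensive_ds, cache_all_upon_construction=True)
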
    def __len__(self):
        return len(self.source_dataset)

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        class CacheIterator(object):
            def __init__(self, dataset):
                self.dataset = dataset
                self.current = offset
                self.all_fields = self.dataset.fieldNames() == fieldnames
                self.n_batches = n_batches
                self.batch_counter = 0
            def __iter__(self): return self
            def next(self):
                self.batch_counter += 1
                if self.n_batches and self.batch_counter > self.n_batches:
                    raise StopIteration()
                upper = self.current + minibatch_size
                if upper > len(self.dataset.source_dataset):
                    raise StopIteration()
                cache_len = len(self.dataset.cached_examples)
                if upper > cache_len:  # the whole minibatch is not already in cache
                    # cache everything from the current cache length up to upper
                    #for example in self.dataset.source_dataset[cache_len:upper]:
                    for example in self.dataset.source_dataset.subset[cache_len:upper]:
                        self.dataset.cached_examples.append(example)
                all_fields_minibatch = Example(self.dataset.fieldNames(),
                        zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))

                self.current += minibatch_size
                if self.all_fields:
                    return all_fields_minibatch
                return Example(fieldnames, [all_fields_minibatch[name] for name in fieldnames])
        return CacheIterator(self)

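    # A sketch (not part of the original file): taking minibatches fills the
    # cache incrementally, so only the examples visited so far get cached:
    #
    #   names = cached.fieldNames()
    #   for batch in cached.minibatches(names, minibatch_size=10, n_batches=3):
    #       pass   # afterwards, at most 30 examples sit in cached.cached_examples
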
    def dontuse__getitem__(self, i):
        if type(i) == int and len(self.cached_examples) > i:
            return self.cached_examples[i]
        else:
            return self.source_dataset[i]

    def __iter__(self):
        class CacheIteratorIter(object):
            def __init__(self, dataset):
                self.dataset = dataset
                self.l = len(dataset)
                self.current = 0
                self.fieldnames = self.dataset.fieldNames()
                self.example = Example(self.fieldnames, [0]*len(self.fieldnames))
            def __iter__(self): return self
            def next(self):
                if self.current >= self.l:
                    raise StopIteration
                cache_len = len(self.dataset.cached_examples)
                if self.current >= cache_len:  # this example is not already in cache
                    # cache the next example
                    self.dataset.cached_examples.append(
                        self.dataset.source_dataset[self.current])
                self.example._values = self.dataset.cached_examples[self.current]
                self.current += 1
                return self.example

        return CacheIteratorIter(self)

class ApplyFunctionDataSet(DataSet):
    """
    A L{DataSet} that contains as fields the results of applying a
    given function example-wise or minibatch-wise to all the fields of
    an input dataset. The output of the function should be an iterable
    (e.g. a list or a LookupList) over the resulting values.

    The function takes as input the fields of the dataset, not the examples.

    In minibatch mode, the function is expected to work on minibatches:
    it takes a minibatch as input and returns a minibatch as output. More
    precisely, each element of the input or output list
    should be iterable and indexable over the individual example values
    (typically these elements will be numpy arrays). All of the elements
    in the input and output lists should have the same length, which is
    the length of the minibatch.

    The function is applied each time an example or a minibatch is accessed.
    To avoid re-doing the computation, wrap this dataset inside a CachedDataSet.

    If the values_{h,v}stack functions are not provided, then
    the input_dataset.values{H,V}Stack functions are used by default.
    """

    def __init__(self, input_dataset, function, output_names, minibatch_mode=True,
                 values_hstack=None, values_vstack=None,
                 description=None, fieldtypes=None):
        """
        The constructor takes an input dataset that has as many fields as the
        function expects as inputs. The resulting dataset has as many fields
        as the function produces as outputs, which should correspond to the
        number of output names (provided in a list).

        Note that the expected semantics of the function differ in minibatch
        mode (it takes minibatches of inputs and produces minibatches of
        outputs, as documented in the class comment).

        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
        (for the new dataset created)?
        """
        self.input_dataset = input_dataset
        self.function = function
        self.output_names = output_names
        self.minibatch_mode = minibatch_mode
        DataSet.__init__(self, description, fieldtypes)
        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack

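    # A usage sketch (not part of the original file): 'ds1' is a hypothetical
    # dataset with a single field 'x'. In minibatch mode the function receives
    # one array per input field and must return one iterable per output field:
    #
    #   doubled = ApplyFunctionDataSet(ds1, lambda x: (2 * x,), ['twice_x'])
    #   for batch in doubled.minibatches(['twice_x'], minibatch_size=2):
    #       print batch['twice_x']
    #
    #   # with minibatch_mode=False the same function would instead be called
    #   # once per example, on individual field values
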
    def __len__(self):
        return len(self.input_dataset)

    def fieldNames(self):
        return self.output_names

    def minibatches_nowrap(self, fieldnames, *args, **kwargs):
        all_input_fieldNames = self.input_dataset.fieldNames()
        mbnw = self.input_dataset.minibatches_nowrap

        for input_fields in mbnw(all_input_fieldNames, *args, **kwargs):
            if self.minibatch_mode:
                all_output_fields = self.function(*input_fields)
            else:
                input_examples = zip(*input_fields)  # so that [i] means example i
                output_examples = [self.function(*input_example)
                                   for input_example in input_examples]
                all_output_fields = zip(*output_examples)

            all_outputs = Example(self.output_names, all_output_fields)
            if fieldnames == self.output_names:
                rval = all_outputs
            else:
                rval = Example(fieldnames, [all_outputs[name] for name in fieldnames])
            yield rval

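    # A sketch (not part of the original file) of the example-wise path above:
    # zip(*...) flips between one-list-per-field and one-tuple-per-example.
    #
    #   input_fields = [[x0, x1], [y0, y1]]           # one list per field
    #   zip(*input_fields)  ->  [(x0, y0), (x1, y1)]  # one tuple per example
    #   # the function maps each (x_i, y_i) to an output tuple, and a second
    #   # zip(*output_examples) turns those back into one list per output field
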
    def untested__iter__(self):  # only implemented for increased efficiency
        class ApplyFunctionSingleExampleIterator(object):
            def __init__(self, output_dataset):
                self.current = 0
                self.output_dataset = output_dataset
                self.input_iterator = output_dataset.input_dataset.__iter__()
            def __iter__(self): return self
            def next(self):
                if self.output_dataset.minibatch_mode:
                    # wrap each input value in a length-1 minibatch
                    function_inputs = [[input] for input in self.input_iterator.next()]
                    outputs = self.output_dataset.function(*function_inputs)
                    assert all([hasattr(output, '__iter__') for output in outputs])
                    function_outputs = [output[0] for output in outputs]
                else:
                    function_inputs = self.input_iterator.next()
                    function_outputs = self.output_dataset.function(*function_inputs)
                return Example(self.output_dataset.output_names, function_outputs)
        return ApplyFunctionSingleExampleIterator(self)

def supervised_learning_dataset(src_dataset, input_fields, target_fields, weight_field=None):
    """
    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
    by forcing the user to define a set of fields as the 'input' field
    and a set of fields as the 'target' field. Optionally, a single
    weight_field can also be defined.
    """
    args = ((input_fields, 'input'), (target_fields, 'target'))
    if weight_field: args += (([weight_field], 'weight'),)
    return src_dataset.merge_fields(*args)
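
# A usage sketch (not part of the original file): given a hypothetical
# dataset 'ds2' with fields 'x', 'y' and 'w', build a supervised-learning
# view with 'x' as input, 'y' as target and 'w' as the example weight:
#
#   sup = supervised_learning_dataset(ds2, ['x'], ['y'], weight_field='w')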