comparison pylearn/old_dataset/dataset.py @ 537:b054271b2504

new file structure layout, factories, etc.
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 21:57:54 -0500
parents dataset.py@fb62f0e4bcfe
children
1
2 from lookup_list import LookupList as Example
3 from common.misc import unique_elements_list_intersection
4 from string import join
5 from sys import maxint
6 import numpy, copy
7
8 from exceptions import *
9
10 class AttributesHolder(object):
11 def __init__(self): pass
12
13 def attributeNames(self):
14 raise AbstractFunction()
15
16 def setAttributes(self,attribute_names,attribute_values,make_copies=False):
17 """
18 Allow attribute_values to be a single value (rather than a list) when attribute_names has length 1.
19 """
20 if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ):
21 attribute_values = [attribute_values]
22 if make_copies:
23 for name,value in zip(attribute_names,attribute_values):
24 self.__setattr__(name,copy.deepcopy(value))
25 else:
26 for name,value in zip(attribute_names,attribute_values):
27 self.__setattr__(name,value)
28
29 def getAttributes(self,attribute_names=None, return_copy=False):
30 """
31 Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes.
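
For illustration only (a sketch; 'holder' stands for any subclass instance and 'name' is a made-up attribute)::
    holder.setAttributes(['name'], 'mnist')   # a single value is accepted when one name is given
    holder.getAttributes(['name'])            # -> ['mnist']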
32 """
33 if attribute_names is None:
34 attribute_names = self.attributeNames()
35 if return_copy:
36 return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
37 else:
38 return [self.__getattribute__(name) for name in attribute_names]
39
40 class DataSet(AttributesHolder):
41 """A virtual base class for datasets.
42
43 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
44 with learning algorithms (for training and testing them): rows/records are called examples, and
45 columns/attributes are called fields. The field value for a particular example can be an arbitrary
46 python object, which depends on the particular dataset.
47
48 We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
49 should return sys.maxint).
50
51 A DataSet is a generator of iterators; these iterators can run through the
52 examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
53 or known length, so this class can be used to interface to a 'stream' which
54 feeds on-line learning (however, as noted below, some operations are not
55 feasible or not recommended on streams).
56
57 To iterate over examples, there are several possibilities:
58 - for example in dataset:
59 - for val1,val2,... in dataset:
60 - for example in dataset(field1, field2,field3, ...):
61 - for val1,val2,val3 in dataset(field1, field2,field3):
62 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
63 - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
64 Each of these is documented below. All of these iterators are expected
65 to provide, in addition to the usual 'next()' method, a 'next_index()' method
66 which returns a non-negative integer pointing to the position of the next
67 example that will be returned by 'next()' (or of the first example in the
68 next minibatch returned). This is important because these iterators
69 can wrap around the dataset in order to do multiple passes through it,
70 in possibly irregular ways if the minibatch size is not a divisor of the
71 dataset length.
72
73 To iterate over fields, one can do
74 - for field in dataset.fields():
75 for field_value in field: # iterate over the values associated to that field for all the dataset examples
76 - for field in dataset(field1,field2,...).fields() to select a subset of fields
77 - for field in dataset.fields(field1,field2,...) to select a subset of fields
78 and each of these fields is iterable over the examples:
79 - for field_examples in dataset.fields():
80 for example_value in field_examples:
81 ...
82 but when the dataset is a stream (unbounded length), it is not recommended to do
83 such things because the underlying dataset may refuse to access the different fields in
84 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
85 The result of fields() is a L{DataSetFields} object, which iterates over fields,
86 and whose elements are iterable over examples. A DataSetFields object can
87 be turned back into a DataSet with its examples() method::
88 dataset2 = dataset1.fields().examples()
89 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
90
91 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
92
93 Note: The content of a field can be of any type. Field values can also be 'missing'
94 (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
95 fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value.
96 For non-numeric values, None plays the role of a missing value.
97
98 Dataset elements can be indexed and sub-datasets (with a subset
99 of examples) can be extracted. These operations are not supported
100 by default in the case of streams.
101
102 - dataset[:n] returns an Example with the n first examples.
103
104 - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s.
105
106 - dataset[i] returns an Example.
107
108 - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in.
109
110 A similar syntax gives you a DataSet instead of an Example:
111
112 - dataset.subset[:n] returns a DataSet with the n first examples.
113
114 - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s.
115
116 - dataset.subset[i] returns a DataSet.
117
118 - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in.
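
As an illustration only (a sketch, assuming an ArrayDataSet as defined further below, with made-up field names 'x' and 'y')::
    import numpy
    ds = ArrayDataSet(numpy.random.rand(5, 3), {'x': slice(0, 2), 'y': 2})
    ex = ds[0]             # a single Example
    some = ds[[0, 2, 4]]   # an Example stacking examples 0, 2 and 4
    sub = ds.subset[:3]    # a DataSet with the 3 first examples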
119
120
121 - dataset.<property> returns the value of a property associated with
122 the name <property>. The following properties should be supported:
123 - 'description': a textual description or name for the dataset
124 - 'fieldtypes': a list of types (one per field)
125 A DataSet may have other attributes that it makes visible to other objects. These are
126 used to store information that is not example-wise but global to the dataset.
127 The list of names of these attributes is given by the attributeNames() method.
128
129 Datasets can be concatenated either vertically (increasing the length) or
130 horizontally (augmenting the set of fields), if they are compatible, using
131 the following operations (with the same basic semantics as numpy.hstack
132 and numpy.vstack):
133
134 - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
135
136 creates a new dataset whose list of fields is the concatenation of the list of
137 fields of the argument datasets. This only works if they all have the same length.
138
139 - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
140
141 creates a new dataset that concatenates the examples from the argument datasets
142 (and whose length is the sum of the length of the argument datasets). This only
143 works if they all have the same fields.
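
For instance (a sketch; the dataset names below are hypothetical)::
    wider  = ds_inputs | ds_targets   # same length, concatenated fields
    longer = ds_train & ds_extra      # same fields, concatenated examples
    assert len(longer) == len(ds_train) + len(ds_extra)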
144
145 According to the same logic, and viewing a DataSetFields object associated to
146 a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
147 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
148 examples.
149
150 A dataset can hold arbitrary key-value pairs that may be used to access meta-data
151 or other properties of the dataset or associated with the dataset or the result
152 of a computation stored in a dataset. These can be accessed through the [key] syntax
153 when key is a string (or more specifically, neither an integer, a slice, nor a list).
154
155 A DataSet sub-class should always redefine the following methods:
156 - __len__ if it is not a stream
157 - fieldNames
158 - minibatches_nowrap (called by DataSet.minibatches())
159 For efficiency of implementation, a sub-class might also want to redefine
160 - valuesHStack
161 - valuesVStack
162 - hasFields
163 - __getitem__ may not be feasible with some streams
164 - __iter__
165 A sub-class should also append attributes to self._attribute_names
166 (the default value returned by attributeNames()).
167 By convention, attributes not in attributeNames() should have a name
168 starting with an underscore.
169 @todo enforce/test that convention!
170 """
171
172 numpy_vstack = lambda fieldname,values: numpy.vstack(values)
173 numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
174
175 def __init__(self, description=None, fieldnames=None, fieldtypes=None):
176 """
177 @type fieldnames: list of strings
178 @type fieldtypes: list of python types, same length as fieldnames
179 @type description: string
180 @param description: description/name for this dataset
181 """
182 def default_desc():
183 return type(self).__name__ \
184 + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
185
186 #self.fieldnames = fieldnames
187
188 self.fieldtypes = fieldtypes if fieldtypes is not None \
189 else [None]*1 #len(fieldnames)
190
191 self.description = default_desc() if description is None \
192 else description
193 self._attribute_names = ["description"]
194
195
196 attributeNames = lambda self: copy.copy(self._attribute_names) # a method (not a property): callers invoke self.attributeNames()
197
198 def __contains__(self, fieldname):
199 return (fieldname in self.fieldNames()) \
200 or (fieldname in self.attributeNames())
201
202 def __iter__(self):
203 """Supports the syntax "for i in dataset: ..."
204
205 Using this syntax, "i" will be an Example instance (or equivalent) with
206 all the fields of DataSet self. Every field of "i" will give access to
207 a field of a single example. Fields should be accessible via
208 i["fielname"] or i[3] (in the order defined by the elements of the
209 Example returned by this iterator), but the derived class is free
210 to accept any type of identifier, and add extra functionality to the iterator.
211
212 The default implementation calls the minibatches iterator and extracts the first example of each field.
213 """
214 return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
215
216 def __len__(self):
217 """
218 len(dataset) returns the number of examples in the dataset.
219 By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
220 Sub-classes which implement finite-length datasets should redefine this method.
221 Some methods only make sense for finite-length datasets.
222 """
223 from sys import maxint
224 return maxint
225
226
227 class MinibatchToSingleExampleIterator(object):
228 """
229 Converts the result of minibatch iterator with minibatch_size==1 into
230 single-example values in the result. Therefore the result of
231 iterating on the dataset itself gives a sequence of single examples
232 (whereas the result of iterating over minibatches gives in each
233 Example field an iterable object over the individual examples in
234 the minibatch).
235 """
236 def __init__(self, minibatch_iterator):
237 self.minibatch_iterator = minibatch_iterator
238 self.minibatch = None
239 def __iter__(self): #makes for loop work
240 return self
241 def next(self):
242 size1_minibatch = self.minibatch_iterator.next()
243 if not self.minibatch:
244 names = size1_minibatch.keys()
245 # next lines are a hack, but there was a problem when we were getting [array(327)] for instance
246 try:
247 values = [value[0] for value in size1_minibatch.values()]
248 except :
249 values = [value for value in size1_minibatch.values()]
250 self.minibatch = Example(names,values)
251 else:
252 self.minibatch._values = [value[0] for value in size1_minibatch.values()]
253 return self.minibatch
254
255 def next_index(self):
256 return self.minibatch_iterator.next_index()
257
258 class MinibatchWrapAroundIterator(object):
259 """
260 An iterator for minibatches that handles the case where we need to wrap around the
261 dataset because n_batches*minibatch_size > len(dataset). It is constructed from
262 a dataset that provides a minibatch iterator that does not need to handle that problem.
263 This class is a utility for dataset subclass writers, so that they do not have to handle
264 this issue multiple times, nor check that fieldnames are valid, nor handle the
265 empty fieldnames (meaning 'use all the fields').
266 """
267 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
268 self.dataset=dataset
269 self.fieldnames=fieldnames
270 self.minibatch_size=minibatch_size
271 self.n_batches=n_batches
272 self.n_batches_done=0
273 self.next_row=offset
274 self.L=len(dataset)
275 self.offset=offset % self.L
276 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
277 if n_batches is not None:
278 ds_nbatches = min(n_batches,ds_nbatches)
279 if fieldnames:
280 assert dataset.hasFields(*fieldnames)
281 else:
282 self.fieldnames=dataset.fieldNames()
283 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
284
285 def __iter__(self):
286 return self
287
288 def next_index(self):
289 return self.next_row
290
291 def next(self):
292 if self.n_batches and self.n_batches_done==self.n_batches:
293 raise StopIteration
294 elif not self.n_batches and self.next_row ==self.L:
295 raise StopIteration
296 upper = self.next_row+self.minibatch_size
297 if upper <=self.L:
298 minibatch = self.iterator.next()
299 else:
300 if not self.n_batches:
301 upper=min(upper, self.L)
302 # if there is not a fixed number of batches, we continue to the end of the dataset.
303 # this can create a minibatch that is smaller than the minibatch_size
304 assert (self.L-self.next_row)<=self.minibatch_size
305 minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
306 else:
307 # we must concatenate (vstack) the bottom and top parts of our minibatch
308 # first get the beginning of our minibatch (top of dataset)
309 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
310 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
311 minibatch = Example(self.fieldnames,
312 [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
313 for name in self.fieldnames])
314 self.next_row=upper
315 self.n_batches_done+=1
316 if upper >= self.L and self.n_batches:
317 self.next_row -= self.L
318 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
319 if self.n_batches is not None:
320 ds_nbatches = min(self.n_batches,ds_nbatches)
321 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
322 ds_nbatches,self.next_row)
323 return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
324 self.dataset.valuesHStack),
325 minibatch.keys())
326
327
328 minibatches_fieldnames = None
329 minibatches_minibatch_size = 1
330 minibatches_n_batches = None
331 def minibatches(self,
332 fieldnames = minibatches_fieldnames,
333 minibatch_size = minibatches_minibatch_size,
334 n_batches = minibatches_n_batches,
335 offset = 0):
336 """
337 Return an iterator that supports three forms of syntax:
338
339 for i in dataset.minibatches(None,**kwargs): ...
340
341 for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
342
343 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
344
345 Using the first two syntaxes, "i" will be an indexable object, such as a list,
346 tuple, or Example instance. In both cases, i[k] is a list-like container
347 of a batch of current examples. In the second case, i[0] is
348 a list-like container of the f1 field of a batch of current examples, i[1] is
349 a list-like container of the f2 field, etc.
350
351 Using the first syntax, all the fields will be returned in "i".
352 Using the third syntax, i1, i2, i3 will be list-like containers of the
353 f1, f2, and f3 fields of a batch of examples on each loop iteration.
354
355 The minibatches iterator is expected to return upon each call to next()
356 a DataSetFields object, which is an Example (indexed by the field names) whose
357 elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
358 a sub-dataset that can be used to iterate over the individual examples
359 in the minibatch. Hence a minibatch can be converted back to a regular
360 dataset or its fields can be looked at individually (and possibly iterated over).
361
362 PARAMETERS
363 - fieldnames (list of any type, default None):
364 The loop variables i1, i2, i3 (in the example above) should contain the
365 f1, f2, and f3 fields of the current batch of examples. If None, the
366 derived class can choose a default, e.g. all fields.
367
368 - minibatch_size (integer, default 1)
369 On every iteration, the variables i1, i2, i3 will have
370 exactly minibatch_size elements. e.g. len(i1) == minibatch_size
371
372 @DEPRECATED n_batches : not used anywhere
373 - n_batches (integer, default None)
374 The iterator will loop exactly this many times, and then stop. If None,
375 the derived class can choose a default. If (-1), then the returned
376 iterator should support looping indefinitely.
377
378 - offset (integer, default 0)
379 The iterator will start at example 'offset' in the dataset, rather than the default.
380
381 Note: A list-like container is something like a tuple, list, numpy.ndarray or
382 any other object that supports integer indexing and slicing.
383
384 @ATTENTION: minibatches now delegates to minibatches_nowrap, which is supposed to return complete
385 batches only and to raise StopIteration otherwise.
386 @ATTENTION: minibatches returns a LookupList, so we can't iterate over examples on it.
387
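A minimal usage sketch (the field names 'x' and 'y' and the function process() are made up for illustration)::
    for x_batch, y_batch in dataset.minibatches(['x', 'y'], minibatch_size=10):
        process(x_batch, y_batch)   # each batch is a list-like container of 10 values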
388 """
389 #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset)
390 assert offset >= 0
391 assert offset < len(self)
392 assert offset + minibatch_size -1 < len(self)
393 if fieldnames is None:
394 fieldnames = self.fieldNames()
395 return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
396
397 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
398 """
399 This is the minibatches iterator generator that sub-classes must define.
400 It does not need to worry about wrapping around multiple times across the dataset,
401 as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called.
402 The next() method of the returned iterator does not even need to worry about
403 the termination condition (as StopIteration will be raised by DataSet.minibatches
404 before an improper call to minibatches_nowrap's next() is made).
405 That next() method can assert that its next row will always be within [0,len(dataset)).
406 The iterator returned by minibatches_nowrap does not need to implement
407 a next_index() method either, as this will be provided by MinibatchWrapAroundIterator.
408 """
409 raise AbstractFunction()
410
411 def is_unbounded(self):
412 """
413 Tests whether a dataset is unbounded (e.g. a stream).
414 """
415 return len(self)==maxint
416
417 def hasFields(self,*fieldnames):
418 """
419 Return true if the given field name (or field names, if multiple arguments are
420 given) is recognized by the DataSet (i.e. can be used as a field name in one
421 of the iterators).
422
423 The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames()
424 method. Many datasets may store their field names in a dictionary, which would allow more efficiency.
425 """
426 return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))==len(fieldnames) # all requested names must be present
427
428 def fieldNames(self):
429 """
430 Return the list of field names that are supported by the iterators,
431 and for which hasFields(fieldname) would return True.
432 """
433 raise AbstractFunction()
434
435 def __call__(self,*fieldnames):
436 """
437 Return a dataset that sees only the fields whose names are specified.
438 """
439 assert self.hasFields(*fieldnames)
440 #return self.fields(*fieldnames).examples()
441 fieldnames_list = list(fieldnames)
442 return FieldsSubsetDataSet(self,fieldnames_list)
443
444 def cached_fields_subset(self,*fieldnames) :
445 """
446 Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached.
447 @see : dataset.__call__
448 """
449 assert self.hasFields(*fieldnames)
450 return self.fields(*fieldnames).examples()
451
452 def fields(self,*fieldnames):
453 """
454 Return a DataSetFields object associated with this dataset.
455 """
456 return DataSetFields(self,fieldnames)
457
458 def getitem_key(self, fieldname):
459 """A not-so-well thought-out place to put code that used to be in
460 getitem.
461 """
462 #removing as per discussion June 4. --JSB
463
464 i = fieldname
465 # else check for a fieldname
466 if self.hasFields(i):
467 return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
468 # else we are trying to access a property of the dataset
469 assert i in self.__dict__ # else it means we are trying to access a non-existing property
470 return self.__dict__[i]
471
472 def __getitem__(self,i):
473 """
474 @rtype: Example
475 @returns: single or multiple examples
476
477 @type i: integer or slice or <iterable> of integers
478 @param i:
479 dataset[i] returns the (i+1)-th example of the dataset.
480 dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
481 dataset[i:j:s] returns a LookupList with examples i,i+s,i+2*s,... up to (but excluding) j.
482 dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.
483
484 @note:
485 Some stream datasets may be unable to implement random access, i.e.
486 arbitrary slicing/indexing because they can only iterate through
487 examples one at a time (or one minibatch at a time) and do not actually store or keep
488 past (or future) examples.
489
490 The default implementation of getitem uses the minibatches iterator
491 to obtain one example, one slice, or a list of examples. It may not
492 always be the most efficient way to obtain the result, especially if
493 the data are actually stored in a memory array.
494 """
495
496 if type(i) is int:
497 assert i >= 0 # TBM: see if someone complains and wants negative i
498 if i >= len(self) :
499 raise IndexError
500 i_batch = self.minibatches_nowrap(self.fieldNames(),
501 minibatch_size=1, n_batches=1, offset=i)
502 return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
503
504 #if i is a contiguous slice
505 if type(i) is slice and (i.step in (None, 1)):
506 offset = 0 if i.start is None else i.start
507 upper_bound = len(self) if i.stop is None else i.stop
508 upper_bound = min(len(self) , upper_bound)
509 #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
510 # minibatch_size=upper_bound - offset,
511 # n_batches=1,
512 # offset=offset).next())
513 # now returns a LookupList
514 return self.minibatches_nowrap(self.fieldNames(),
515 minibatch_size=upper_bound - offset,
516 n_batches=1,
517 offset=offset).next()
518
519 # if slice has a step param, convert it to list and handle it with the
520 # list code
521 if type(i) is slice:
522 offset = 0 if i.start is None else i.start
523 upper_bound = len(self) if i.stop is None else i.stop
524 upper_bound = min(len(self) , upper_bound)
525 i = list(range(offset, upper_bound, i.step))
526
527 # handle tuples, arrays, lists
528 if hasattr(i, '__getitem__'):
529 for idx in i:
530 #dis-allow nested slices
531 if not isinstance(idx, int):
532 raise TypeError(idx)
533 if idx >= len(self) :
534 raise IndexError
535 # call back into self.__getitem__
536 examples = [self.minibatches_nowrap(self.fieldNames(),
537 minibatch_size=1, n_batches=1, offset=ii).next()
538 for ii in i]
539 # re-index the fields in each example by field instead of by example
540 field_values = [[] for blah in self.fieldNames()]
541 for e in examples:
542 for f,v in zip(field_values, e):
543 f.append(v)
544 # build them into a LookupList (a.k.a. Example)
545 zz = zip(self.fieldNames(),field_values)
546 vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
547 example = Example(self.fieldNames(), vst)
548 #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
549 # now returns a LookupList
550 return example
551
552 # what in the world is i?
553 raise TypeError(i, type(i))
554
555
556 """
557 Enables the call dataset.subset[a:b:c] that will return a DataSet
558 around the examples returned by __getitem__(slice(a,b,c))
559
560 @SEE DataSet.__getsubset(self)
561 """
562 subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet")
563
564
565 def __getsubset(self) :
566 """
567 Enables the call data.subset[a:b:c], returns a DataSet.
568 Default implementation is a simple wrap around __getitem__() using MinibatchDataSet.
569
570 @RETURN DataSet
571 @SEE DataSet.subset = property(lambda s : s.__getsubset())
572 """
573 _self = self
574 class GetSliceReturnsDataSet(object) :
575 def __getitem__(self,slice) :
576 return MinibatchDataSet(_self.__getitem__(slice))
577 return GetSliceReturnsDataSet()
578
579
580
581 def valuesHStack(self,fieldnames,fieldvalues):
582 """
583 Return a value that corresponds to concatenating (horizontally) several field values.
584 This can be useful to merge some fields. The implementation of this operation is likely
585 to involve a copy of the original values. When the values are numpy arrays, the
586 result should be numpy.hstack(values). If it makes sense, this operation should
587 work as well when each value corresponds to multiple examples in a minibatch
588 e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
589 then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values).
590 The default is to use numpy.hstack for numpy.ndarray values, and a list
591 pointing to the original values for other data types.
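
For example (a sketch using numpy arrays)::
    v = dataset.valuesHStack(['x', 'y'], [numpy.ones((4, 2)), numpy.zeros((4, 3))])
    # v is a 4x5 array, as with numpy.hstack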
592 """
593 all_numpy=True
594 for value in fieldvalues:
595 if not type(value) is numpy.ndarray:
596 all_numpy=False
597 if all_numpy:
598 return numpy.hstack(fieldvalues)
599 # the default implementation of horizontal stacking is to put values in a list
600 return fieldvalues
601
602 def valuesVStack(self,fieldname,values):
603 """
604 @param fieldname: the name of the field from which the values were taken
605 @type fieldname: any type
606
607 @param values: the per-minibatch field values to be stacked (e.g. pieces taken near the beginning and end of the dataset when wrapping around)
608 @type values: list of minibatches (returned by minibatches_nowrap)
609
610 @return: the concatenation (stacking) of the values
611 @rtype: something suitable as a minibatch field
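
For example, the default implementation simply chains the minibatch pieces (array-backed datasets typically override this with numpy.vstack)::
    dataset.valuesVStack('x', [[1, 2], [3]])   # -> [1, 2, 3]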
612 """
613 rval = []
614 for v in values:
615 rval.extend(v)
616 return rval
617
618 def __or__(self,other):
619 """
620 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
621 fields of the argument datasets. This only works if they all have the same length.
622 """
623 return HStackedDataSet([self,other])
624
625 def __and__(self,other):
626 """
627 dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
628 (and whose length is the sum of the length of the argument datasets). This only
629 works if they all have the same fields.
630 """
631 return VStackedDataSet([self,other])
632
633 def hstack(datasets):
634 """
635 hstack([dataset1,dataset2,...]) returns dataset1 | dataset2 | ...
636 which is a dataset whose fields list is the concatenation of the fields
637 of the individual datasets.
638 """
639 assert len(datasets)>0
640 if len(datasets)==1:
641 return datasets[0]
642 return HStackedDataSet(datasets)
643
644 def vstack(datasets):
645 """
646 vstack([dataset1,dataset2,...]) returns dataset1 & dataset2 & ...
647 which is a dataset which iterates first over the examples of dataset1, then
648 over those of dataset2, etc.
649 """
650 assert len(datasets)>0
651 if len(datasets)==1:
652 return datasets[0]
653 return VStackedDataSet(datasets)
654
655 class FieldsSubsetDataSet(DataSet):
656 """
657 A sub-class of L{DataSet} that selects a subset of the fields.
658 """
659 def __init__(self,src,fieldnames):
660 self.src=src
661 self.fieldnames=fieldnames
662 assert src.hasFields(*fieldnames)
663 self.valuesHStack = src.valuesHStack
664 self.valuesVStack = src.valuesVStack
665
666 def __len__(self): return len(self.src)
667
668 def fieldNames(self):
669 return self.fieldnames
670
671 def __iter__(self):
672 class FieldsSubsetIterator(object):
673 def __init__(self,ds):
674 self.ds=ds
675 self.src_iter=ds.src.__iter__()
676 self.example=None
677 def __iter__(self): return self
678 def next(self):
679 complete_example = self.src_iter.next()
680 if self.example:
681 self.example._values=[complete_example[field]
682 for field in self.ds.fieldnames]
683 else:
684 self.example=Example(self.ds.fieldnames,
685 [complete_example[field] for field in self.ds.fieldnames])
686 return self.example
687 return FieldsSubsetIterator(self)
688
689 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
690 assert self.hasFields(*fieldnames)
691 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
692 def dontuse__getitem__(self,i):
693 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
694
695 class RenamedFieldsDataSet(DataSet):
696 """
697 A sub-class of L{DataSet} that selects and renames a subset of the fields.
698 """
699 def __init__(self,src,src_fieldnames,new_fieldnames):
700 self.src=src
701 self.src_fieldnames=src_fieldnames
702 self.new_fieldnames=new_fieldnames
703 assert src.hasFields(*src_fieldnames)
704 assert len(src_fieldnames)==len(new_fieldnames)
705 self.valuesHStack = src.valuesHStack
706 self.valuesVStack = src.valuesVStack
707 self.lookup_fields = Example(new_fieldnames,src_fieldnames)
708
709 def __len__(self): return len(self.src)
710
711 def fieldNames(self):
712 return self.new_fieldnames
713
714 def __iter__(self):
715 class FieldsSubsetIterator(object):
716 def __init__(self,ds):
717 self.ds=ds
718 self.src_iter=ds.src.__iter__()
719 self.example=None
720 def __iter__(self): return self
721 def next(self):
722 complete_example = self.src_iter.next()
723 if self.example:
724 self.example._values=[complete_example[field]
725 for field in self.ds.src_fieldnames]
726 else:
727 self.example=Example(self.ds.new_fieldnames,
728 [complete_example[field]
729 for field in self.ds.src_fieldnames])
730 return self.example
731 return FieldsSubsetIterator(self)
732
733 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
734 assert self.hasFields(*fieldnames)
735 cursor = Example(fieldnames,[0]*len(fieldnames))
736 for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset):
737 cursor._values=batch._values
738 yield cursor
739
740 def __getitem__(self,i):
741 # return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
742 complete_example = self.src[i]
743 return Example(self.new_fieldnames,
744 [complete_example[field]
745 for field in self.src_fieldnames])
746
747
748
749 class DataSetFields(Example):
750 """
751 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
752 DataSetFields iterates over fields (like columns of a matrix), and can be understood
753 as a transpose of the associated dataset.
754
755 To iterate over fields, one can do
756 * for fields in dataset.fields()
757 * for fields in dataset(field1,field2,...).fields() to select a subset of fields
758 * for fields in dataset.fields(field1,field2,...) to select a subset of fields
759 and each of these fields is iterable over the examples:
760 * for field_examples in dataset.fields():
761 for example_value in field_examples:
762 ...
763 but when the dataset is a stream (unbounded length), it is not recommended to do
764 such things because the underlying dataset may refuse to access the different fields in
765 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
766 The result of fields() is a DataSetFields object, which iterates over fields,
767 and whose elements are iterable over examples. A DataSetFields object can
768 be turned back into a DataSet with its examples() method:
769 dataset2 = dataset1.fields().examples()
770 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
771
772 DataSetFields can be concatenated vertically or horizontally. Since a DataSetFields is a
773 kind of transpose of its DataSet, the | operator concatenates the examples and the &
774 operator concatenates the fields.
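
A brief sketch (the field name 'x' is hypothetical)::
    fields = dataset.fields('x')
    for column in fields:        # iterate over the selected fields
        for value in column:     # iterate over that field's values, one per example
            pass
    dataset_again = fields.examples()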
775 """
776 def __init__(self,dataset,fieldnames):
777 original_dataset=dataset
778 if not fieldnames:
779 fieldnames=dataset.fieldNames()
780 elif not list(fieldnames)==list(dataset.fieldNames()):
781 # we must cast to list, otherwise ('x','y') != ['x','y']
782 dataset = FieldsSubsetDataSet(dataset,fieldnames)
783 assert dataset.hasFields(*fieldnames)
784 self.dataset=dataset
785
786 if isinstance(dataset,MinibatchDataSet):
787 Example.__init__(self,fieldnames,list(dataset._fields))
788 elif isinstance(original_dataset,MinibatchDataSet):
789 Example.__init__(self,fieldnames,
790 [original_dataset._fields[field]
791 for field in fieldnames])
792 else:
793 minibatch_iterator = dataset.minibatches(fieldnames,
794 minibatch_size=len(dataset),
795 n_batches=1)
796 minibatch=minibatch_iterator.next()
797 Example.__init__(self,fieldnames,minibatch)
798
799 def examples(self):
800 return self.dataset
801
802 def __or__(self,other):
803 """
804 fields1 | fields2 is a DataSetFields whose list of examples is the concatenation
805 of the list of examples of DataSetFields fields1 and fields2.
806 """
807 return (self.examples() & other.examples()).fields() # & (vstack) concatenates the examples
808
809 def __and__(self,other):
810 """
811 fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
812 of the fields of DataSetFields fields1 and fields2.
813 """
814 return (self.examples() | other.examples()).fields()
815
816
817 class MinibatchDataSet(DataSet):
818 """
819 Turn an L{Example} of same-length (iterable) fields into an example-iterable dataset.
820 Each element of the lookup-list should be an iterable and sliceable, all of the same length.
821 """
822 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
823 values_hstack=DataSet().valuesHStack):
824 """
825 The user can (and generally should) also provide values_vstack(fieldname,fieldvalues)
826 and values_hstack(fieldnames,fieldvalues) functions behaving with the same
827 semantics as the DataSet methods of the same name (but without the self argument).
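
A construction sketch (made-up field names and values)::
    fields = Example(['x', 'y'], [[1, 2, 3], [4, 5, 6]])
    ds = MinibatchDataSet(fields)
    assert len(ds) == 3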
828 """
829
830 self._fields=fields_lookuplist
831 assert len(fields_lookuplist)>0
832 self.length=len(fields_lookuplist[0])
833 for field in fields_lookuplist[1:]:
834 if self.length != len(field) :
835 print 'self.length = ',self.length
836 print 'len(field) = ', len(field)
837 print 'self._fields.keys() = ', self._fields.keys()
838 print 'field=',field
839 print 'fields_lookuplist=', fields_lookuplist
840 assert self.length==len(field)
841 self.valuesVStack=values_vstack
842 self.valuesHStack=values_hstack
843
844 def __len__(self):
845 return self.length
846
847 def dontuse__getitem__(self,i):
848 if type(i) in (slice,list):
849 return DataSetFields(MinibatchDataSet(
850 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
851 if type(i) is int:
852 return Example(self._fields.keys(),[field[i] for field in self._fields])
853 if self.hasFields(i):
854 return self._fields[i]
855 assert i in self.__dict__ # else it means we are trying to access a non-existing property
856 return self.__dict__[i]
857
858 def fieldNames(self):
859 return self._fields.keys()
860
861 def hasFields(self,*fieldnames):
862 for fieldname in fieldnames:
863 if fieldname not in self._fields.keys():
864 return False
865 return True
866
867 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
868 #@TODO bug somewhere here, fieldnames doesn't seem to be well handled
869 class Iterator(object):
870 def __init__(self,ds,fieldnames):
871 # tbm: added two next lines to handle fieldnames
872 if fieldnames is None: fieldnames = ds._fields.keys()
873 self.fieldnames = fieldnames
874
875 self.ds=ds
876 self.next_example=offset
877 assert minibatch_size >= 0
878 if offset+minibatch_size > ds.length:
879 raise NotImplementedError()
880 def __iter__(self):
881 return self
882 def next(self):
883 upper = self.next_example+minibatch_size
884 if upper > len(self.ds) :
885 raise StopIteration()
886 assert upper<=len(self.ds) # instead of self.ds.length
887 #minibatch = Example(self.ds._fields.keys(),
888 # [field[self.next_example:upper]
889 # for field in self.ds._fields])
890 # tbm: modif to use fieldnames
891 values = []
892 for f in self.fieldnames :
893 #print 'we have field',f,'in fieldnames'
894 values.append( self.ds._fields[f][self.next_example:upper] )
895 minibatch = Example(self.fieldnames,values)
896 #print minibatch
897 self.next_example+=minibatch_size
898 return minibatch
899
900 # tbm: added fieldnames to handle subset of fieldnames
901 return Iterator(self,fieldnames)
902
903 class HStackedDataSet(DataSet):
904 """
905 A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
906 i.e. whose list of fields is the concatenation of their lists of fields.
907
908 If a field name is found in more than one of the datasets, then either an error is
909 raised or the fields are renamed (either by prefixing the __name__ attribute
910 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
911
912 @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
913 """
914 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
915 DataSet.__init__(self,description,field_types)
916 self.datasets=datasets
917 self.accept_nonunique_names=accept_nonunique_names
918 self.fieldname2dataset={}
919
920 def rename_field(fieldname,dataset,i):
921 if hasattr(dataset,"__name__"):
922 return dataset.__name__ + "." + fieldname
923 return fieldname+"."+str(i)
924
925 # make sure all datasets have the same length and unique field names
926 self.length=None
927 names_to_change=[]
928 for i in xrange(len(datasets)):
929 dataset = datasets[i]
930 length=len(dataset)
931 if self.length:
932 assert self.length==length
933 else:
934 self.length=length
935 for fieldname in dataset.fieldNames():
936 if fieldname in self.fieldname2dataset: # name conflict!
937 if accept_nonunique_names:
938 fieldname=rename_field(fieldname,dataset,i)
939 names_to_change.append((fieldname,i))
940 else:
941 raise ValueError("Incompatible datasets: non-unique field name = "+fieldname)
942 self.fieldname2dataset[fieldname]=i
943 for fieldname,i in names_to_change:
944 del self.fieldname2dataset[fieldname]
945 self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i
946
947 def __len__(self):
948 return len(self.datasets[0])
949
950 def hasFields(self,*fieldnames):
951 for fieldname in fieldnames:
952 if not fieldname in self.fieldname2dataset:
953 return False
954 return True
955
956 def fieldNames(self):
957 return self.fieldname2dataset.keys()
958
959 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
960
961 class HStackedIterator(object):
962 def __init__(self,hsds,iterators):
963 self.hsds=hsds
964 self.iterators=iterators
965 def __iter__(self):
966 return self
967 def next(self):
968 # concatenate all the fields of the minibatches
969 l=Example()
970 for iter in self.iterators:
971 l.append_lookuplist(iter.next())
972 return l
973
974 assert self.hasFields(*fieldnames)
975 # find out which underlying datasets are necessary to service the required fields
976 # and construct corresponding minibatch iterators
977 if fieldnames and fieldnames!=self.fieldNames():
978 datasets=set([])
979 fields_in_dataset={} # filled below, once we know which datasets are involved
980 for fieldname in fieldnames:
981 dataset=self.datasets[self.fieldname2dataset[fieldname]]
982 datasets.add(dataset)
983 fields_in_dataset.setdefault(dataset,[]).append(fieldname)
984 datasets=list(datasets)
985 iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset)
986 for dataset in datasets]
987 else:
988 datasets=self.datasets
989 iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets]
990 return HStackedIterator(self,iterators)
991
992
993 def untested_valuesVStack(self,fieldname,fieldvalues):
994 return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)
995
996 def untested_valuesHStack(self,fieldnames,fieldvalues):
997 """
998 We will use the sub-dataset associated with the first fieldname in the fieldnames list
999 to do the work, hoping that it can cope with the other values (i.e. won't care
1000 about the incompatible fieldnames). Hence this heuristic will always work if
1001 all the fieldnames are of the same sub-dataset.
1002 """
1003 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)
1004
1005 class VStackedDataSet(DataSet):
1006 """
1007 A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
1008 in the order provided. This clearly assumes that they all have the same field names
1009 and all (except possibly the last one) are of finite length.
1010
1011 @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
1012 """
1013 def __init__(self,datasets):
1014 self.datasets=datasets
1015 self.length=0
1016 self.index2dataset={}
1017 assert len(datasets)>0
1018 fieldnames = datasets[-1].fieldNames()
1019 self.datasets_start_row=[]
1020 # We use this map from row index to dataset index for constant-time random access of examples,
1021 # to avoid having to search for the appropriate dataset each time an example or slice is asked for.
1022 for k,dataset in enumerate(datasets[0:-1]): # enumerate yields (index, dataset)
1023 assert not dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length).
1024 L=len(dataset)
1025 for i in xrange(L):
1026 self.index2dataset[self.length+i]=k
1027 self.datasets_start_row.append(self.length)
1028 self.length+=L
1029 assert dataset.fieldNames()==fieldnames
1030 self.datasets_start_row.append(self.length)
1031 self.length+=len(datasets[-1])
1032 # If length is very large, we should use a more memory-efficient mechanism
1033 # that does not store all indices
1034 if self.length>1000000:
1035 # 1 million entries would require about 60 meg for the index2dataset map
1036 # TODO
1037 print "A more efficient mechanism for index2dataset should be implemented"
1038
1039 def __len__(self):
1040 return self.length
1041
1042 def fieldNames(self):
1043 return self.datasets[0].fieldNames()
1044
1045 def hasFields(self,*fieldnames):
1046 return self.datasets[0].hasFields(*fieldnames)
1047
1048 def locate_row(self,row):
1049 """Return (dataset_index, row_within_dataset) for global row number"""
1050 dataset_index = self.index2dataset[row]
1051 row_within_dataset = row - self.datasets_start_row[dataset_index]
1052 return dataset_index, row_within_dataset
1053
1054 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1055
1056 class VStackedIterator(object):
1057 def __init__(self,vsds):
1058 self.vsds=vsds
1059 self.next_row=offset
1060 self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset)
1061 self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
1062 self.next_iterator(vsds.datasets[0],offset,n_batches)
1063
1064 def next_iterator(self,dataset,starting_offset,batches_left):
1065 L=len(dataset)
1066 ds_nbatches = (L-starting_offset)/minibatch_size
1067 if batches_left is not None:
1068 ds_nbatches = min(batches_left,ds_nbatches) # don't ask the sub-iterator for more batches than remain
1069 if minibatch_size>L:
1070 ds_minibatch_size=L
1071 n_left_in_mb=minibatch_size-L
1072 ds_nbatches=1
1073 else:
1074 n_left_in_mb=0
1075 return dataset.minibatches(fieldnames,minibatch_size,ds_nbatches,starting_offset), \
1076 L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb
1077
1078 def move_to_next_dataset(self):
1079 if self.n_left_at_the_end_of_ds>0:
1080 self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
1081 self.next_iterator(self.vsds.datasets[self.next_dataset_index],
1082 self.n_left_at_the_end_of_ds,1)
1083 else:
1084 self.next_dataset_index +=1
1085 if self.next_dataset_index==len(self.vsds.datasets):
1086 self.next_dataset_index = 0
1087 self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
1088 self.next_iterator(self.vsds.datasets[self.next_dataset_index],0,n_batches) # restart at the top of the next dataset (starting_offset was undefined here)
1089
1090 def __iter__(self):
1091 return self
1092
1093 def next(self):
1094 dataset=self.vsds.datasets[self.next_dataset_index]
1095 mb = self.current_iterator.next()
1096 if self.n_left_in_mb:
1097 extra_mb = []
1098 while self.n_left_in_mb>0:
1099 self.move_to_next_dataset()
1100 extra_mb.append(self.current_iterator.next())
1101 mb = Example(fieldnames,
1102 [dataset.valuesVStack(name,
1103 [mb[name]]+[b[name] for b in extra_mb])
1104 for name in fieldnames])
1105
1106 self.next_row+=minibatch_size
1107 self.next_dataset_row+=minibatch_size
1108 if self.next_row+minibatch_size>len(dataset):
1109 self.move_to_next_dataset()
1110 return mb
1111 return VStackedIterator(self)
1112
1113 class ArrayFieldsDataSet(DataSet):
1114 """
1115 Virtual super-class of datasets whose field values are numpy arrays,
1116 thus defining valuesHStack and valuesVStack for sub-classes.
1117 """
1118 def __init__(self,description=None,field_types=None):
1119 DataSet.__init__(self,description,field_types)
1120 def untested_valuesHStack(self,fieldnames,fieldvalues):
1121 """Concatenate field values horizontally, e.g. two vectors
1122 become a longer vector, two matrices become a wider matrix, etc."""
1123 return numpy.hstack(fieldvalues)
1124 def untested_valuesVStack(self,fieldname,values):
1125 """Concatenate field values vertically, e.g. two vectors
1126 become a two-row matrix, two matrices become a longer matrix, etc."""
1127 return numpy.vstack(values)
1128
1129
1130
1131 class NArraysDataSet(ArrayFieldsDataSet) :
1132 """
1133 An NArraysDataSet stores fields that are numpy tensors, whose first axis
1134 iterates over examples. It's a generalization of ArrayDataSet.
1135 """
1136 #@TODO not completely implemented yet
1137 def __init__(self, data_arrays, fieldnames, **kwargs) :
1138 """
1139 Construct an NArraysDataSet from a list of numpy tensors (data_arrays) and a list
1140 of fieldnames. The number of arrays must be the same as the number of
1141 fieldnames. Each numpy tensor must have the same first dimension (first
1142 axis), corresponding to the number of examples.
1143
1144 Every tensor is treated as a numpy array (using numpy.asarray)
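
A construction sketch (made-up field names)::
    x = numpy.random.rand(100, 5)
    y = numpy.random.rand(100)
    ds = NArraysDataSet([x, y], ['input', 'target'])
    assert len(ds) == 100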
1145 """
1146 ArrayFieldsDataSet.__init__(self,**kwargs)
1147 assert len(data_arrays) == len(fieldnames)
1148 assert len(fieldnames) > 0
1149 ndarrays = [numpy.asarray(a) for a in data_arrays]
1150 lens = [a.shape[0] for a in ndarrays]
1151 num_examples = lens[0] #they must all be equal anyway
1152 self._fieldnames = fieldnames
1153 for k in ndarrays :
1154 assert k.shape[0] == num_examples
1155 self._datas = ndarrays
1156 # create dict
1157 self.map_field_idx = dict()
1158 for k in range(len(fieldnames)):
1159 self.map_field_idx[fieldnames[k]] = k
1160
1161
1162 def __len__(self) :
1163 """
1164 Length of the dataset is based on the first array = data_arrays[0], using its shape
1165 """
1166 return self._datas[0].shape[0]
1167
1168 def fieldNames(self) :
1169 """
1170 Returns the fieldnames as set in self.__init__
1171 """
1172 return self._fieldnames
1173
1174 def field_pos(self,fieldname) :
1175 """
1176 Returns the index of a given fieldname. The fieldname must exist! See fieldNames().
1177 """
1178 return self.map_field_idx[fieldname]
1179
1180 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1181 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
1182 cursor = Example(fieldnames,[0]*len(fieldnames))
1183 for n in xrange(n_batches if n_batches is not None else (len(self) - offset) / minibatch_size):
1184 if offset == len(self):
1185 break
1186 for f in range(len(cursor._names)) :
1187 idx = self.field_pos(cursor._names[f])
1188 sub_data = self._datas[idx][offset : offset+minibatch_size]
1189 cursor._values[f] = sub_data
1190 offset += len(sub_data) #can be less than minibatch_size at end
1191 yield cursor
1192
1193 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
1194
1195
1196
1197
1198 class ArrayDataSet(ArrayFieldsDataSet):
1199 """
1200 An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
1201 whose first axis iterates over examples, second axis determines fields.
1202 If the underlying array is N-dimensional (has N axes), then the field
1203 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
1204 """
1205
1206 def __init__(self, data_array, fields_columns, **kwargs):
1207 """
1208 Construct an ArrayDataSet from the underlying numpy array (data) and
1209 a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
1210 using the standard arguments for indexing/slicing: integer for a column index,
1211 slice for an interval of columns (with possible stride), or iterable of column indices.
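
A construction sketch (made-up field layout: 3 input columns and 1 target column)::
    data = numpy.random.rand(100, 4)
    ds = ArrayDataSet(data, {'input': slice(0, 3), 'target': 3})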
1212 """
1213 ArrayFieldsDataSet.__init__(self, **kwargs)
1214 self.data=data_array
1215 self.fields_columns=fields_columns
1216
1217 # check consistency and complete slices definitions
1218 for fieldname, fieldcolumns in self.fields_columns.items():
1219 if type(fieldcolumns) is int:
1220 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
1221 if 1:
1222 #I changed this because it didn't make sense to me,
1223 # and it made it more difficult to write my learner.
1224 # If it breaks stuff, let's talk about it.
1225 # - James 22/05/2008
1226 self.fields_columns[fieldname]=[fieldcolumns]
1227 else:
1228 self.fields_columns[fieldname]=fieldcolumns
1229 elif type(fieldcolumns) is slice:
1230 start,step=fieldcolumns.start,fieldcolumns.step
1231 if not start:
1232 start=0
1233 if not step:
1234 step=1
1235 self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step)
1236 elif hasattr(fieldcolumns,"__iter__"): # something like a list
1237 for i in fieldcolumns:
1238 assert i>=0 and i<data_array.shape[1]
1239
1240 def fieldNames(self):
1241 return self.fields_columns.keys()
1242
1243 def __len__(self):
1244 return len(self.data)
1245
1246 def __getitem__(self,key):
1247 """More efficient implementation than the default __getitem__"""
1248 fieldnames=self.fields_columns.keys()
1249 values=self.fields_columns.values()
1250 if type(key) is int:
1251 return Example(fieldnames,
1252 [self.data[key,col] for col in values])
1253 if type(key) is slice:
1254 return Example(fieldnames,[self.data[key,col] for col in values])
1255 if type(key) is list:
1256 for i in range(len(key)):
1257 if self.hasFields(key[i]):
1258 key[i]=self.fields_columns[key[i]]
1259 return Example(fieldnames,
1260 #we must separate differently for list as numpy
1261 # doesn't support self.data[[i1,...],[i2,...]]
1262 # when there are more than two i1 and i2
1263 [self.data[key,:][:,col]
1264 if isinstance(col,list) else
1265 self.data[key,col] for col in values])
1266
1267 # else check for a fieldname
1268 if self.hasFields(key):
1269 return self.data[:,self.fields_columns[key]]
1270 # else we are trying to access a property of the dataset
1271 assert key in self.__dict__ # else it means we are trying to access a non-existing property
1272 return self.__dict__[key]
1273
1274 def dontuse__iter__(self):
1275 class ArrayDataSetIteratorIter(object):
1276 def __init__(self,dataset,fieldnames):
1277 if fieldnames is None: fieldnames = dataset.fieldNames()
1278 # store the resulting minibatch in a lookup-list of values
1279 self.minibatch = Example(fieldnames,[0]*len(fieldnames))
1280 self.dataset=dataset
1281 self.current=0
1282 self.columns = [self.dataset.fields_columns[f]
1283 for f in self.minibatch._names]
1284 self.l = self.dataset.data.shape[0]
1285 def __iter__(self):
1286 return self
1287 def next(self):
1288 #@todo: we suppose that we need to stop only when minibatch_size == 1.
1289 # Otherwise, MinibatchWrapAroundIterator does it.
1290 if self.current>=self.l:
1291 raise StopIteration
1292 sub_data = self.dataset.data[self.current]
1293 self.minibatch._values = [sub_data[c] for c in self.columns]
1294
1295 self.current+=1
1296 return self.minibatch
1297
1298 return ArrayDataSetIteratorIter(self,self.fieldNames())
1299
1300 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1301 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
1302 cursor = Example(fieldnames,[0]*len(fieldnames))
1303 if n_batches is None:
1304 n_batches = (len(self) - offset) / minibatch_size
1305 for n in xrange(n_batches):
1306 if offset == len(self):
1307 break
1308 sub_data = self.data[offset : offset+minibatch_size]
1309 offset += len(sub_data) #can be less than minibatch_size at end
1310 cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
1311 yield cursor
1312
1313 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
1314
1315
1316 class CachedDataSet(DataSet):
1317 """
1318 Wrap a L{DataSet} whose values are computationally expensive to obtain
1319 (e.g. because they involve some computation, or disk access),
1320 so that repeated accesses to the same example are done cheaply,
1321 by caching every example value that has been accessed at least once.
1322
1323 Optionally, for a finite-length dataset, all the values can be computed
1324 (and cached) upon construction of the CachedDataSet, rather than at the
1325 first access.
1326
1327 @todo: when cache_all_upon_construction create mini-batches that are as
1328 large as possible but not so large as to fill up memory.
1329
1330 @todo: add disk-buffering capability, so that when the cache becomes too
1331 big for memory, we cache things on disk, trying to keep in memory only
1332 the record most likely to be accessed next.
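
A usage sketch (base_dataset stands for any existing DataSet)::
    cached = CachedDataSet(base_dataset, cache_all_upon_construction=False)
    first = cached[0]   # computed once, then served from the cache on later accesses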
1333 """
1334 def __init__(self,source_dataset,cache_all_upon_construction=False):
1335 self.source_dataset=source_dataset
1336 self.cache_all_upon_construction=cache_all_upon_construction
1337 self.cached_examples = []
1338 if cache_all_upon_construction:
1339 # this potentially brings all the source examples
1340 # into memory at once, which may be too much
1341 # the work could possibly be done by minibatches
1342 # that are as large as possible but no more than what memory allows.
1343 #
1344 # fields_values is supposed to be a DataSetFields, which inherits from LookupList
1345 #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
1346 fields_values = DataSetFields(source_dataset,None)
1347 assert all([len(self)==len(field_values) for field_values in fields_values])
1348 for example in fields_values.examples():
1349 self.cached_examples.append(copy.copy(example))
1350
1351 self.fieldNames = source_dataset.fieldNames
1352 self.hasFields = source_dataset.hasFields
1353 self.valuesHStack = source_dataset.valuesHStack
1354 self.valuesVStack = source_dataset.valuesVStack
1355
1356 def __len__(self):
1357 return len(self.source_dataset)
1358
1359 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1360 class CacheIterator(object):
1361 def __init__(self,dataset):
1362 self.dataset=dataset
1363 self.current=offset
1364 self.all_fields = self.dataset.fieldNames()==fieldnames
1365 self.n_batches = n_batches
1366 self.batch_counter = 0
1367 def __iter__(self): return self
1368 def next(self):
1369 self.batch_counter += 1
1370 if self.n_batches and self.batch_counter > self.n_batches :
1371 raise StopIteration()
1372 upper = self.current+minibatch_size
1373 if upper > len(self.dataset.source_dataset):
1374 raise StopIteration()
1375 cache_len = len(self.dataset.cached_examples)
1376 if upper>cache_len: # whole minibatch is not already in cache
1377 # cache everything from current length to upper
1378 #for example in self.dataset.source_dataset[cache_len:upper]:
1379 for example in self.dataset.source_dataset.subset[cache_len:upper]:
1380 self.dataset.cached_examples.append(example)
1381 all_fields_minibatch = Example(self.dataset.fieldNames(),
1382 zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
1383
1384 self.current+=minibatch_size
1385 if self.all_fields:
1386 return all_fields_minibatch
1387 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
1388 return CacheIterator(self)
1389
1390 def dontuse__getitem__(self,i):
1391 if type(i)==int and len(self.cached_examples)>i:
1392 return self.cached_examples[i]
1393 else:
1394 return self.source_dataset[i]
1395
1396 def __iter__(self):
1397 class CacheIteratorIter(object):
1398 def __init__(self,dataset):
1399 self.dataset=dataset
1400 self.l = len(dataset)
1401 self.current = 0
1402 self.fieldnames = self.dataset.fieldNames()
1403 self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
1404 def __iter__(self): return self
1405 def next(self):
1406 if self.current>=self.l:
1407 raise StopIteration
1408 cache_len = len(self.dataset.cached_examples)
1409 if self.current>=cache_len: # this example is not already in cache
1410 # cache everything from current length to upper
1411 self.dataset.cached_examples.append(
1412 self.dataset.source_dataset[self.current])
1413 self.example._values = self.dataset.cached_examples[self.current]
1414 self.current+=1
1415 return self.example
1416
1417 return CacheIteratorIter(self)
1418
1419 class ApplyFunctionDataSet(DataSet):
1420 """
1421 A L{DataSet} that contains as fields the results of applying a
1422 given function example-wise or minibatch-wise to all the fields of
1423 an input dataset. The output of the function should be an iterable
1424 (e.g. a list or a LookupList) over the resulting values.
1425
1426 The function takes as input the fields of the dataset, not the examples.
1427
1428 In minibatch mode, the function is expected to work on minibatches
1429 (takes a minibatch in input and returns a minibatch in output). More
1430 precisely, it means that each element of the input or output list
1431 should be iterable and indexable over the individual example values
1432 (typically these elements will be numpy arrays). All of the elements
1433 in the input and output lists should have the same length, which is
1434 the length of the minibatch.
1435
1436 The function is applied each time an example or a minibatch is accessed.
1437 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
1438
1439 If the values_{h,v}stack functions are not provided, then
1440 the input_dataset.values{H,V}Stack functions are used by default.
1441
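A usage sketch in minibatch mode (raw_dataset, the field 'x' and the output name 'x_scaled' are made up for illustration)::
    def scale(x_batch):
        return (numpy.asarray(x_batch) / 255.0,)   # one output field -> 1-tuple
    scaled = ApplyFunctionDataSet(raw_dataset('x'), scale, ['x_scaled'])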
1442 """
1443
1444 def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
1445 values_hstack=None,values_vstack=None,
1446 description=None,fieldtypes=None):
1447 """
1448 Constructor takes an input dataset that has as many fields as the function
1449 expects as inputs. The resulting dataset has as many fields as the function
1450 produces as outputs, and that should correspond to the number of output names
1451 (provided in a list).
1452
1453 Note that the expected semantics of the function differs in minibatch mode
1454 (it takes minibatches of inputs and produces minibatches of outputs, as
1455 documented in the class comment).
1456
1457 TBM: are fieldtypes the old field types (from input_dataset) or the new ones
1458 (for the new dataset created)?
1459 """
1460 self.input_dataset=input_dataset
1461 self.function=function
1462 self.output_names=output_names
1463 #print 'self.output_names in afds:', self.output_names
1464 #print 'length in afds:', len(self.output_names)
1465 self.minibatch_mode=minibatch_mode
1466 DataSet.__init__(self,description,fieldtypes)
1467 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
1468 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
1469
1470 def __len__(self):
1471 return len(self.input_dataset)
1472
1473 def fieldNames(self):
1474 return self.output_names
1475
1476 def minibatches_nowrap(self, fieldnames, *args, **kwargs):
1477 all_input_fieldNames = self.input_dataset.fieldNames()
1478 mbnw = self.input_dataset.minibatches_nowrap
1479
1480 for input_fields in mbnw(all_input_fieldNames, *args, **kwargs):
1481 if self.minibatch_mode:
1482 all_output_fields = self.function(*input_fields)
1483 else:
1484 input_examples = zip(*input_fields) # makes it so that [i] means example i
1485 output_examples = [self.function(*input_example)
1486 for input_example in input_examples]
1487 all_output_fields = zip(*output_examples)
1488
1489 #print 'output_names=', self.output_names
1490 #print 'all_output_fields', all_output_fields
1491 #print 'len(all_output_fields)=', len(all_output_fields)
1492 all_outputs = Example(self.output_names, all_output_fields)
1493 if fieldnames==self.output_names:
1494 rval = all_outputs
1495 else:
1496 rval = Example(fieldnames,[all_outputs[name] for name in fieldnames])
1497 #print 'rval', rval
1498 #print '--------'
1499 yield rval
1500
1501 def untested__iter__(self): # only implemented for increased efficiency
1502 class ApplyFunctionSingleExampleIterator(object):
1503 def __init__(self,output_dataset):
1504 self.current=0
1505 self.output_dataset=output_dataset
1506 self.input_iterator=output_dataset.input_dataset.__iter__()
1507 def __iter__(self): return self
1508 def next(self):
1509 if self.output_dataset.minibatch_mode:
1510 function_inputs = [[input] for input in self.input_iterator.next()]
1511 outputs = self.output_dataset.function(*function_inputs)
1512 assert all([hasattr(output,'__iter__') for output in outputs])
1513 function_outputs = [output[0] for output in outputs]
1514 else:
1515 function_inputs = self.input_iterator.next()
1516 function_outputs = self.output_dataset.function(*function_inputs)
1517 return Example(self.output_dataset.output_names,function_outputs)
1518 return ApplyFunctionSingleExampleIterator(self)
1519
1520 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
1521 """
1522 Wraps an arbitrary L{DataSet} into one for supervised learning tasks
1523 by forcing the user to define a set of fields as the 'input' field
1524 and a set of fields as the 'target' field. Optionally, a single
1525 weight_field can also be defined.
1526 """
1527 args = ((input_fields,'input'),(target_fields,'target'))
1528 if weight_field: args+=(([weight_field],'weight'),)
1529 return src_dataset.merge_fields(*args)
1530
1531
1532
1533