pylearn: comparison of dataset.py @ 20:266c68cb6136
Minor edits, plus adding an untested ApplyFunctionDataSet for the GradientLearner that is in the works.
author | bengioy@bengiomac.local |
---|---|
date | Mon, 07 Apr 2008 09:48:39 -0400 |
parents | 57f4015e2e09 |
children | b6b36f65664f |
19:57f4015e2e09 | 20:266c68cb6136 |
---|---|
8 """A virtual base class for datasets. | 8 """A virtual base class for datasets. |
9 | 9 |
10 A DataSet is a generator of iterators; these iterators can run through the | 10 A DataSet is a generator of iterators; these iterators can run through the |
11 examples in a variety of ways. A DataSet need not necessarily have a finite | 11 examples in a variety of ways. A DataSet need not necessarily have a finite |
12 or known length, so this class can be used to interface to a 'stream' which | 12 or known length, so this class can be used to interface to a 'stream' which |
13 feed on-line learning. | 13 feeds on-line learning. |
14 | 14 |
15 To iterate over examples, there are several possibilities: | 15 To iterate over examples, there are several possibilities: |
16 - for i in dataset.zip(field1, field2,field3, ...) | 16 - for example in dataset.zip([field1, field2, field3, ...]) |
17 - for i in dataset.minibatches(N, field1, field2, ...) | 17 - for val1, val2, val3 in dataset.zip([field1, field2, field3]) |
18 - for i in dataset | 18 - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N) |
19 - for example in dataset | |
19 Each of these is documented below. | 20 Each of these is documented below. |
20 | 21 |
21 Note: For a dataset of fixed and known length, which can implement item | 22 Note: For a dataset of fixed and known length, which can implement item |
22 random-access efficiently (e.g. indexing and slicing), and which can profit | 23 random-access efficiently (e.g. indexing and slicing), and which can profit |
23 from the FiniteDataSetIterator, consider using base class FiniteDataSet. | 24 from the FiniteDataSetIterator, consider using base class FiniteDataSet. |
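A minimal usage sketch of the three iteration forms listed above. This is not part of the changeset; it assumes the module is importable as `dataset` and that `ArrayDataSet` (defined further down in this file) accepts a 2-D numpy array together with a mapping from field names to column slices.

```python
# Hypothetical usage sketch (not in the changeset); ArrayDataSet construction
# follows its docstring below: fields map names to column slices of the array.
import numpy
from dataset import ArrayDataSet

data = numpy.random.randn(100, 4)
ds = ArrayDataSet(data, fields={'input': slice(0, 3), 'target': slice(3, 4)})

# 1) plain iteration: each example exposes every field, by name or by
#    position in fieldNames() order
for example in ds:
    x = example['input']
    y = example['target']

# 2) zip: iterate over a chosen subset of fields (list form as in the class
#    docstring; the *fieldnames signature suggests separate names may also work)
for x, y in ds.zip(['input', 'target']):
    pass

# 3) minibatches: groups of minibatch_size examples at a time; each minibatch
#    is assumed to expose the requested fields like an Example does
for minibatch in ds.minibatches(['input', 'target'], minibatch_size=10):
    pass
```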
34 def __iter__(self): | 35 def __iter__(self): |
35 """Supports the syntax "for i in dataset: ..." | 36 """Supports the syntax "for i in dataset: ..." |
36 | 37 |
37 Using this syntax, "i" will be an Example instance (or equivalent) with | 38 Using this syntax, "i" will be an Example instance (or equivalent) with |
38 all the fields of DataSet self. Every field of "i" will give access to | 39 all the fields of DataSet self. Every field of "i" will give access to |
39 a the field of a single example. Fields should be accessible via | 40 a field of a single example. Fields should be accessible via |
40 i[identifier], but the derived class is free to accept any type of | 41 i["fieldname"] or i[3] (in the fieldNames() order), but the derived class is free |
41 identifier, and add extra functionality to the iterator. | 42 to accept any type of identifier, and add extra functionality to the iterator. |
42 """ | 43 """ |
43 raise AbstractFunction() | 44 return self.zip(*self.fieldNames()) |
44 | 45 |
45 def zip(self, *fieldnames): | 46 def zip(self, *fieldnames): |
46 """ | 47 """ |
47 Supports two forms of syntax: | 48 Supports two forms of syntax: |
48 | 49 |
49 for i in dataset.zip(f1, f2, f3): ... | 50 for i in dataset.zip([f1, f2, f3]): ... |
50 | 51 |
51 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | 52 for i1, i2, i3 in dataset.zip([f1, f2, f3]): ... |
52 | 53 |
53 Using the first syntax, "i" will be an indexable object, such as a list, | 54 Using the first syntax, "i" will be an indexable object, such as a list, |
54 tuple, or Example instance, such that on every iteration, i[0] is the f1 | 55 tuple, or Example instance, such that on every iteration, i[0] is the f1 |
55 field of the current example, i[1] is the f2 field, and so on. | 56 field of the current example, i[1] is the f2 field, and so on. |
56 | 57 |
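The two zip() forms above, shown on the hypothetical `ds` from the previous sketch (again, not part of the changeset):

```python
# Form 1: one indexable object per iteration; i[0] holds the 'input' value
# and i[1] the 'target' value of the current example.
for i in ds.zip(['input', 'target']):
    x_value, y_value = i[0], i[1]

# Form 2: direct tuple unpacking of the same per-example values.
for x_value, y_value in ds.zip(['input', 'target']):
    pass
```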
118 def fieldNames(self): | 119 def fieldNames(self): |
119 #Yoshua- | 120 #Yoshua- |
120 # This list may not be finite; what would make sense in the use you have | 121 # This list may not be finite; what would make sense in the use you have |
121 # in mind? | 122 # in mind? |
122 # -JB | 123 # -JB |
123 """Return the list of field names in the examples of this dataset.""" | 124 #James- |
125 # You are right. I had put this to be able to iterate over the fields | |
126 # but maybe an iterator mechanism (over fields rather than examples) | |
127 # would be more appropriate. Fieldnames are needed in general | |
128 # by the iterators over examples or minibatches, to construct | |
129 # examples or minibatches with the corresponding names as attributes. | |
130 # -YB | |
131 """ | |
132 Return an iterator (an object with an __iter__ method) that | |
133 iterates over the names of the fields. As a special case, | 
134 a list or a tuple of field names can be returned. | |
135 """ | 
136 # Note that some datasets | |
137 # may have virtual fields and support a virtually infinite number | |
138 # of possible field names. In that case, fieldNames() should | |
139 # either raise an error or iterate over a particular set of | |
140 # names as appropriate. Another option would be to iterate | |
141 # over the sub-datasets comprising a single field at a time. | |
142 # I am not sure yet what is most appropriate. | |
143 # -YB | |
144 | 
124 raise AbstractFunction() | 145 raise AbstractFunction() |
125 | 146 |
126 def rename(self, *new_field_specifications): | 147 def rename(self, *new_field_specifications): |
127 #Yoshua- | 148 #Yoshua- |
128 # Do you mean for this to be a virtual method? | 149 # Do you mean for this to be a virtual method? |
129 # Wouldn't this functionality be easier to provide via a | 150 # Wouldn't this functionality be easier to provide via a |
130 # RenamingDataSet, such as the one I've written below? | 151 # RenamingDataSet, such as the one I've written below? |
131 # -JB | 152 # -JB |
153 # You are right. Whichever implementation we choose, however, we need a generic way to | 
154 # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics. | |
155 # -YB | |
132 """ | 156 """ |
133 Return a new dataset that maps old fields (of self) to new fields (of the returned | 157 Return a new dataset that maps old fields (of self) to new fields (of the returned |
134 dataset). The minimal syntax that should be supported is the following: | 158 dataset). The minimal syntax that should be supported is the following: |
135 new_field_specifications = [new_field_spec1, new_field_spec2, ...] | 159 new_field_specifications = [new_field_spec1, new_field_spec2, ...] |
136 new_field_spec = ([old_field1, old_field2, ...], new_field) | 160 new_field_spec = ([old_field1, old_field2, ...], new_field) |
137 In general both old_field and new_field should be strings, but some datasets may also | 161 In general both old_field and new_field should be strings, but some datasets may also |
138 support additional indexing schemes within each field (e.g. column slice | 162 support additional indexing schemes within each field (e.g. column slice |
139 of a matrix-like field). | 163 of a matrix-like field). |
140 """ | 164 """ |
141 raise AbstractFunction() | 165 raise AbstractFunction() |
166 | |
167 | |
168 def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): | 
169 """ | |
170 Return a dataset that contains as fields the results of applying | |
171 the given function (example-wise) to the specified input_fields. The | |
172 function should return a sequence whose elements will be stored in | |
173 fields whose names are given in the output_fields list. If copy_inputs | |
174 is True then the resulting dataset will also contain the fields of self. | |
175 If accept_minibatches is True, then the function may be called | 
176 with minibatches as arguments (what is returned by the minibatches | |
177 iterator). In any case, the computations may be delayed until the examples | |
178 of the resulting dataset are requested. If cache is True, then | |
179 once the output fields for some examples have been computed, they | 
180 are cached (to avoid recomputation if the same examples are requested | 
181 again). | 
182 """ | |
183 return ApplyFunctionDataSet(self, function, input_fields, output_fields, copy_inputs, accept_minibatches, cache) | 
142 | 184 |
143 class RenamingDataSet(DataSet): | 185 class RenamingDataSet(DataSet): |
144 """A DataSet that wraps another one, and makes it look like the field names | 186 """A DataSet that wraps another one, and makes it look like the field names |
145 are different | 187 are different |
146 | 188 |
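A sketch of the rename() call pattern described above (not part of the changeset; rename() is abstract here, so this only illustrates the intended specification syntax on the hypothetical `ds` from the earlier sketch). Given the *new_field_specifications signature, the specs are shown as separate positional arguments, although the docstring's list form may also be intended.

```python
# Hypothetical call pattern: each new_field_spec is
# ([old_field1, old_field2, ...], new_field).
renamed = ds.rename((['input'], 'x'),
                    (['target'], 'y'))

# A spec may also concatenate several old fields into one new field,
# the 'concatenate' semantics discussed in the comments above.
merged = ds.rename((['input', 'target'], 'input_and_target'))
```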
285 # - try to see if we can avoid the copy? | 327 # - try to see if we can avoid the copy? |
286 | 328 |
287 class ArrayDataSet(FiniteDataSet): | 329 class ArrayDataSet(FiniteDataSet): |
288 """ | 330 """ |
289 An ArrayDataSet behaves like a numpy array but adds the notion of named fields | 331 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
290 from DataSet (and the ability to view multiple field values as an 'Example'). | 332 from DataSet (and the ability to view the values of multiple fields as an 'Example'). |
291 It is a fixed-length and fixed-width dataset | 333 It is a fixed-length and fixed-width dataset |
292 in which each element is a numpy array or a number, hence the whole | 334 in which each element is a fixed dimension numpy array or a number, hence the whole |
293 dataset corresponds to a numpy array. Fields | 335 dataset corresponds to a numpy array. Fields |
294 must correspond to a slice of array columns. If the dataset has fields, | 336 must correspond to a slice of array columns. If the dataset has fields, |
295 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 337 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
296 Any dataset can also be converted to a numpy array (losing the notion of fields) | 338 Any dataset can also be converted to a numpy array (losing the notion of fields) |
297 by the numpy.array(dataset) call. | 339 by the numpy.array(dataset) call. |
380 if not fieldslice.start or not fieldslice.step: | 422 if not fieldslice.start or not fieldslice.step: |
381 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 423 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
382 # and coherent with the data array | 424 # and coherent with the data array |
383 assert fieldslice.start >= 0 and fieldslice.stop <= cols | 425 assert fieldslice.start >= 0 and fieldslice.stop <= cols |
384 | 426 |
385 def __iter__(self): | |
386 return self.zip(*self.fieldNames()) | |
387 | |
388 def minibatches(self, | 427 def minibatches(self, |
389 fieldnames = DataSet.minibatches_fieldnames, | 428 fieldnames = DataSet.minibatches_fieldnames, |
390 minibatch_size = DataSet.minibatches_minibatch_size, | 429 minibatch_size = DataSet.minibatches_minibatch_size, |
391 n_batches = DataSet.minibatches_n_batches): | 430 n_batches = DataSet.minibatches_n_batches): |
392 """ | 431 """ |
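A short sketch (not part of the changeset) of how an ArrayDataSet ties named fields to column slices of a single 2-D numpy array, assuming the `fields` argument is a mapping from field name to column slice as the surrounding code suggests, and that the module is importable as `dataset`.

```python
import numpy
from dataset import ArrayDataSet

data = numpy.arange(20.).reshape(5, 4)               # 5 examples, 4 columns
ds = ArrayDataSet(data,
                  fields={'input': slice(0, 3),      # columns 0, 1, 2
                          'target': slice(3, 4)})    # column 3

sub = ds[1:3]   # a smaller ArrayDataSet holding examples 1 and 2

# minibatches over the named fields, five examples at a time
for mb in ds.minibatches(['input', 'target'], minibatch_size=5):
    pass
```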
444 def __getslice__(self,*args): | 483 def __getslice__(self,*args): |
445 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 484 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
446 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) | 485 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) |
447 | 486 |
448 def __array__(self): | 487 def __array__(self): |
449 """Return an view of this dataset which is an numpy.ndarray | 488 """Return a view of this dataset which is a numpy.ndarray (i.e. losing |
489 the identity and name of fields within the dataset). | |
450 | 490 |
451 Numpy uses this special function name to retrieve an ndarray view for | 491 Numpy uses this special function name to retrieve an ndarray view for |
452 functions such as numpy.sum, numpy.dot, numpy.asarray, etc. | 492 functions such as numpy.sum, numpy.dot, numpy.asarray, etc. |
453 | 493 |
454 If this dataset has no fields, then we simply return self.data, | 494 If this dataset has no fields, then we simply return self.data, |
455 otherwise things are complicated. | 495 otherwise things are complicated. |
456 - why do we want this behaviour when there are fields? (JB) | 496 - why do we want this behaviour when there are fields? (JB) |
497 - for convenience and completeness (but maybe it would make | |
498 more sense to implement this through a 'field-merging' | |
499 dataset). (YB) | |
457 """ | 500 """ |
458 if not self.fields: | 501 if not self.fields: |
459 return self.data | 502 return self.data |
460 # else, select subsets of columns mapped by the fields | 503 # else, select subsets of columns mapped by the fields |
461 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) | 504 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) |
495 # copy the field here | 538 # copy the field here |
496 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] | 539 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] |
497 c+=slice_width | 540 c+=slice_width |
498 return result | 541 return result |
499 | 542 |
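A sketch of the numpy conversion handled by __array__ above (not part of the changeset), continuing the `ds` from the previous sketch: with fields present, the columns covered by the fields are merged into a single ndarray and the field names are lost.

```python
import numpy

arr = numpy.asarray(ds)   # goes through ArrayDataSet.__array__; fields are lost
# here the two fields cover all four columns, so arr should have shape (5, 4)

# assuming `fields` is optional, as the docstring implies, a field-less
# ArrayDataSet converts to its underlying data array directly
no_fields = ArrayDataSet(numpy.ones((3, 2)))
raw = numpy.asarray(no_fields)
```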
500 | 543 class ApplyFunctionDataSet(DataSet): |
544 """ | |
545 A dataset that contains as fields the results of applying | |
546 a given function (example-wise) to specified input_fields of a source | |
547 dataset. The function should return a sequence whose elements will be stored in | |
548 fields whose names are given in the output_fields list. If copy_inputs | |
549 is True then the resulting dataset will also contain the fields of the source | 
550 dataset. If accept_minibatches is True, then the function expects | 
551 minibatches as arguments (what is returned by the minibatches | |
552 iterator). In any case, the computations may be delayed until the examples | |
553 of self are requested. If cache is True, then | |
554 once the output fields for some examples have been computed, they | 
555 are cached (to avoid recomputation if the same examples are requested again). | 
556 """ | |
557 def __init__(self, src, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): | 
558 DataSet.__init__(self) | |
559 self.src=src | |
560 self.function=function | |
561 self.input_fields=input_fields | |
562 self.output_fields=output_fields | |
563 self.copy_inputs=copy_inputs | |
564 self.accept_minibatches=accept_minibatches | |
565 src_fieldnames = src.fieldNames() | |
566 if copy_inputs: | |
567 for src_field in src_fieldnames: | |
568 assert src_field not in output_fields | |
569 self.fieldnames=src_fieldnames+output_fields | |
570 else: | |
571 self.fieldnames=output_fields | |
572 for input_field in input_fields: | |
573 assert input_field in src_fieldnames | |
574 self.cache=cache | |
575 if cache: | |
576 # maybe a fixed-size array kind of structure would be more efficient than a list | |
577 # in the case where src is FiniteDataSet. -YB | |
578 self.cached_examples = [] | |
579 | |
580 def fieldNames(self): return self.fieldnames | |
581 | |
582 def minibatches(self, | |
583 fieldnames = DataSet.minibatches_fieldnames, | |
584 minibatch_size = DataSet.minibatches_minibatch_size, | |
585 n_batches = DataSet.minibatches_n_batches): | |
586 | |
587 class Iterator(LookupList): | |
588 | |
589 def __init__(self,dataset): | |
590 LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) | |
591 self.dataset=dataset | |
592 if dataset.copy_inputs: | |
593 src_fields=dataset.fieldNames() | |
594 else: | |
595 src_fields=dataset.input_fields | |
596 self.src_iterator=self.dataset.src.minibatches(src_fields,minibatch_size,n_batches) | 
597 | |
598 def __iter__(self): | |
599 return self | |
600 | |
601 def next(self): | |
602 src_examples = self.src_iterator.next() | |
603 if self.dataset.copy_inputs: | |
604 function_inputs = src_examples | |
605 else: | |
606 function_inputs = [src_examples[field_name] | 
607 for field_name in self.dataset.input_fields] | 
608 return self.dataset.function(*function_inputs) | |
609 | |
610 for fieldname in fieldnames: | |
611 assert fieldname in self.fieldNames() | 
612 return Iterator(self) | |
613 | |
614 |
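Finally, a sketch of the intended apply_function() / ApplyFunctionDataSet usage (not part of the changeset; the commit message calls this class untested, so this only shows the call pattern, using a hypothetical squared_norm function and the `ds` from the ArrayDataSet sketch above).

```python
import numpy

def squared_norm(inp):
    # example-wise function: returns a sequence with one entry per output field
    return (numpy.sum(inp ** 2),)

# compute a hypothetical 'sq_norm' field from the 'input' field; with
# copy_inputs=False the resulting dataset exposes only the output fields
with_norm = ds.apply_function(squared_norm,
                              input_fields=['input'],
                              output_fields=['sq_norm'],
                              copy_inputs=False,
                              accept_minibatches=False,
                              cache=True)

for mb in with_norm.minibatches(['sq_norm'], minibatch_size=5):
    pass
```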