comparison dataset.py @ 41:283e95c15b47
Added ArrayDataSet

| author | bengioy@grenat.iro.umontreal.ca |
|---|---|
| date | Fri, 25 Apr 2008 12:04:55 -0400 |
| parents | 88fd1cce08b9 |
| children | 9b68774fcc6b |
| 40:88fd1cce08b9 | 41:283e95c15b47 |
|---|---|
1 | 1 |
2 from lookup_list import LookupList | 2 from lookup_list import LookupList |
3 Example = LookupList | 3 Example = LookupList |
4 from misc import * | 4 from misc import * |
5 import copy | 5 import copy |
6 import string | |
6 | 7 |
7 class AbstractFunction (Exception): """Derived class must override this function""" | 8 class AbstractFunction (Exception): """Derived class must override this function""" |
8 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" | 9 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" |
9 class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)""" | 10 class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)""" |
10 | 11 |
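These sentinel exception classes are raised by methods that sub-classes are expected to override. A minimal sketch of the intended usage (the class and method names here are hypothetical):

```python
from dataset import AbstractFunction

class SomeDataSet(object):
    def fieldNames(self):
        # derived classes must override this method
        raise AbstractFunction()
```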
73 | 74 |
74 * dataset[i] returns an Example. | 75 * dataset[i] returns an Example. |
75 | 76 |
76 * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. | 77 * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. |
77 | 78 |
79 * dataset['key'] returns a property associated with the given 'key' string. | |
80 If 'key' is a fieldname, then the VStacked field values (iterable over | |
81 field values) for that field are returned. Other keys may be supported | |
82 by different dataset subclasses. The following key names should be supported: | |
83 - 'description': a textual description or name for the dataset | |
84 - '<fieldname>.type': a type name or value for a given <fieldname> | |
85 | |
78 Datasets can be concatenated either vertically (increasing the length) or | 86 Datasets can be concatenated either vertically (increasing the length) or |
79 horizontally (augmenting the set of fields), if they are compatible, using | 87 horizontally (augmenting the set of fields), if they are compatible, using |
80 the following operations (with the same basic semantics as numpy.hstack | 88 the following operations (with the same basic semantics as numpy.hstack |
81 and numpy.vstack): | 89 and numpy.vstack): |
82 | 90 |
94 According to the same logic, and viewing a DataSetFields object associated with | 102 According to the same logic, and viewing a DataSetFields object associated with |
95 a DataSet as a kind of transpose of it, fields1 + fields2 concatenates fields of | 103 a DataSet as a kind of transpose of it, fields1 + fields2 concatenates fields of |
96 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their | 104 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their |
97 examples. | 105 examples. |
98 | 106 |
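As a hedged illustration of these semantics (the operator definitions themselves are elided from this view, so the snippet assumes they follow the hstack/vstack analogy described above; ds1, ds2 and their fields 'x' and 'y' are hypothetical):

```python
from dataset import DataSetFields

longer = ds1 + ds2   # vstack-like: more examples, same fields
wider = ds1 | ds2    # hstack-like: same examples, more fields
# DataSetFields acts as the "transpose" of a DataSet:
fields1 = DataSetFields(ds1, ['x'])
fields2 = DataSetFields(ds2, ['y'])
more_fields = fields1 + fields2                      # concatenates fields
more_examples = fields1 | DataSetFields(ds2, ['x'])  # concatenates examples
```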
107 A dataset can hold arbitrary key-value pairs that may be used to access meta-data | |
108 or other properties of the dataset, or the result of a computation stored with | |
109 the dataset. These can be accessed through the [key] syntax | |
110 when key is a string (or more specifically, neither an integer, a slice, nor a list). | |
111 | |
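A short sketch of this key-based access, assuming a dataset ds with a field named 'x' (the variable ds is hypothetical):

```python
print ds['description']  # a textual description or name for the dataset
print ds['x.type']       # a type name or value for field 'x'
x_values = ds['x']       # the VStacked values of field 'x' (iterable over values)
```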
99 A DataSet sub-class should always redefine the following methods: | 112 A DataSet sub-class should always redefine the following methods: |
100 * __len__ if it is not a stream | 113 * __len__ if it is not a stream |
101 * fieldNames | 114 * fieldNames |
102 * minibatches_nowrap (called by DataSet.minibatches()) | 115 * minibatches_nowrap (called by DataSet.minibatches()) |
103 * valuesHStack | 116 * valuesHStack |
106 * hasFields | 119 * hasFields |
107 * __getitem__ may not be feasible with some streams | 120 * __getitem__ may not be feasible with some streams |
108 * __iter__ | 121 * __iter__ |
109 """ | 122 """ |
110 | 123 |
111 def __init__(self): | 124 def __init__(self,description=None,field_types=None): |
112 pass | 125 if description is None: |
126 # default description is "<DataSetType> ( <SuperClass1> <SuperClass2> ... )" | |
127 description = type(self).__name__ + " ( " + string.join([x.__name__ for x in type(self).__bases__]) + " )" | |
128 self.description=description | |
129 self.field_types=field_types | |
113 | 130 |
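Following the list of methods above that a sub-class should redefine, a minimal non-stream sub-class might look like this sketch (MyDataSet and its internal storage are hypothetical, and minibatches_nowrap is left unimplemented):

```python
from dataset import DataSet, NotImplementedYet

class MyDataSet(DataSet):
    def __init__(self, examples):
        DataSet.__init__(self)      # default description and field_types
        self._examples = examples   # hypothetical internal storage
    def __len__(self):
        return len(self._examples)  # defined because this is not a stream
    def fieldNames(self):
        return ['x', 'y']           # hypothetical field names
    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        raise NotImplementedYet()   # a real sub-class returns an iterator here
```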
114 class MinibatchToSingleExampleIterator(object): | 131 class MinibatchToSingleExampleIterator(object): |
115 """ | 132 """ |
116 Converts the result of a minibatch iterator with minibatch_size==1 into | 133 Converts the result of a minibatch iterator with minibatch_size==1 into |
117 single-example values in the result. Therefore the result of | 134 single-example values in the result. Therefore the result of |
318 """ | 335 """ |
319 dataset[i] returns the (i+1)-th example of the dataset. | 336 dataset[i] returns the (i+1)-th example of the dataset. |
320 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. | 337 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. |
321 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2s,... (up to but excluding j). | 338 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2s,... (up to but excluding j). |
322 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. | 339 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. |
340 dataset['key'] returns a property associated with the given 'key' string. | |
341 If 'key' is a fieldname, then the VStacked field values (iterable over | |
342 field values) for that field are returned. Other keys may be supported | |
343 by different dataset subclasses. The following key names are encouraged: | |
344 - 'description': a textual description or name for the dataset | |
345 - '<fieldname>.type': a type name or value for a given <fieldname> | |
323 | 346 |
324 Note that some stream datasets may be unable to implement random access, i.e. | 347 Note that some stream datasets may be unable to implement random access, i.e. |
325 arbitrary slicing/indexing | 348 arbitrary slicing/indexing |
326 because they can only iterate through examples one or a minibatch at a time | 349 because they can only iterate through examples one or a minibatch at a time |
327 and do not actually store or keep past (or future) examples. | 350 and do not actually store or keep past (or future) examples. |
329 The default implementation of getitem uses the minibatches iterator | 352 The default implementation of getitem uses the minibatches iterator |
330 to obtain one example, one slice, or a list of examples. It may not | 353 to obtain one example, one slice, or a list of examples. It may not |
331 always be the most efficient way to obtain the result, especially if | 354 always be the most efficient way to obtain the result, especially if |
332 the data are actually stored in a memory array. | 355 the data are actually stored in a memory array. |
333 """ | 356 """ |
357 # check for an index | |
334 if type(i) is int: | 358 if type(i) is int: |
335 return DataSet.MinibatchToSingleExampleIterator( | 359 return DataSet.MinibatchToSingleExampleIterator( |
336 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next() | 360 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next() |
361 rows=None | |
362 # or a slice | |
337 if type(i) is slice: | 363 if type(i) is slice: |
338 start = i.start or 0 | 364 start = i.start or 0 |
339 step = i.step or 1 | 365 step = i.step or 1 |
340 if step == 1: | 366 if step == 1: |
341 return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples() | 367 return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples() |
342 rows = range(start,i.stop,step) | 368 rows = range(start,i.stop,step) |
343 else: | 369 # or a list of indices |
344 assert type(i) is list | 370 elif type(i) is list: |
345 rows = i | 371 rows = i |
346 fields_values = zip(*[self[row] for row in rows]) | 372 if rows is not None: |
347 return MinibatchDataSet( | 373 fields_values = zip(*[self[row] for row in rows]) |
348 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) | 374 return MinibatchDataSet( |
349 for fieldname,field_values | 375 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) |
350 in zip(self.fieldNames(),fields_values)])) | 376 for fieldname,field_values |
377 in zip(self.fieldNames(),fields_values)])) | |
378 # else check for a fieldname | |
379 if self.hasFields(i): | |
380 return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0] | |
381 # else we are trying to access a property of the dataset | |
382 assert i in self.__dict__ # else it means we are trying to access a non-existing property | |
383 return self.__dict__[i] | |
351 | 384 |
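A hedged sketch of the indexing forms that this __getitem__ supports (ds and its field 'x' are hypothetical):

```python
example = ds[3]           # a single Example (the 4th one)
sub = ds[0:10]            # sub-dataset with examples 0..9
strided = ds[0:10:2]      # examples 0,2,4,6,8
picked = ds[[1, 3, 5]]    # examples 1, 3 and 5
xs = ds['x']              # all values of field 'x', if 'x' is a fieldname
desc = ds['description']  # otherwise, a property stored on the dataset
```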
352 def valuesHStack(self,fieldnames,fieldvalues): | 385 def valuesHStack(self,fieldnames,fieldvalues): |
353 """ | 386 """ |
354 Return a value that corresponds to concatenating (horizontally) several field values. | 387 Return a value that corresponds to concatenating (horizontally) several field values. |
355 This can be useful to merge some fields. The implementation of this operation is likely | 388 This can be useful to merge some fields. The implementation of this operation is likely |
459 if not fieldnames: | 492 if not fieldnames: |
460 fieldnames=dataset.fieldNames() | 493 fieldnames=dataset.fieldNames() |
461 assert dataset.hasFields(*fieldnames) | 494 assert dataset.hasFields(*fieldnames) |
462 LookupList.__init__(self,fieldnames, | 495 LookupList.__init__(self,fieldnames, |
463 dataset.minibatches(fieldnames, | 496 dataset.minibatches(fieldnames, |
464 minibatch_size=len(dataset)).next() | 497 minibatch_size=len(dataset)).next()) |
465 def examples(self): | 498 def examples(self): |
466 return self.dataset | 499 return self.dataset |
467 | 500 |
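For example (a sketch; ds is a hypothetical dataset with fields 'x' and 'y'):

```python
from dataset import DataSetFields

fields = DataSetFields(ds, ['x', 'y'])  # one VStacked value per listed field
ds_view = fields.examples()             # back to a DataSet view of the same data
```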
468 def __or__(self,other): | 501 def __or__(self,other): |
469 """ | 502 """ |
520 class Iterator(object): | 553 class Iterator(object): |
521 def __init__(self,ds): | 554 def __init__(self,ds): |
522 self.ds=ds | 555 self.ds=ds |
523 self.next_example=offset | 556 self.next_example=offset |
524 assert minibatch_size > 0 | 557 assert minibatch_size > 0 |
525 if offset+minibatch_size > ds.length | 558 if offset+minibatch_size > ds.length: |
526 raise NotImplementedError() | 559 raise NotImplementedError() |
527 def __iter__(self): | 560 def __iter__(self): |
528 return self | 561 return self |
529 def next(self): | 562 def next(self): |
530 upper = self.next_example+minibatch_size | 563 upper = self.next_example+minibatch_size |
552 raised or the fields are renamed (either by prefixing each field with the dataset's | 585 raised or the fields are renamed (either by prefixing each field with the dataset's |
553 __name__ attribute + ".", if it exists, or by suffixing the dataset's index in the argument list). | 586 __name__ attribute + ".", if it exists, or by suffixing the dataset's index in the argument list). |
554 | 587 |
555 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... | 588 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... |
556 """ | 589 """ |
557 def __init__(self,datasets,accept_nonunique_names=False): | 590 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): |
558 DataSet.__init__(self) | 591 DataSet.__init__(self,description,field_types) |
559 self.datasets=datasets | 592 self.datasets=datasets |
560 self.accept_nonunique_names=accept_nonunique_names | 593 self.accept_nonunique_names=accept_nonunique_names |
561 self.fieldname2dataset={} | 594 self.fieldname2dataset={} |
562 | 595 |
563 def rename_field(fieldname,dataset,i): | 596 def rename_field(fieldname,dataset,i): |
594 return True | 627 return True |
595 | 628 |
596 def fieldNames(self): | 629 def fieldNames(self): |
597 return self.fieldname2dataset.keys() | 630 return self.fieldname2dataset.keys() |
598 | 631 |
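A hedged usage sketch (ds1 and ds2 are hypothetical datasets of equal length; per the docstring above, clashing field names are presumably renamed when accept_nonunique_names is True and raise an error otherwise):

```python
from dataset import HStackedDataSet

hs = HStackedDataSet([ds1, ds2])  # fields of both datasets, names must be unique
hs2 = HStackedDataSet([ds1, ds2], accept_nonunique_names=True)
print hs2.fieldNames()            # clashing names prefixed/suffixed as described
```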
599 def minibatches_nowrap(self, | 632 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
600 fieldnames = minibatches_fieldnames, | |
601 minibatch_size = minibatches_minibatch_size, | |
602 n_batches = minibatches_n_batches, | |
603 offset = 0): | |
604 | 633 |
605 class Iterator(object): | 634 class Iterator(object): |
606 def __init__(self,hsds,iterators): | 635 def __init__(self,hsds,iterators): |
607 self.hsds=hsds | 636 self.hsds=hsds |
608 self.iterators=iterators | 637 self.iterators=iterators |
697 """Return (dataset_index, row_within_dataset) for global row number""" | 726 """Return (dataset_index, row_within_dataset) for global row number""" |
698 dataset_index = self.index2dataset[row] | 727 dataset_index = self.index2dataset[row] |
699 row_within_dataset = row - self.datasets_start_row[dataset_index] | 728 row_within_dataset = row - self.datasets_start_row[dataset_index] |
700 return dataset_index, row_within_dataset | 729 return dataset_index, row_within_dataset |
701 | 730 |
702 def minibatches_nowrap(self, | 731 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
703 fieldnames = minibatches_fieldnames, | |
704 minibatch_size = minibatches_minibatch_size, | |
705 n_batches = minibatches_n_batches, | |
706 offset = 0): | |
707 | 732 |
708 class Iterator(object): | 733 class Iterator(object): |
709 def __init__(self,vsds): | 734 def __init__(self,vsds): |
710 self.vsds=vsds | 735 self.vsds=vsds |
711 self.next_row=offset | 736 self.next_row=offset |
760 self.next_dataset_row+=minibatch_size | 785 self.next_dataset_row+=minibatch_size |
761 if self.next_row+minibatch_size>len(dataset): | 786 if self.next_row+minibatch_size>len(dataset): |
762 self.move_to_next_dataset() | 787 self.move_to_next_dataset() |
763 return | 788 return |
764 | 789 |
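A worked sketch of this mapping, reusing the index2dataset and datasets_start_row names from the code above (the concrete values are hypothetical, for two stacked datasets of lengths 3 and 2):

```python
datasets_start_row = [0, 3]      # first global row of each stacked dataset
index2dataset = [0, 0, 0, 1, 1]  # global row -> index of the dataset holding it
row = 4
dataset_index = index2dataset[row]                    # 1
row_within = row - datasets_start_row[dataset_index]  # 4 - 3 == 1
```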
790 class ArrayFieldsDataSet(DataSet): | |
791 """ | |
792 Virtual super-class of datasets whose field values are numpy arrays, | |
793 thus defining valuesHStack and valuesVStack for sub-classes. | |
794 """ | |
795 def __init__(self,description=None,field_types=None): | |
796 DataSet.__init__(self,description,field_types) | |
797 def valuesHStack(self,fieldnames,fieldvalues): | |
798 """Concatenate field values horizontally, e.g. two vectors | |
799 become a longer vector, two matrices become a wider matrix, etc.""" | |
800 return numpy.hstack(fieldvalues) | |
801 def valuesVStack(self,fieldname,values): | |
802 """Concatenate field values vertically, e.g. two vectors | |
803 become a two-row matrix, two matrices become a longer matrix, etc.""" | |
804 return numpy.vstack(values) | |
805 | |
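Concretely, with plain numpy (this mirrors what the two methods above do; the module is assumed to import numpy):

```python
import numpy

a = numpy.array([1., 2.])
b = numpy.array([3., 4.])
print numpy.hstack([a, b])  # [ 1.  2.  3.  4.] -- one longer vector
print numpy.vstack([a, b])  # [[ 1.  2.]
                            #  [ 3.  4.]] -- a two-row matrix
```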
806 class ArrayDataSet(ArrayFieldsDataSet): | |
807 """ | |
808 An ArrayDataSet stores the fields as groups of columns in a numpy tensor, | |
809 whose first axis iterates over examples, second axis determines fields. | |
810 If the underlying array is N-dimensional (has N axes), then the field | |
811 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). | |
812 """ | |
813 | |
814 """ | |
815 Construct an ArrayDataSet from the underlying numpy array (data) and | |
816 a map from fieldnames to field columns. The columns of a field are specified | |
817 using the standard arguments for indexing/slicing: integer for a column index, | |
818 slice for an interval of columns (with possible stride), or iterable of column indices. | |
819 """ | |
820 def __init__(self, data_array, fields_names_columns): | |
821 self.data=data_array | |
822 self.fields=fields_names_columns | |
823 | |
824 # check consistency and complete slices definitions | |
825 for fieldname, fieldcolumns in self.fields.items(): | |
826 if type(fieldcolumns) is int: | |
827 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1] | |
828 elif type(fieldcolumns) is slice: | |
829 # fill in any missing start/step of the slice so that | |
830 # both become explicit, preserving values that were | |
831 # given explicitly | |
832 start = fieldcolumns.start or 0 | |
833 step = fieldcolumns.step or 1 | |
834 if start!=fieldcolumns.start or step!=fieldcolumns.step: | |
835 self.fields[fieldname]=slice(start,fieldcolumns.stop,step) | |
836 elif hasattr(fieldcolumns,"__iter__"): # something like a list | |
837 for i in fieldcolumns: | |
838 assert i>=0 and i<data_array.shape[1] | |
839 | |
840 def fieldNames(self): | |
841 return self.fields.keys() | |
842 | |
843 def __len__(self): | |
844 return len(self.data) | |
845 | |
846 #def __getitem__(self,i): | |
847 # """More efficient implementation than the default""" | |
848 | |
849 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): | |
850 class Iterator(LookupList): # store the result in the lookup-list values | |
851 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): | |
852 if fieldnames is None: fieldnames = dataset.fieldNames() | |
853 LookupList.__init__(self,fieldnames,[0]*len(fieldnames)) | |
854 self.dataset=dataset | |
855 self.minibatch_size=minibatch_size | |
856 assert offset>=0 and offset<len(dataset.data) | |
857 assert offset+minibatch_size<=len(dataset.data) | |
858 self.current=offset | |
859 def __iter__(self): | |
860 return self | |
861 def next(self): | |
862 # stop iterating once the data is exhausted | |
863 if self.current>=len(self.dataset.data): raise StopIteration | |
864 sub_data = self.dataset.data[self.current:self.current+self.minibatch_size] | |
865 self._values = [sub_data[:,self.dataset.fields[f]] for f in self._names] | |
866 self.current+=self.minibatch_size # advance to the next minibatch | |
867 return self | |
765 | 868 |
869 return Iterator(self,fieldnames,minibatch_size,n_batches,offset) | |
870 | |
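Putting the pieces together, a hedged end-to-end sketch of ArrayDataSet (assuming the module imports numpy; minibatch fields are accessed here by position, as the __getitem__ code above does with a LookupList):

```python
import numpy
from dataset import ArrayDataSet

# 4 examples, 3 columns: columns 0-1 form field 'input', column 2 is 'target'
data = numpy.arange(12.).reshape(4, 3)
ds = ArrayDataSet(data, {'input': slice(0, 2), 'target': 2})
print len(ds)          # 4
print ds.fieldNames()  # ['input', 'target'] (dict ordering not guaranteed)
for batch in ds.minibatches(['input', 'target'], minibatch_size=2, n_batches=2, offset=0):
    inputs, targets = batch[0], batch[1]  # positional access into the LookupList
    print inputs, targets
```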
766 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): | 868 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): |
767 """ | 869 """ |
768 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the | 870 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the |
769 user to define a set of fields as the 'input' field and a set of fields | 871 user to define a set of fields as the 'input' field and a set of fields |
770 as the 'target' field. Optionally, a single weight_field can also be defined. | 872 as the 'target' field. Optionally, a single weight_field can also be defined. |