comparison dataset.py @ 41:283e95c15b47

Added ArrayDataSet
author bengioy@grenat.iro.umontreal.ca
date Fri, 25 Apr 2008 12:04:55 -0400
parents 88fd1cce08b9
children 9b68774fcc6b
40:88fd1cce08b9 41:283e95c15b47
1 1
2 from lookup_list import LookupList 2 from lookup_list import LookupList
3 Example = LookupList 3 Example = LookupList
4 from misc import * 4 from misc import *
5 import copy 5 import copy
6 import string
6 7
7 class AbstractFunction (Exception): """Derived class must override this function""" 8 class AbstractFunction (Exception): """Derived class must override this function"""
8 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" 9 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
9 class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)""" 10 class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)"""
10 11
73 74
74 * dataset[i] returns an Example. 75 * dataset[i] returns an Example.
75 76
76 * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. 77 * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
77 78
79 * dataset['key'] returns a property associated with the given 'key' string.
80 If 'key' is a fieldname, then the VStacked field values (iterable over
81 field values) for that field is returned. Other keys may be supported
82 by different dataset subclasses. The following key names should be supported:
83 - 'description': a textual description or name for the dataset
84 - '<fieldname>.type': a type name or value for a given <fieldname>
85
78 Datasets can be concatenated either vertically (increasing the length) or 86 Datasets can be concatenated either vertically (increasing the length) or
79 horizontally (augmenting the set of fields), if they are compatible, using 87 horizontally (augmenting the set of fields), if they are compatible, using
80 the following operations (with the same basic semantics as numpy.hstack 88 the following operations (with the same basic semantics as numpy.hstack
81 and numpy.vstack): 89 and numpy.vstack):
82 90
94 According to the same logic, and viewing a DataSetFields object associated to 102 According to the same logic, and viewing a DataSetFields object associated to
95 a DataSet as a kind of transpose of it, fields1 + fields2 concatenates fields of 103 a DataSet as a kind of transpose of it, fields1 + fields2 concatenates fields of
96 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their 104 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
97 examples. 105 examples.
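# A minimal sketch of the operators just described, assuming ds1 and ds2 are two
# compatible datasets built elsewhere with this module (the names are illustrative):
#
#   ds_wide = ds1 | ds2            # hstack: same length, union of the fields
#   ds_long = ds1 + ds2            # vstack: same fields, len(ds1)+len(ds2) examples
#   fields1 = DataSetFields(ds1, ['x','y'])
#   fields2 = DataSetFields(ds2, ['x','y'])
#   fields1 + fields2              # transposed view: concatenates the fields
#   fields1 | fields2              # transposed view: concatenates their examples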
98 106
107 A dataset can hold arbitrary key-value pairs that give access to meta-data or
108 other properties of the dataset, or to the result of a computation stored with
109 the dataset. These can be accessed through the [key] syntax
110 when key is a string (more specifically, neither an integer, a slice, nor a list).
111
99 A DataSet sub-class should always redefine the following methods: 112 A DataSet sub-class should always redefine the following methods:
100 * __len__ if it is not a stream 113 * __len__ if it is not a stream
101 * fieldNames 114 * fieldNames
102 * minibatches_nowrap (called by DataSet.minibatches()) 115 * minibatches_nowrap (called by DataSet.minibatches())
103 * valuesHStack 116 * valuesHStack
106 * hasFields 119 * hasFields
107 * __getitem__ may not be feasible with some streams 120 * __getitem__ may not be feasible with some streams
108 * __iter__ 121 * __iter__
109 """ 122 """
110 123
111 def __init__(self): 124 def __init__(self,description=None,field_types=None):
112 pass 125 if description is None:
126 # default description: "<DataSetType> ( <SuperClass1> <SuperClass2> ... )"
127 description = type(self).__name__ + " ( " + string.join([x.__name__ for x in type(self).__bases__]) + " )"
128 self.description=description
129 self.field_types=field_types
113 130
114 class MinibatchToSingleExampleIterator(object): 131 class MinibatchToSingleExampleIterator(object):
115 """ 132 """
116 Converts the result of minibatch iterator with minibatch_size==1 into 133 Converts the result of minibatch iterator with minibatch_size==1 into
117 single-example values in the result. Therefore the result of 134 single-example values in the result. Therefore the result of
318 """ 335 """
319 dataset[i] returns the (i+1)-th example of the dataset. 336 dataset[i] returns the (i+1)-th example of the dataset.
320 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. 337 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
321 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... up to but excluding j. 338 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... up to but excluding j.
322 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. 339 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
340 dataset['key'] returns a property associated with the given 'key' string.
341 If 'key' is a fieldname, then the VStacked field values (iterable over
342 field values) for that field is returned. Other keys may be supported
343 by different dataset subclasses. The following key names are encouraged:
344 - 'description': a textual description or name for the dataset
345 - '<fieldname>.type': a type name or value for a given <fieldname>
323 346
324 Note that some stream datasets may be unable to implement random access, i.e. 347 Note that some stream datasets may be unable to implement random access, i.e.
325 arbitrary slicing/indexing, 348 arbitrary slicing/indexing,
326 because they can only iterate through examples one at a time (or one minibatch at a time) 349 because they can only iterate through examples one at a time (or one minibatch at a time)
327 and do not actually store or keep past (or future) examples. 350 and do not actually store or keep past (or future) examples.
329 The default implementation of __getitem__ uses the minibatches iterator 352 The default implementation of __getitem__ uses the minibatches iterator
330 to obtain one example, one slice, or a list of examples. It may not 353 to obtain one example, one slice, or a list of examples. It may not
331 always be the most efficient way to obtain the result, especially if 354 always be the most efficient way to obtain the result, especially if
332 the data are actually stored in a memory array. 355 the data are actually stored in a memory array.
333 """ 356 """
357 # check for an index
334 if type(i) is int: 358 if type(i) is int:
335 return DataSet.MinibatchToSingleExampleIterator( 359 return DataSet.MinibatchToSingleExampleIterator(
336 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next() 360 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
361 rows=None
362 # or a slice
337 if type(i) is slice: 363 if type(i) is slice:
338 start = i.start or 0 # slice attributes are read-only: use local defaults 364 start = i.start or 0 # slice attributes are read-only: use local defaults
339 step = i.step or 1 365 step = i.step or 1
340 if step == 1: 366 if step == 1:
341 return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples() 367 return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples()
342 rows = range(start,i.stop,step) 368 rows = range(start,i.stop,step)
343 else: 369 # or a list of indices
344 assert type(i) is list 370 elif type(i) is list:
345 rows = i 371 rows = i
346 fields_values = zip(*[self[row] for row in rows]) 372 if rows is not None:
347 return MinibatchDataSet( 373 fields_values = zip(*[self[row] for row in rows])
348 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) 374 return MinibatchDataSet(
349 for fieldname,field_values 375 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
350 in zip(self.fieldNames(),fields_values)])) 376 for fieldname,field_values
377 in zip(self.fieldNames(),fields_values)]))
378 # else check for a fieldname
379 if self.hasFields(i):
380 return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
381 # else we are trying to access a property of the dataset
382 assert i in self.__dict__ # else it means we are trying to access a non-existing property
383 return self.__dict__[i]
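# With the dispatch above, the single [] syntax covers every documented case.
# A sketch, assuming ds is a bounded dataset with fields 'x' and 'y':
#
#   ds[3]              # int: a single Example
#   ds[0:10]           # contiguous slice: a sub-dataset of examples 0..9
#   ds[0:10:2]         # strided slice: examples 0,2,4,6,8
#   ds[[1,4,7]]        # list: a sub-dataset with the listed examples
#   ds['x']            # fieldname: the VStacked values of field 'x'
#   ds['description']  # other string: a property looked up in self.__dict__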
351 384
352 def valuesHStack(self,fieldnames,fieldvalues): 385 def valuesHStack(self,fieldnames,fieldvalues):
353 """ 386 """
354 Return a value that corresponds to concatenating (horizontally) several field values. 387 Return a value that corresponds to concatenating (horizontally) several field values.
355 This can be useful to merge some fields. The implementation of this operation is likely 388 This can be useful to merge some fields. The implementation of this operation is likely
459 if not fieldnames: 492 if not fieldnames:
460 fieldnames=dataset.fieldNames() 493 fieldnames=dataset.fieldNames()
461 assert dataset.hasFields(*fieldnames) 494 assert dataset.hasFields(*fieldnames)
462 LookupList.__init__(self,dataset.fieldNames(), 495 LookupList.__init__(self,dataset.fieldNames(),
463 dataset.minibatches(fieldnames if len(fieldnames)>0 else self.fieldNames(), 496 dataset.minibatches(fieldnames if len(fieldnames)>0 else self.fieldNames(),
464 minibatch_size=len(dataset)).next() 497 minibatch_size=len(dataset)).next())
465 def examples(self): 498 def examples(self):
466 return self.dataset 499 return self.dataset
467 500
468 def __or__(self,other): 501 def __or__(self,other):
469 """ 502 """
520 class Iterator(object): 553 class Iterator(object):
521 def __init__(self,ds): 554 def __init__(self,ds):
522 self.ds=ds 555 self.ds=ds
523 self.next_example=offset 556 self.next_example=offset
524 assert minibatch_size > 0 557 assert minibatch_size > 0
525 if offset+minibatch_size > ds.length 558 if offset+minibatch_size > ds.length:
526 raise NotImplementedError() 559 raise NotImplementedError()
527 def __iter__(self): 560 def __iter__(self):
528 return self 561 return self
529 def next(self): 562 def next(self):
530 upper = self.next_example+minibatch_size 563 upper = self.next_example+minibatch_size
552 raised or the fields are renamed (either by prefixing with the dataset's __name__ attribute 585 raised or the fields are renamed (either by prefixing with the dataset's __name__ attribute
553 + ".", if it exists, or by suffixing with the dataset's index in the argument list). 586 + ".", if it exists, or by suffixing with the dataset's index in the argument list).
554 587
555 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... 588 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ...
556 """ 589 """
557 def __init__(self,datasets,accept_nonunique_names=False): 590 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
558 DataSet.__init__(self) 591 DataSet.__init__(self,description,field_types)
559 self.datasets=datasets 592 self.datasets=datasets
560 self.accept_nonunique_names=accept_nonunique_names 593 self.accept_nonunique_names=accept_nonunique_names
561 self.fieldname2dataset={} 594 self.fieldname2dataset={}
562 595
563 def rename_field(fieldname,dataset,i): 596 def rename_field(fieldname,dataset,i):
594 return True 627 return True
595 628
596 def fieldNames(self): 629 def fieldNames(self):
597 return self.fieldname2dataset.keys() 630 return self.fieldname2dataset.keys()
598 631
599 def minibatches_nowrap(self, 632 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
600 fieldnames = minibatches_fieldnames,
601 minibatch_size = minibatches_minibatch_size,
602 n_batches = minibatches_n_batches,
603 offset = 0):
604 633
605 class Iterator(object): 634 class Iterator(object):
606 def __init__(self,hsds,iterators): 635 def __init__(self,hsds,iterators):
607 self.hsds=hsds 636 self.hsds=hsds
608 self.iterators=iterators 637 self.iterators=iterators
697 """Return (dataset_index, row_within_dataset) for global row number""" 726 """Return (dataset_index, row_within_dataset) for global row number"""
698 dataset_index = self.index2dataset[row] 727 dataset_index = self.index2dataset[row]
699 row_within_dataset = row - self.datasets_start_row[dataset_index] 728 row_within_dataset = row - self.datasets_start_row[dataset_index]
700 return dataset_index, row_within_dataset 729 return dataset_index, row_within_dataset
701 730
702 def minibatches_nowrap(self, 731 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
703 fieldnames = minibatches_fieldnames,
704 minibatch_size = minibatches_minibatch_size,
705 n_batches = minibatches_n_batches,
706 offset = 0):
707 732
708 class Iterator(object): 733 class Iterator(object):
709 def __init__(self,vsds): 734 def __init__(self,vsds):
710 self.vsds=vsds 735 self.vsds=vsds
711 self.next_row=offset 736 self.next_row=offset
760 self.next_dataset_row+=minibatch_size 785 self.next_dataset_row+=minibatch_size
761 if self.next_row+minibatch_size>len(dataset): 786 if self.next_row+minibatch_size>len(dataset):
762 self.move_to_next_dataset() 787 self.move_to_next_dataset()
763 return 788 return
764 789
790 class ArrayFieldsDataSet(DataSet):
791 """
792 Virtual super-class of datasets whose field values are numpy arrays,
793 thus defining valuesHStack and valuesVStack for sub-classes.
794 """
795 def __init__(self,description=None,field_types=None):
796 DataSet.__init__(self,description,field_types)
797 def valuesHStack(self,fieldnames,fieldvalues):
798 """Concatenate field values horizontally, e.g. two vectors
799 become a longer vector, two matrices become a wider matrix, etc."""
800 return numpy.hstack(fieldvalues)
801 def valuesVStack(self,fieldname,values):
802 """Concatenate field values vertically, e.g. two vectors
803 become a two-row matrix, two matrices become a longer matrix, etc."""
804 return numpy.vstack(values)
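# What the two defaults above compute, shown with plain numpy on made-up field
# values (runnable on its own, assuming numpy is installed):
#
#   import numpy
#   a = numpy.array([1., 2.]) # value of one field for one example
#   b = numpy.array([3., 4.])
#   numpy.hstack([a, b])      # array([ 1., 2., 3., 4.]): one wider example
#   numpy.vstack([a, b])      # array([[ 1., 2.], [ 3., 4.]]): a two-row minibatch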
805
806 class ArrayDataSet(ArrayFieldsDataSet):
807 """
808 An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
809 whose first axis iterates over examples, second axis determines fields.
810 If the underlying array is N-dimensional (has N axes), then the field
811 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
812 """
813
814 """
815 Construct an ArrayDataSet from the underlying numpy array (data) and
816 a map from fieldnames to field columns. The columns of a field are specified
817 using the standard arguments for indexing/slicing: integer for a column index,
818 slice for an interval of columns (with possible stride), or iterable of column indices.
819 """
820 def __init__(self, data_array, fields_names_columns):
821 self.data=data_array
822 self.fields=fields_names_columns
823
824 # check consistency and complete slices definitions
825 for fieldname, fieldcolumns in self.fields.items():
826 if type(fieldcolumns) is int:
827 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
828 elif type(fieldcolumns) is slice:
829 # fill in missing slice defaults without losing the given start/stop/step
830 start = fieldcolumns.start or 0
831 step = fieldcolumns.step or 1
832 if start != fieldcolumns.start or step != fieldcolumns.step:
833 self.fields[fieldname]=slice(start,fieldcolumns.stop,step)
836 elif hasattr(fieldcolumns,"__iter__"): # something like a list
837 for i in fieldcolumns:
838 assert i>=0 and i<data_array.shape[1]
839
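# A construction sketch for the consistency checks above (array shape and field
# layout are made up for illustration; assumes numpy is imported):
#
#   data = numpy.random.rand(100, 5)               # 100 examples, 5 columns
#   ds = ArrayDataSet(data, {'input': slice(0, 4), # columns 0..3 -> field 'input'
#                            'target': 4})         # column 4 -> field 'target'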
840 def fieldNames(self):
841 return self.fields.keys()
842
843 def __len__(self):
844 return len(self.data)
845
846 #def __getitem__(self,i):
847 # """More efficient implementation than the default"""
848
849 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
850 class Iterator(LookupList): # store the result in the lookup-list values
851 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
852 if fieldnames is None: fieldnames = dataset.fieldNames()
853 LookupList.__init__(self,fieldnames,[0]*len(fieldnames))
854 self.dataset=dataset
855 self.minibatch_size=minibatch_size
856 assert offset>=0 and offset<len(dataset.data)
857 assert offset+minibatch_size<=len(dataset.data)
858 self.current=offset
859 def __iter__(self):
860 return self
861 def next(self): # note: n_batches is not yet enforced here
862 if self.current >= len(self.dataset.data): # past the end: stop iterating
863 raise StopIteration
864 sub_data = self.dataset.data[self.current:self.current+self.minibatch_size]
865 self._values = [sub_data[:,self.dataset.fields[f]] for f in self._names]
866 self.current += self.minibatch_size # advance, else the same minibatch repeats forever
867 return self
765 865
866 return Iterator(self,fieldnames,minibatch_size,n_batches,offset)
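# Iterating the result, continuing the construction sketch above; minibatches are
# LookupLists indexed by position in the requested fieldnames:
#
#   for batch in ds.minibatches(['input','target'], minibatch_size=10, n_batches=10, offset=0):
#       inputs, targets = batch[0], batch[1] # 10x4 matrix and length-10 vector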
867
766 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): 868 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
767 """ 869 """
768 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the 870 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the
769 user to define a set of fields as the 'input' field and a set of fields 871 user to define a set of fields as the 'input' field and a set of fields
770 as the 'target' field. Optionally, a single weight_field can also be defined. 872 as the 'target' field. Optionally, a single weight_field can also be defined.