diff dataset.py @ 40:88fd1cce08b9
replaced the infinite-length convention by raising UnboundedDataSet; use & instead of + to concatenate datasets
| author   | bengioy@esprit.iro.umontreal.ca |
|----------|---------------------------------|
| date     | Fri, 25 Apr 2008 10:41:19 -0400 |
| parents  | c682c6e9bf93                    |
| children | 283e95c15b47                    |
```diff
--- a/dataset.py	Thu Apr 24 14:46:10 2008 -0400
+++ b/dataset.py	Fri Apr 25 10:41:19 2008 -0400
@@ -6,6 +6,7 @@
 
 class AbstractFunction (Exception): """Derived class must override this function"""
 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
+class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)"""
 
 class DataSet(object):
     """A virtual base class for datasets.
@@ -15,7 +16,8 @@
     columns/attributes are called fields. The field value for a particular example can be an
     arbitrary python object, which depends on the particular dataset.
 
-    We call a DataSet a 'stream' when its length is unbounded (len(dataset)==float("infinity")).
+    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
+    should raise an UnboundedDataSet exception).
 
     A DataSet is a generator of iterators; these iterators can run through the
     examples or the fields in a variety of ways.  A DataSet need not necessarily have a finite
@@ -27,6 +29,7 @@
     * for example in dataset([field1, field2,field3, ...]):
     * for val1,val2,val3 in dataset([field1, field2,field3]):
     * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, ...],minibatch_size=N):
     * for example in dataset:
     Each of these is documented below. All of these iterators are expected to
     provide, in addition to the usual 'next()' method, a 'next_index()' method
@@ -82,7 +85,7 @@
     creates a new dataset whose list of fields is the concatenation of the list of fields
     of the argument datasets. This only works if they all have the same length.
 
-    * dataset1 + dataset2 + dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
+    * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
 
     creates a new dataset that concatenates the examples from the argument datasets
     (and whose length is the sum of the length of the argument datasets). This only
@@ -93,20 +96,18 @@
     a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
     examples.
-
     A DataSet sub-class should always redefine the following methods:
        * __len__ if it is not a stream
-       * __getitem__ may not be feasible with some streams
       * fieldNames
       * minibatches_nowrap (called by DataSet.minibatches())
       * valuesHStack
       * valuesVStack
     For efficiency of implementation, a sub-class might also want to redefine
       * hasFields
+      * __getitem__ may not be feasible with some streams
+      * __iter__
    """
 
-    infinity = float("infinity")
-
    def __init__(self):
        pass
@@ -124,7 +125,9 @@
         def __iter__(self): #makes for loop work
             return self
         def next(self):
-            return self.minibatch_iterator.next()[0]
+            size1_minibatch = self.minibatch_iterator.next()
+            return Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
+
         def next_index(self):
             return self.minibatch_iterator.next_index()
@@ -223,9 +226,6 @@
         a list-like container of the f2 field, etc.
 
         Using the first syntax, all the fields will be returned in "i".
-        Beware that some datasets may not support this syntax, if the number
-        of fields is infinite (i.e. field values may be computed "on demand").
-
         Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
@@ -277,13 +277,11 @@
     def __len__(self):
         """
        len(dataset) returns the number of examples in the dataset.
-       By default, a DataSet is a 'stream', i.e. it has an unbounded (infinite) length.
+       By default, a DataSet is a 'stream', i.e. it has an unbounded length (raises UnboundedDataSet).
        Sub-classes which implement finite-length datasets should redefine this method.
-       Some methods only make sense for finite-length datasets, and will perform
-           assert len(dataset)<DataSet.infinity
-       in order to check the finiteness of the dataset.
+       Some methods only make sense for finite-length datasets.
        """
-        return infinity
+        raise UnboundedDataSet()
 
     def hasFields(self,*fieldnames):
         """
@@ -327,8 +325,29 @@
         arbitrary slicing/indexing because they can only iterate through
         examples one or a minibatch at a time and do not actually store or keep
         past (or future) examples.
+
+        The default implementation of getitem uses the minibatches iterator
+        to obtain one example, one slice, or a list of examples. It may not
+        always be the most efficient way to obtain the result, especially if
+        the data are actually stored in a memory array.
         """
-        raise NotImplementedError()
+        if type(i) is int:
+            return DataSet.MinibatchToSingleExampleIterator(
+                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
+        if type(i) is slice:
+            start = i.start or 0   # slice attributes are read-only: use local copies
+            step = i.step or 1
+            if step == 1:
+                return self.minibatches(minibatch_size=i.stop-start,
+                                        n_batches=1,offset=start).next().examples()
+            rows = range(start,i.stop,step)
+        else:
+            assert type(i) is list
+            rows = i
+        # fetch the examples one by one, then restack each field with valuesVStack
+        fields_values = zip(*[self[row] for row in rows])
+        return MinibatchDataSet(
+            Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
+                                        for fieldname,field_values
+                                        in zip(self.fieldNames(),fields_values)]))
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -377,9 +396,9 @@
         """
         return HStackedDataSet(self,other)
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
-        dataset1 + dataset2 is a dataset that concatenates the examples from the argument datasets
+        dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
         (and whose length is the sum of the length of the argument datasets). This only
         works if they all have the same fields.
         """
@@ -398,7 +417,7 @@
 
 def vstack(datasets):
     """
-    vstack(dataset1,dataset2,...) returns dataset1 + datataset2 + ...
+    vstack(dataset1,dataset2,...) returns dataset1 & dataset2 & ...
     which is a dataset which iterates first over the examples of dataset1, then
     over those of dataset2, etc.
     """
@@ -430,9 +449,15 @@
     be turned back into a DataSet with its examples() method:
        dataset2 = dataset1.fields().examples()
     and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
+
+    DataSetFields can be concatenated vertically or horizontally. To be consistent with
+    the syntax used for DataSets, the | concatenates the fields and the & concatenates
+    the examples.
     """
     def __init__(self,dataset,*fieldnames):
         self.dataset=dataset
+        if not fieldnames:
+            fieldnames=dataset.fieldNames()
         assert dataset.hasFields(*fieldnames)
         LookupList.__init__(self,dataset.fieldNames(),
                             dataset.minibatches(fieldnames if len(fieldnames)>0 else self.fieldNames(),
@@ -447,7 +472,7 @@
         """
         return (self.examples() + other.examples()).fields()
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
         fields1 + fields2 is a DataSetFields whose list of fields is the concatenation
         of the fields of DataSetFields fields1 and fields2.
@@ -479,7 +504,8 @@
         return self.length
 
     def __getitem__(self,i):
-        return Example(self.fields.keys(),[field[i] for field in self.fields])
+        return DataSetFields(MinibatchDataSet(
+            Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields)
 
     def fieldNames(self):
         return self.fields.keys()
@@ -509,7 +535,7 @@
                 self.next_example+=minibatch_size
                 return DataSetFields(MinibatchDataSet(minibatch),fieldnames)
 
-        return MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        return Iterator(self)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -639,8 +665,11 @@
         # We use this map from row index to dataset index for constant-time random access of examples,
         # to avoid having to search for the appropriate dataset each time and slice is asked for.
         for dataset,k in enumerate(datasets[0:-1]):
-            L=len(dataset)
-            assert L<DataSet.infinity
+            try:
+                L=len(dataset)
+            except UnboundedDataSet:
+                print "All VStacked datasets (except possibly the last) must be bounded (have a length)."
+                assert False
             for i in xrange(L):
                 self.index2dataset[self.length+i]=k
             self.datasets_start_row.append(self.length)
@@ -721,14 +750,17 @@
             while self.n_left_in_mb>0:
                 self.move_to_next_dataset()
                 extra_mb.append(self.next_iterator.next())
-            mb = Example(names,
-                         [dataset.valuesVStack(name,[mb[name]]+[b[name] for b in extra_mb])
-                          for name in fieldnames])
+            examples = Example(names,
+                               [dataset.valuesVStack(name,
+                                                     [mb[name]]+[b[name] for b in extra_mb])
+                                for name in fieldnames])
+            mb = DataSetFields(MinibatchDataSet(examples),fieldnames)
+        self.next_row+=minibatch_size
         self.next_dataset_row+=minibatch_size
         if self.next_row+minibatch_size>len(dataset):
             self.move_to_next_dataset()
-            return mb
+        return mb
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
```
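A few sketches of the behaviors this changeset documents follow. First, the new length protocol: bounded datasets return a length from __len__, while streams raise UnboundedDataSet instead of returning float("infinity"). The RandomStream and TenExamples classes are invented for illustration (Python 2, matching the codebase):

```python
class UnboundedDataSet(Exception):
    """Trying to obtain length of unbounded dataset (a stream)"""

class DataSet(object):
    def __len__(self):
        # streams are unbounded: signal it instead of returning infinity
        raise UnboundedDataSet()

class RandomStream(DataSet):
    pass                        # a stream: inherits the raising __len__

class TenExamples(DataSet):
    def __len__(self):          # bounded datasets redefine __len__
        return 10

try:
    len(RandomStream())
except UnboundedDataSet:
    print "stream: no length"
print len(TenExamples())        # -> 10
```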
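Second, the operator convention the docstrings describe: | concatenates fields (hstack) and & concatenates examples (vstack, formerly +). ToyDataSet below is a made-up stand-in for the real HStackedDataSet/VStackedDataSet machinery and shows only the intended semantics:

```python
class ToyDataSet(object):
    def __init__(self, fields):          # fields: dict fieldname -> list of values
        self.fields = fields
    def __len__(self):
        return len(self.fields.values()[0])
    def __or__(self, other):
        # dataset1 | dataset2: concatenate *fields*; lengths must match
        assert len(self) == len(other)
        merged = dict(self.fields)
        merged.update(other.fields)
        return ToyDataSet(merged)
    def __and__(self, other):
        # dataset1 & dataset2: concatenate *examples*; fields must match
        assert set(self.fields) == set(other.fields)
        return ToyDataSet(dict((name, self.fields[name] + other.fields[name])
                               for name in self.fields))

d1 = ToyDataSet({'x': [1, 2], 'y': [3, 4]})
d2 = ToyDataSet({'x': [5],    'y': [6]})
d3 = ToyDataSet({'z': [7, 8]})
print len(d1 & d2)                  # 3 examples, fields x,y (was d1 + d2)
print sorted((d1 | d3).fields)      # ['x', 'y', 'z']
```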
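Third, the dispatch performed by the new default DataSet.__getitem__. The function below is a standalone stand-in that only reports which path an index takes; the real method returns an Example, a minibatch's examples(), or a MinibatchDataSet built with valuesVStack:

```python
def getitem_dispatch(i):
    # mirrors the int / slice / list branches of the default __getitem__
    if type(i) is int:
        return "one example via a size-1 minibatch at offset %d" % i
    if type(i) is slice:
        start, step = i.start or 0, i.step or 1
        if step == 1:
            return "one contiguous minibatch of size %d" % (i.stop - start)
        rows = range(start, i.stop, step)
    else:
        assert type(i) is list
        rows = i
    return "rows %s fetched one by one, restacked with valuesVStack" % rows

print getitem_dispatch(3)
print getitem_dispatch(slice(10, 14))
print getitem_dispatch(slice(0, 10, 2))
print getitem_dispatch([1, 5, 7])
```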
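Finally, why the newly documented `for mini1,mini2,mini3 in dataset.minibatches(...)` form unpacks, and what the rewritten MinibatchToSingleExampleIterator.next() builds from a size-1 minibatch. The Example class here is a minimal stand-in for the LookupList-based one in the codebase:

```python
class Example(object):
    def __init__(self, names, values):
        self._names, self._values = list(names), list(values)
    def keys(self): return self._names
    def values(self): return self._values
    def __iter__(self): return iter(self._values)   # enables tuple unpacking

size1_minibatch = Example(['x', 'y'], [[7], [8]])   # each field holds one value
single = Example(size1_minibatch.keys(),
                 [value[0] for value in size1_minibatch.values()])
x, y = single          # unpacking via __iter__
print x, y             # -> 7 8
```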