# HG changeset patch
# User bengioy@esprit.iro.umontreal.ca
# Date 1209134479 14400
# Node ID 88fd1cce08b934bb0f0491cf203a6bb132be0075
# Parent c682c6e9bf935272b255bb8ff7fe7f8b374dbf39
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets

diff -r c682c6e9bf93 -r 88fd1cce08b9 dataset.py
--- a/dataset.py	Thu Apr 24 14:46:10 2008 -0400
+++ b/dataset.py	Fri Apr 25 10:41:19 2008 -0400
@@ -6,6 +6,7 @@
 
 class AbstractFunction (Exception): """Derived class must override this function"""
 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
+class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)"""
 
 class DataSet(object):
     """A virtual base class for datasets.
@@ -15,7 +16,8 @@
     columns/attributes are called fields. The field value for a particular example can be an arbitrary
     python object, which depends on the particular dataset.
 
-    We call a DataSet a 'stream' when its length is unbounded (len(dataset)==float("infinity")).
+    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
+    should raise an UnboundedDataSet exception).
 
     A DataSet is a generator of iterators; these iterators can run through the
     examples or the fields in a variety of ways.  A DataSet need not necessarily have a finite
@@ -27,6 +29,7 @@
     * for example in dataset([field1, field2,field3, ...]):
     * for val1,val2,val3 in dataset([field1, field2,field3]):
     * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, ...],minibatch_size=N):
     * for example in dataset:
 
     Each of these is documented below. All of these iterators are expected to
     provide, in addition to the usual 'next()' method, a 'next_index()' method
@@ -82,7 +85,7 @@
     creates a new dataset whose list of fields is the concatenation of the
     list of fields of the argument datasets. This only works if they all have the same length.
 
-    * dataset1 + dataset2 + dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
+    * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
 
     creates a new dataset that concatenates the examples from the argument datasets
     (and whose length is the sum of the length of the argument datasets). This only
@@ -93,20 +96,18 @@
     a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
     examples.
 
-
     A DataSet sub-class should always redefine the following methods:
        * __len__ if it is not a stream
-       * __getitem__ may not be feasible with some streams
        * fieldNames
        * minibatches_nowrap (called by DataSet.minibatches())
        * valuesHStack
        * valuesVStack
     For efficiency of implementation, a sub-class might also want to redefine
        * hasFields
+       * __getitem__ may not be feasible with some streams
+       * __iter__
     """
 
-    infinity = float("infinity")
-
     def __init__(self):
         pass
@@ -124,7 +125,9 @@
         def __iter__(self): #makes for loop work
             return self
         def next(self):
-            return self.minibatch_iterator.next()[0]
+            size1_minibatch = self.minibatch_iterator.next()
+            return Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
+
         def next_index(self):
             return self.minibatch_iterator.next_index()
@@ -223,9 +226,6 @@
         a list-like container of the f2 field, etc.
 
         Using the first syntax, all the fields will be returned in "i".
-        Beware that some datasets may not support this syntax, if the number
-        of fields is infinite (i.e. field values may be computed "on demand").
-
         Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
@@ -277,13 +277,11 @@
     def __len__(self):
         """
         len(dataset) returns the number of examples in the dataset.
-        By default, a DataSet is a 'stream', i.e. it has an unbounded (infinite) length.
+        By default, a DataSet is a 'stream', i.e. it has an unbounded length (raises UnboundedDataSet).
         Sub-classes which implement finite-length datasets should redefine this method.
-        Some methods only make sense for finite-length datasets, and will perform
-        assert len(dataset)<infinity
         """
-        return infinity
+        raise UnboundedDataSet()

[...]

@@ -447,7 +472,7 @@
         """
         return (self.examples() + other.examples()).fields()
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
         fields1 + fields2 is a DataSetFields whose list of fields is the concatenation
         of the fields of DataSetFields fields1 and fields2.
@@ -479,7 +504,8 @@
         return self.length
 
     def __getitem__(self,i):
-        return Example(self.fields.keys(),[field[i] for field in self.fields])
+        return DataSetFields(MinibatchDataSet(
+            Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields)
 
     def fieldNames(self):
         return self.fields.keys()
@@ -509,7 +535,7 @@
                 self.next_example+=minibatch_size
                 return DataSetFields(MinibatchDataSet(minibatch),fieldnames)
 
-        return MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        return Iterator(self)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -639,8 +665,11 @@
         # We use this map from row index to dataset index for constant-time random access of examples,
         # to avoid having to search for the appropriate dataset each time and slice is asked for.
         for dataset,k in enumerate(datasets[0:-1]):
-            L=len(dataset)
-            assert L<DataSet.infinity

[...]

                     self.move_to_next_dataset()
                     extra_mb.append(self.next_iterator.next())
-                mb = Example(names,
-                             [dataset.valuesVStack(name,[mb[name]]+[b[name] for b in extra_mb])
-                              for name in fieldnames])
+                examples = Example(names,
+                                   [dataset.valuesVStack(name,
+                                                         [mb[name]]+[b[name] for b in extra_mb])
+                                    for name in fieldnames])
+                mb = DataSetFields(MinibatchDataSet(examples),fieldnames)
+
                 self.next_row+=minibatch_size
                 self.next_dataset_row+=minibatch_size
                 if self.next_row+minibatch_size>len(dataset):
                     self.move_to_next_dataset()
                 return mb
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
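
Note (not part of the patch): a minimal sketch of the two behavioral changes, using hypothetical
stand-in classes rather than the real DataSet hierarchy from dataset.py. After this change,
asking a stream for its length raises UnboundedDataSet instead of returning float("infinity"),
and example-wise concatenation (vstack) is spelled dataset1 & dataset2 rather than dataset1 + dataset2.

    # Hypothetical stand-ins, not the real DataSet classes.
    class UnboundedDataSet(Exception):
        """Trying to obtain length of unbounded dataset (a stream)."""

    class Stream(object):
        def __len__(self):
            # Streams now raise instead of returning float("infinity").
            raise UnboundedDataSet("this dataset is a stream")

    class FiniteDataSet(object):
        def __init__(self, examples):
            self.examples = examples
        def __len__(self):
            return len(self.examples)
        def __and__(self, other):
            # '&' concatenates examples (vstack); previously this was '+'.
            return FiniteDataSet(self.examples + other.examples)

    d = FiniteDataSet([1, 2]) & FiniteDataSet([3])
    assert len(d) == 3

    try:
        n = len(Stream())
    except UnboundedDataSet:
        n = None  # callers must now handle streams explicitly

Raising from __len__ forces callers that silently assumed a finite length (e.g. code doing
len(dataset) before slicing) to fail loudly on streams, rather than comparing against infinity.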