dataset.py @ 40:88fd1cce08b9

replaced infinity for length by raising UnboundedDataSet, and use & instead of + to concatenate datasets
author bengioy@esprit.iro.umontreal.ca
date Fri, 25 Apr 2008 10:41:19 -0400
parents c682c6e9bf93
children 283e95c15b47
--- a/dataset.py	39:c682c6e9bf93
+++ b/dataset.py	40:88fd1cce08b9
@@ -4,20 +4,22 @@
 from misc import *
 import copy
 
 class AbstractFunction (Exception): """Derived class must override this function"""
 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
+class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)"""
 
 class DataSet(object):
     """A virtual base class for datasets.
 
     A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
     with learning algorithms (for training and testing them): rows/records are called examples, and
     columns/attributes are called fields. The field value for a particular example can be an arbitrary
     python object, which depends on the particular dataset.
 
-    We call a DataSet a 'stream' when its length is unbounded (len(dataset)==float("infinity")).
+    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
+    should raise an UnboundedDataSet exception).
 
     A DataSet is a generator of iterators; these iterators can run through the
     examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
     feeds on-line learning (however, as noted below, some operations are not
@@ -25,10 +27,11 @@
 
     To iterate over examples, there are several possibilities:
     * for example in dataset([field1, field2,field3, ...]):
    * for val1,val2,val3 in dataset([field1, field2,field3]):
     * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, ...],minibatch_size=N):
     * for example in dataset:
     Each of these is documented below. All of these iterators are expected
     to provide, in addition to the usual 'next()' method, a 'next_index()' method
     which returns a non-negative integer pointing to the position of the next
     example that will be returned by 'next()' (or of the first example in the
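
A usage sketch of the iteration styles listed above; it assumes some concrete,
bounded DataSet subclass bound to the name dataset, with hypothetical fields 'x' and 'y':

    for x, y in dataset(['x', 'y']):                        # one example at a time
        print x, y
    for x10, y10 in dataset.minibatches(['x', 'y'], minibatch_size=10):
        print len(x10)                                      # list-like container of 10 'x' values
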
@@ -80,35 +83,33 @@
     * dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
 
     creates a new dataset whose list of fields is the concatenation of the list of
     fields of the argument datasets. This only works if they all have the same length.
 
-    * dataset1 + dataset2 + dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
+    * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
 
     creates a new dataset that concatenates the examples from the argument datasets
     (and whose length is the sum of the length of the argument datasets). This only
     works if they all have the same fields.
 
     According to the same logic, and viewing a DataSetFields object associated to
-    a DataSet as a kind of transpose of it, fields1 + fields2 concatenates fields of
+    a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
     a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
     examples.
 
-
     A DataSet sub-class should always redefine the following methods:
     * __len__ if it is not a stream
-    * __getitem__ may not be feasible with some streams
     * fieldNames
     * minibatches_nowrap (called by DataSet.minibatches())
     * valuesHStack
     * valuesVStack
     For efficiency of implementation, a sub-class might also want to redefine
     * hasFields
-    """
-
-    infinity = float("infinity")
+    * __getitem__ may not be feasible with some streams
+    * __iter__
+    """
 
     def __init__(self):
         pass
 
     class MinibatchToSingleExampleIterator(object):
         """
@@ -122,11 +123,13 @@
         def __init__(self, minibatch_iterator):
             self.minibatch_iterator = minibatch_iterator
         def __iter__(self): #makes for loop work
             return self
         def next(self):
-            return self.minibatch_iterator.next()[0]
+            size1_minibatch = self.minibatch_iterator.next()
+            return Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
+
         def next_index(self):
             return self.minibatch_iterator.next_index()
 
     def __iter__(self):
         """Supports the syntax "for i in dataset: ..."
@@ -221,13 +224,10 @@
         of a batch of current examples. In the second case, i[0] is
         a list-like container of the f1 field of a batch of current examples, i[1] is
         a list-like container of the f2 field, etc.
 
         Using the first syntax, all the fields will be returned in "i".
-        Beware that some datasets may not support this syntax, if the number
-        of fields is infinite (i.e. field values may be computed "on demand").
-
         Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         The minibatches iterator is expected to return upon each call to next()
         a DataSetFields object, which is a LookupList (indexed by the field names) whose
@@ -275,17 +275,15 @@
         raise AbstractFunction()
 
     def __len__(self):
         """
        len(dataset) returns the number of examples in the dataset.
-        By default, a DataSet is a 'stream', i.e. it has an unbounded (infinite) length.
+        By default, a DataSet is a 'stream', i.e. it has an unbounded length (raises UnboundedDataSet).
         Sub-classes which implement finite-length datasets should redefine this method.
-        Some methods only make sense for finite-length datasets, and will perform
-        assert len(dataset)<DataSet.infinity
-        in order to check the finiteness of the dataset.
-        """
-        return infinity
+        Some methods only make sense for finite-length datasets.
+        """
+        raise UnboundedDataSet()
 
     def hasFields(self,*fieldnames):
         """
         Return true if the given field name (or field names, if multiple arguments are
         given) is recognized by the DataSet (i.e. can be used as a field name in one
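
With this convention, callers that need a finite length no longer compare against
an infinity sentinel; a sketch of the intended calling pattern:

    try:
        n = len(dataset)
    except UnboundedDataSet:
        n = None    # a stream: iterate over it, do not ask for its length
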
@@ -325,12 +323,33 @@
 
         Note that some stream datasets may be unable to implement random access, i.e.
         arbitrary slicing/indexing
         because they can only iterate through examples one or a minibatch at a time
         and do not actually store or keep past (or future) examples.
-        """
-        raise NotImplementedError()
+
+        The default implementation of getitem uses the minibatches iterator
+        to obtain one example, one slice, or a list of examples. It may not
+        always be the most efficient way to obtain the result, especially if
+        the data are actually stored in a memory array.
+        """
+        if type(i) is int:
+            return DataSet.MinibatchToSingleExampleIterator(
+                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
+        if type(i) is slice:
+            start = i.start or 0
+            step = i.step or 1
+            if step == 1:
+                return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples()
+            rows = range(start,i.stop,step)
+        else:
+            assert type(i) is list
+            rows = i
+        fields_values = zip(*[self[row] for row in rows])
+        return MinibatchDataSet(
+            Example(self.fieldNames(),[self.valuesVStack(fieldname,field_values)
+                                       for fieldname,field_values
+                                       in zip(self.fieldNames(),fields_values)]))
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
         Return a value that corresponds to concatenating (horizontally) several field values.
         This can be useful to merge some fields. The implementation of this operation is likely
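
A sketch of the three indexing behaviours this default getitem provides (dataset
is any concrete subclass with a working minibatches iterator):

    example = dataset[3]           # a single Example, via a size-1 minibatch
    batch   = dataset[10:20]       # contiguous slice, served by one minibatch call
    subset  = dataset[[1, 5, 8]]   # arbitrary rows, restacked with valuesVStack
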
@@ -375,13 +394,13 @@
         dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
         fields of the argument datasets. This only works if they all have the same length.
         """
         return HStackedDataSet(self,other)
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
-        dataset1 + dataset2 is a dataset that concatenates the examples from the argument datasets
+        dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
         (and whose length is the sum of the length of the argument datasets). This only
         works if they all have the same fields.
         """
         return VStackedDataSet(self,other)
 
@@ -396,11 +415,11 @@
         return datasets[0]
     return HStackedDataSet(datasets)
 
 def vstack(datasets):
     """
-    vstack(dataset1,dataset2,...) returns dataset1 + datataset2 + ...
+    vstack(dataset1,dataset2,...) returns dataset1 & dataset2 & ...
     which is a dataset which iterates first over the examples of dataset1, then
     over those of dataset2, etc.
     """
     assert len(datasets)>0
     if len(datasets)==1:
@@ -428,13 +447,19 @@
     The result of fields() is a DataSetFields object, which iterates over fields,
     and whose elements are iterable over examples. A DataSetFields object can
     be turned back into a DataSet with its examples() method:
     dataset2 = dataset1.fields().examples()
     and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
+
+    DataSetFields can be concatenated vertically or horizontally. To be consistent with
+    the syntax used for DataSets, the | concatenates the examples and the & concatenates
+    the fields.
     """
     def __init__(self,dataset,*fieldnames):
         self.dataset=dataset
+        if not fieldnames:
+            fieldnames=dataset.fieldNames()
         assert dataset.hasFields(*fieldnames)
         LookupList.__init__(self,dataset.fieldNames(),
                             dataset.minibatches(fieldnames if len(fieldnames)>0 else self.fieldNames(),
                                                 minibatch_size=len(dataset)).next())
     def examples(self):
@@ -445,11 +470,11 @@
         fields1 | fields2 is a DataSetFields whose list of examples is the concatenation
         of the list of examples of DataSetFields fields1 and fields2.
         """
-        return (self.examples() + other.examples()).fields()
+        return (self.examples() & other.examples()).fields()
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
-        fields1 + fields2 is a DataSetFields whose list of fields is the concatenation
+        fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
         of the fields of DataSetFields fields1 and fields2.
         """
         return (self.examples() | other.examples()).fields()
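
A sketch of the transposed operators on DataSetFields, matching the docstrings
above (f1 and f2 are hypothetical):

    f1 = dataset1.fields()
    f2 = dataset2.fields()
    more_examples = f1 | f2    # concatenates examples (underlying datasets are vstacked)
    more_fields   = f1 & f2    # concatenates fields (underlying datasets are hstacked)
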
@@ -477,11 +502,12 @@
 
     def __len__(self):
         return self.length
 
     def __getitem__(self,i):
-        return Example(self.fields.keys(),[field[i] for field in self.fields])
+        return DataSetFields(MinibatchDataSet(
+            Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields)
 
     def fieldNames(self):
         return self.fields.keys()
 
     def hasFields(self,*fieldnames):
@@ -507,11 +533,11 @@
                                     [field[next_example:upper]
                                      for field in self.ds.fields])
                 self.next_example+=minibatch_size
                 return DataSetFields(MinibatchDataSet(minibatch),fieldnames)
 
-        return MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        return Iterator(self)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
 
     def valuesHStack(self,fieldnames,fieldvalues):
@@ -637,12 +663,15 @@
         fieldnames = datasets[-1].fieldNames()
         self.datasets_start_row=[]
         # We use this map from row index to dataset index for constant-time random access of examples,
         # to avoid having to search for the appropriate dataset each time a slice is asked for.
         for k,dataset in enumerate(datasets[0:-1]):
-            L=len(dataset)
-            assert L<DataSet.infinity
+            try:
+                L=len(dataset)
+            except UnboundedDataSet:
+                print "All VStacked datasets (except possibly the last) must be bounded (have a length)."
+                assert False
             for i in xrange(L):
                 self.index2dataset[self.length+i]=k
             self.datasets_start_row.append(self.length)
             self.length+=L
         assert dataset.fieldNames()==fieldnames
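
The try/except above is the general pattern for enforcing boundedness; the same
check could be factored into a helper along these lines (the helper name is
hypothetical):

    def bounded_len(d):
        try:
            return len(d)
        except UnboundedDataSet:
            raise ValueError("all VStacked datasets except possibly the last must be bounded")
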
@@ -719,18 +748,21 @@
             if self.n_left_in_mb:
                 extra_mb = []
                 while self.n_left_in_mb>0:
                     self.move_to_next_dataset()
                     extra_mb.append(self.next_iterator.next())
-                mb = Example(names,
-                             [dataset.valuesVStack(name,[mb[name]]+[b[name] for b in extra_mb])
-                              for name in fieldnames])
+                examples = Example(names,
+                                   [dataset.valuesVStack(name,
+                                                         [mb[name]]+[b[name] for b in extra_mb])
+                                    for name in fieldnames])
+                mb = DataSetFields(MinibatchDataSet(examples),fieldnames)
+
             self.next_row+=minibatch_size
             self.next_dataset_row+=minibatch_size
             if self.next_row+minibatch_size>len(dataset):
                 self.move_to_next_dataset()
             return mb
 
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the