pylearn: comparison dataset.py @ 22:b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
removed the .field ability from LookupList (because of setattr problems), removed
fieldNames() from DataSet (it remains in FiniteWidthDataSet, where it makes sense),
and added hasFields() instead. Fixed problems in asarray, and tested the
previous functionality in _test_dataset.py, but not yet the new functionality.
author | bengioy@esprit.iro.umontreal.ca |
---|---|
date | Mon, 07 Apr 2008 20:44:37 -0400 |
parents | 266c68cb6136 |
children | 526e192b0699 |
21:fdf0abc490f7 | 22:b6b36f65664f |
---|---|
8 """A virtual base class for datasets. | 8 """A virtual base class for datasets. |
9 | 9 |
10 A DataSet is a generator of iterators; these iterators can run through the | 10 A DataSet is a generator of iterators; these iterators can run through the |
11 examples in a variety of ways. A DataSet need not necessarily have a finite | 11 examples in a variety of ways. A DataSet need not necessarily have a finite |
12 or known length, so this class can be used to interface to a 'stream' which | 12 or known length, so this class can be used to interface to a 'stream' which |
13 feeds on-line learning. | 13 feeds on-line learning. |
14 | 14 |
15 To iterate over examples, there are several possibilities: | 15 To iterate over examples, there are several possibilities: |
16 - for example in dataset.zip([field1, field2,field3, ...]) | 16 - for example in dataset.zip([field1, field2,field3, ...]) |
17 - for val1,val2,val3 in dataset.zip([field1, field2,field3]) | 17 - for val1,val2,val3 in dataset.zip([field1, field2,field3]) |
18 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N) | 18 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N) |
19 - for example in dataset | 19 - for example in dataset |
20 Each of these is documented below. | 20 Each of these is documented below. |
21 | 21 |
22 Note: For a dataset of fixed and known length, which can implement item | |
23 random-access efficiently (e.g. indexing and slicing), and which can profit | |
24 from the FiniteDataSetIterator, consider using base class FiniteDataSet. | |
25 | |
26 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. | 22 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. |
27 | 23 |
28 Note: The content of a field can be of any type. | 24 Note: The content of a field can be of any type. |
29 | 25 |
26 Note: A dataset can recognize a potentially infinite number of field names (i.e. the field | |
27 values can be computed on-demand, when particular field names are used in one of the | |
28 iterators). | |
29 | |
30 Datasets of finite length should be sub-classes of FiniteLengthDataSet. | |
31 | |
32 Datasets whose elements can be indexed, and from which sub-datasets of |
33 consecutive examples (i.e. slices) can be extracted, should be sub-classes |
34 of SliceableDataSet. |
35 | |
36 Datasets with a finite number of fields should be sub-classes of | |
37 FiniteWidthDataSet. | |
30 """ | 38 """ |
31 | 39 |
32 def __init__(self): | 40 def __init__(self): |
33 pass | 41 pass |
34 | 42 |
43 class Iter(LookupList): | |
44 def __init__(self, ll): | |
45 LookupList.__init__(self, ll.keys(), ll.values()) | |
46 self.ll = ll | |
47 def __iter__(self): #makes for loop work | |
48 return self | |
49 def next(self): | |
50 self.ll.next() | |
51 self._values = [v[0] for v in self.ll._values] | |
52 return self | |
53 | |
35 def __iter__(self): | 54 def __iter__(self): |
36 """Supports the syntax "for i in dataset: ..." | 55 """Supports the syntax "for i in dataset: ..." |
37 | 56 |
38 Using this syntax, "i" will be an Example instance (or equivalent) with | 57 Using this syntax, "i" will be an Example instance (or equivalent) with |
39 all the fields of DataSet self. Every field of "i" will give access to | 58 all the fields of DataSet self. Every field of "i" will give access to |
40 a field of a single example. Fields should be accessible via | 59 a field of a single example. Fields should be accessible via |
41 i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free | 60 i["fielname"] or i[3] (in the order defined by the elements of the |
61 Example returned by this iterator), but the derived class is free | |
42 to accept any type of identifier, and add extra functionality to the iterator. | 62 to accept any type of identifier, and add extra functionality to the iterator. |
43 """ | 63 """ |
44 return self.zip(*self.fieldNames()) | 64 return DataSet.Iter(self.minibatches(None, minibatch_size = 1)) |
45 | 65 |
46 def zip(self, *fieldnames): | 66 def zip(self, *fieldnames): |
47 """ | 67 """ |
48 Supports two forms of syntax: | 68 Supports two forms of syntax: |
49 | 69 |
59 f1, f2, and f3 fields of a single example on each loop iteration. | 79 f1, f2, and f3 fields of a single example on each loop iteration. |
60 | 80 |
61 The derived class may accept fieldname arguments of any type. | 81 The derived class may accept fieldname arguments of any type. |
62 | 82 |
63 """ | 83 """ |
64 class Iter(LookupList): | 84 return DataSet.Iter(self.minibatches(fieldnames, minibatch_size = 1)) |
65 def __init__(self, ll): | |
66 LookupList.__init__(self, ll.keys(), ll.values()) | |
67 self.ll = ll | |
68 def __iter__(self): #makes for loop work | |
69 return self | |
70 def next(self): | |
71 self.ll.next() | |
72 self._values = [v[0] for v in self.ll._values] | |
73 return self | |
74 return Iter(self.minibatches(fieldnames, minibatch_size = 1)) | |
75 | 85 |
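The two zip forms above, plus plain `for example in dataset`, can be exercised against any concrete subclass. A minimal sketch, assuming the ArrayDataSet defined further down in this file; the field names 'x' and 'y', the data values, and the per-name call style `zip('x', 'y')` (which matches the `*fieldnames` signature, whereas the docstring shows a list) are illustrative, not part of the diff:

```python
import numpy

# Hypothetical dataset: 3 examples; field 'x' spans columns 0-1, 'y' column 2.
data = numpy.array([[0., 1., 10.],
                    [2., 3., 11.],
                    [4., 5., 12.]])
ds = ArrayDataSet(data, fields={'x': slice(0, 2, 1), 'y': slice(2, 3, 1)})

# Example-wise iteration over named fields:
for x, y in ds.zip('x', 'y'):
    print x, y

# Whole-example iteration; every field is accessible on each example:
for example in ds:
    print example['x'], example['y']
```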
76 minibatches_fieldnames = None | 86 minibatches_fieldnames = None |
77 minibatches_minibatch_size = 1 | 87 minibatches_minibatch_size = 1 |
78 minibatches_n_batches = None | 88 minibatches_n_batches = None |
79 def minibatches(self, | 89 def minibatches(self, |
80 fieldnames = minibatches_fieldnames, | 90 fieldnames = minibatches_fieldnames, |
81 minibatch_size = minibatches_minibatch_size, | 91 minibatch_size = minibatches_minibatch_size, |
82 n_batches = minibatches_n_batches): | 92 n_batches = minibatches_n_batches): |
83 """ | 93 """ |
84 Supports two forms of syntax: | 94 Supports three forms of syntax: |
95 | |
96 for i in dataset.minibatches(None,**kwargs): ... | |
85 | 97 |
86 for i in dataset.minibatches([f1, f2, f3],**kwargs): ... | 98 for i in dataset.minibatches([f1, f2, f3],**kwargs): ... |
87 | 99 |
88 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... | 100 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... |
89 | 101 |
90 Using the first syntax, "i" will be an indexable object, such as a list, | 102 Using the first two syntaxes, "i" will be an indexable object, such as a list, |
91 tuple, or Example instance, such that on every iteration, i[0] is a | 103 tuple, or Example instance. In both cases, i[k] is a list-like container |
104 of a batch of current examples. In the second case, i[0] is a |
92 list-like container of the f1 field of a batch of current examples, i[1] is | 105 list-like container of the f1 field of a batch of current examples, i[1] is |
93 a list-like container of the f2 field, etc. | 106 a list-like container of the f2 field, etc. |
94 | 107 |
95 Using the second syntax, i1, i2, i3 will be list-like containers of the | 108 Using the first syntax, all the fields will be returned in "i". |
109 Beware that some datasets may not support this syntax, if the number | |
110 of fields is infinite (i.e. field values may be computed "on demand"). | |
111 | |
112 Using the third syntax, i1, i2, i3 will be list-like containers of the | |
96 f1, f2, and f3 fields of a batch of examples on each loop iteration. | 113 f1, f2, and f3 fields of a batch of examples on each loop iteration. |
97 | 114 |
98 PARAMETERS | 115 PARAMETERS |
99 - fieldnames (list of any type, default None): | 116 - fieldnames (list of any type, default None): |
100 The loop variables i1, i2, i3 (in the example above) should contain the | 117 The loop variables i1, i2, i3 (in the example above) should contain the |
113 Note: A list-like container is something like a tuple, list, numpy.ndarray or | 130 Note: A list-like container is something like a tuple, list, numpy.ndarray or |
114 any other object that supports integer indexing and slicing. | 131 any other object that supports integer indexing and slicing. |
115 | 132 |
116 """ | 133 """ |
117 raise AbstractFunction() | 134 raise AbstractFunction() |
118 | 135 |
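A sketch of the minibatch forms documented above, reusing the hypothetical `ds` from the earlier sketch; the `minibatch_size` and `n_batches` values are illustrative:

```python
# Third syntax: one loop variable per requested field, each holding a batch.
for x_batch, y_batch in ds.minibatches(['x', 'y'], minibatch_size=2, n_batches=2):
    print x_batch.shape, y_batch.shape   # e.g. (2, 2) and (2, 1)

# Second syntax: a single indexable minibatch object per iteration.
for batch in ds.minibatches(['x', 'y'], minibatch_size=2, n_batches=2):
    print batch[0], batch[1]             # the 'x' batch, then the 'y' batch
```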
119 def fieldNames(self): | 136 def hasFields(self, *fieldnames): |
120 #Yoshua- | 137 """ |
121 # This list may not be finite; what would make sense in the use you have | 138 Return true if the given field name (or field names, if multiple arguments are |
122 # in mind? | 139 given) is recognized by the DataSet (i.e. can be used as a field name in one |
123 # -JB | 140 of the iterators). |
124 #James- | 141 """ |
125 # You are right. I had put this to be able to iterate over the fields | 142 raise AbstractFunction() |
126 # but maybe an iterator mechanism (over fields rather than examples) | 143 |
127 # would be more appropriate. Fieldnames are needed in general | |
128 # by the iterators over examples or minibatches, to construct | |
129 # examples or minibatches with the corresponding names as attributes. | |
130 # -YB | |
131 """ | |
132 Return an iterator (an object with an __iter__ method) that | |
133 iterates over the names of the fields. As a special cases, | |
134 a list or a tuple of field names can be returned. | |
135 """" | |
136 # Note that some datasets | |
137 # may have virtual fields and support a virtually infinite number | |
138 # of possible field names. In that case, fieldNames() should | |
139 # either raise an error or iterate over a particular set of | |
140 # names as appropriate. Another option would be to iterate | |
141 # over the sub-datasets comprising a single field at a time. | |
142 # I am not sure yet what is most appropriate. | |
143 # -YB | |
144 """ | |
145 raise AbstractFunction() | |
146 | |
147 def rename(self, *new_field_specifications): | 144 def rename(self, *new_field_specifications): |
148 #Yoshua- | 145 #Yoshua- |
149 # Do you mean for this to be a virtual method? | 146 # Do you mean for this to be a virtual method? |
150 # Wouldn't this functionality be easier to provide via a | 147 # Wouldn't this functionality be easier to provide via a |
151 # RenamingDataSet, such as the one I've written below? | 148 # RenamingDataSet, such as the one I've written below? |
163 of a matrix-like field). | 160 of a matrix-like field). |
164 """ | 161 """ |
165 raise AbstractFunction() | 162 raise AbstractFunction() |
166 | 163 |
167 | 164 |
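For renaming, a minimal sketch against the RenamingDataSet wrapper that appears further down; its constructor signature (a source dataset plus a {new_name: source_name} dict stored as rename_dct) is inferred from its minibatches method and is an assumption:

```python
# Hypothetical: expose source fields 'x' and 'y' under new names.
renamed = RenamingDataSet(ds, {'input': 'x', 'target': 'y'})
for inp, tgt in renamed.zip('input', 'target'):
    print inp, tgt   # same values as ds.zip('x', 'y')
```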
168 def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): | 165 def applyFunction(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): |
169 """ | 166 """ |
170 Return a dataset that contains as fields the results of applying | 167 Return a dataset that contains as fields the results of applying |
171 the given function (example-wise) to the specified input_fields. The | 168 the given function (example-wise) to the specified input_fields. The |
172 function should return a sequence whose elements will be stored in | 169 function should return a sequence whose elements will be stored in |
173 fields whose names are given in the output_fields list. If copy_inputs | 170 fields whose names are given in the output_fields list. If copy_inputs |
200 n_batches = DataSet.minibatches_n_batches): | 197 n_batches = DataSet.minibatches_n_batches): |
201 dct = self.rename_dct | 198 dct = self.rename_dct |
202 new_fieldnames = [dct.get(f, f) for f in fieldnames] | 199 new_fieldnames = [dct.get(f, f) for f in fieldnames] |
203 return self.src.minibatches(new_fieldnames, minibatch_size, n_batches) | 200 return self.src.minibatches(new_fieldnames, minibatch_size, n_batches) |
204 | 201 |
205 def fieldNames(self): | 202 class FiniteLengthDataSet(DataSet): |
206 return [dct.get(f, f) for f in self.src.fieldNames()] | 203 """ |
207 | 204 Virtual interface for datasets that have a finite length (number of examples), |
208 | 205 and thus recognize a len(dataset) call. |
209 class FiniteDataSet(DataSet): | 206 """ |
210 """ | 207 def __init__(self): |
211 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | 208 DataSet.__init__(self) |
212 Examples are indexed by an integer between 0 and self.length()-1, | 209 |
213 and a subdataset can be obtained by slicing. This may not be appropriate in general | 210 def __len__(self): |
214 but only for datasets which can be thought of like ones that access rows AND fields | 211 """len(dataset) returns the number of examples in the dataset.""" |
215 in an efficient random access way. Users are encouraged to expect only the generic dataset | 212 raise AbstractFunction() |
216 interface in general. A FiniteDataSet is mainly useful when one has to obtain | 213 |
217 a subset of examples (e.g. for splitting a dataset into training and test sets). | 214 |
218 """ | 215 class SliceableDataSet(DataSet): |
219 | 216 """ |
220 class FiniteDataSetIterator(object): | 217 Virtual interface, a subclass of DataSet for datasets which are sliceable |
221 """ | 218 and whose individual elements can be accessed, generally respecting the |
222 If the fieldnames list is empty, it means that we want to see ALL the fields. | 219 python semantics for [spec], where spec is either a non-negative integer |
223 """ | 220 (for selecting one example), or a python slice (for selecting a sub-dataset |
224 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | 221 comprising the specified examples). This is useful for obtaining |
225 self.dataset=dataset | 222 sub-datasets, e.g. for splitting a dataset into training and test sets. |
226 self.minibatch_size=minibatch_size | 223 """ |
227 assert minibatch_size>=1 and minibatch_size<=len(dataset) | 224 def __init__(self): |
228 self.current = -self.minibatch_size | 225 DataSet.__init__(self) |
229 self.fieldnames = fieldnames | |
230 if len(dataset) % minibatch_size: | |
231 raise NotImplementedError() | |
232 | |
233 def __iter__(self): | |
234 return self | |
235 | 226 |
236 def next(self): | |
237 self.current+=self.minibatch_size | |
238 if self.current>=len(self.dataset): | |
239 self.current=-self.minibatch_size | |
240 raise StopIteration | |
241 if self.minibatch_size==1: | |
242 complete_example=self.dataset[self.current] | |
243 else: | |
244 complete_example=self.dataset[self.current:self.current+self.minibatch_size] | |
245 if self.fieldnames: | |
246 return Example(self.fieldnames,list(complete_example)) | |
247 else: | |
248 return complete_example | |
249 | |
250 def __init__(self): | |
251 pass | |
252 | |
253 def minibatches(self, | 227 def minibatches(self, |
254 fieldnames = DataSet.minibatches_fieldnames, | 228 fieldnames = DataSet.minibatches_fieldnames, |
255 minibatch_size = DataSet.minibatches_minibatch_size, | 229 minibatch_size = DataSet.minibatches_minibatch_size, |
256 n_batches = DataSet.minibatches_n_batches): | 230 n_batches = DataSet.minibatches_n_batches): |
257 """ | 231 """ |
258 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
259 | |
260 If the n_batches is empty, we want to see all the examples possible | 232 If n_batches is None, we want to see all the examples possible |
261 for the given minibatch_size. | 233 for the given minibatch_size (possibly missing a few at the end of the dataset). |
262 """ | 234 """ |
263 # substitute the defaults: | 235 # substitute the defaults: |
264 if fieldnames is None: fieldnames = self.fieldNames() | |
265 if n_batches is None: n_batches = len(self) / minibatch_size | 236 if n_batches is None: n_batches = len(self) / minibatch_size |
266 return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) | 237 return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) |
267 | 238 |
268 def __getattr__(self,fieldname): | |
269 """Return an that can iterate over the values of the field in this dataset.""" | |
270 return self(fieldname) | |
271 | |
272 def __call__(self,*fieldnames): | |
273 """Return a sub-dataset containing only the given fieldnames as fields. | |
274 | |
275 The return value's default iterator will iterate only over the given | |
276 fields. | |
277 """ | |
278 raise AbstractFunction() | |
279 | |
280 def __len__(self): | |
281 """len(dataset) returns the number of examples in the dataset.""" | |
282 raise AbstractFunction() | |
283 | |
284 def __getitem__(self,i): | 239 def __getitem__(self,i): |
285 """dataset[i] returns the (i+1)-th example of the dataset.""" | 240 """dataset[i] returns the (i+1)-th example of the dataset.""" |
286 raise AbstractFunction() | 241 raise AbstractFunction() |
287 | 242 |
288 def __getslice__(self,*slice_args): | 243 def __getslice__(self,*slice_args): |
289 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 244 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
290 raise AbstractFunction() | 245 raise AbstractFunction() |
246 | |
247 | |
248 class FiniteWidthDataSet(DataSet): | |
249 """ | |
250 Virtual interface for datasets that have a finite width (number of fields), | |
251 and thus return a list of fieldNames. | |
252 """ | |
253 def __init__(self): | |
254 DataSet.__init__(self) | |
255 | |
256 def hasFields(self, *fieldnames): |
257 has_fields=True |
258 for fieldname in fieldnames: |
259 if fieldname not in self.fieldNames(): |
260 has_fields=False |
261 return has_fields |
262 | |
263 def fieldNames(self): | |
264 """Return the list of field names that are supported by the iterators, | |
265 and for which hasFields(fieldname) would return True.""" | |
266 raise AbstractFunction() | |
267 | |
291 | 268 |
292 # we may want ArrayDataSet defined in another python file | 269 # we may want ArrayDataSet defined in another python file |
293 | 270 |
294 import numpy | 271 import numpy |
295 | 272 |
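To make the fields-as-column-slices convention concrete before the class definition below, a small sketch; the array contents and field names are hypothetical:

```python
# Rows are examples; each field names a column range of the 2-D array.
data = numpy.random.random_sample((4, 5))
ds2 = ArrayDataSet(data, fields={'input': slice(0, 4, 1),
                                 'target': slice(4, 5, 1)})
print len(ds2)                          # 4 examples
print ds2.fieldNames()                  # ['input', 'target'] (dict order)
print ds2.hasFields('input', 'target')  # True
```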
324 # many complicated things remain to be done: | 301 # many complicated things remain to be done: |
325 # - find common dtype | 302 # - find common dtype |
326 # - decide what to do with extra dimensions if not the same in all fields | 303 # - decide what to do with extra dimensions if not the same in all fields |
327 # - try to see if we can avoid the copy? | 304 # - try to see if we can avoid the copy? |
328 | 305 |
329 class ArrayDataSet(FiniteDataSet): | 306 class ArrayDataSet(FiniteLengthDataSet,FiniteWidthDataSet,SliceableDataSet): |
330 """ | 307 """ |
331 An ArrayDataSet behaves like a numpy array but adds the notion of named fields | 308 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
332 from DataSet (and the ability to view the values of multiple fields as an 'Example'). | 309 from DataSet (and the ability to view the values of multiple fields as an 'Example'). |
333 It is a fixed-length and fixed-width dataset | 310 It is a fixed-length and fixed-width dataset |
334 in which each element is a fixed dimension numpy array or a number, hence the whole | 311 in which each element is a fixed dimension numpy array or a number, hence the whole |
340 """ | 317 """ |
341 | 318 |
342 class Iterator(LookupList): | 319 class Iterator(LookupList): |
343 """An iterator over a finite dataset that implements wrap-around""" | 320 """An iterator over a finite dataset that implements wrap-around""" |
344 def __init__(self, dataset, fieldnames, minibatch_size, next_max): | 321 def __init__(self, dataset, fieldnames, minibatch_size, next_max): |
345 LookupList.__init__(self, fieldnames, [0] * len(fieldnames)) | 322 if fieldnames is None: fieldnames = dataset.fieldNames() |
323 LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) | |
346 self.dataset=dataset | 324 self.dataset=dataset |
347 self.minibatch_size=minibatch_size | 325 self.minibatch_size=minibatch_size |
348 self.next_count = 0 | 326 self.next_count = 0 |
349 self.next_max = next_max | 327 self.next_max = next_max |
350 self.current = -self.minibatch_size | 328 self.current = -self.minibatch_size |
390 dataview = data[self.current:] | 368 dataview = data[self.current:] |
391 upper -= rows | 369 upper -= rows |
392 assert upper > 0 | 370 assert upper > 0 |
393 dataview = self.matcat(dataview, data[:upper]) | 371 dataview = self.matcat(dataview, data[:upper]) |
394 | 372 |
395 | |
396 self._values = [dataview[:, self.dataset.fields[f]]\ | 373 self._values = [dataview[:, self.dataset.fields[f]]\ |
397 for f in self._names] | 374 for f in self._names] |
398 | |
399 return self | 375 return self |
400 | 376 |
401 | 377 |
402 def __init__(self, data, fields=None): | 378 def __init__(self, data, fields=None): |
403 """ | 379 """ |
427 def minibatches(self, | 403 def minibatches(self, |
428 fieldnames = DataSet.minibatches_fieldnames, | 404 fieldnames = DataSet.minibatches_fieldnames, |
429 minibatch_size = DataSet.minibatches_minibatch_size, | 405 minibatch_size = DataSet.minibatches_minibatch_size, |
430 n_batches = DataSet.minibatches_n_batches): | 406 n_batches = DataSet.minibatches_n_batches): |
431 """ | 407 """ |
432 If the fieldnames list is empty, it means that we want to see ALL the fields. | 408 If the fieldnames list is None, it means that we want to see ALL the fields. |
433 | 409 |
434 If the n_batches is empty, we want to see all the examples possible | 410 If the n_batches is None, we want to see all the examples possible |
435 for the give minibatch_size. | 411 for the given minibatch_size (possibly missing some near the end). |
436 """ | 412 """ |
437 # substitute the defaults: | 413 # substitute the defaults: |
438 if fieldnames is None: fieldnames = self.fieldNames() | |
439 if n_batches is None: n_batches = len(self) / minibatch_size | 414 if n_batches is None: n_batches = len(self) / minibatch_size |
440 return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) | 415 return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) |
441 | 416 |
442 def __getattr__(self,fieldname): | 417 def __getattr__(self,fieldname): |
443 """ | 418 """ |
460 for fieldname,fieldslice in self.fields.items(): | 435 for fieldname,fieldslice in self.fields.items(): |
461 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) | 436 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) |
462 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) | 437 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) |
463 | 438 |
464 def fieldNames(self): | 439 def fieldNames(self): |
465 """Return the list of field names that are supported by getattr and getFields.""" | 440 """Return the list of field names that are supported by getattr and hasField.""" |
466 return self.fields.keys() | 441 return self.fields.keys() |
467 | 442 |
468 def __len__(self): | 443 def __len__(self): |
469 """len(dataset) returns the number of examples in the dataset.""" | 444 """len(dataset) returns the number of examples in the dataset.""" |
470 return len(self.data) | 445 return len(self.data) |
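Indexing and slicing follow the SliceableDataSet contract above: dataset[i] is one example, dataset[i:j] a sub-dataset. A sketch reusing the hypothetical ds2; the train/test split mirrors the use case named in the SliceableDataSet docstring:

```python
example = ds2[0]              # the first example
train = ds2[0:3]              # sub-dataset of examples 0, 1, 2
test = ds2[3:4]               # the remaining example, e.g. a held-out split
print len(train), len(test)   # 3 1
```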
500 """ | 475 """ |
501 if not self.fields: | 476 if not self.fields: |
502 return self.data | 477 return self.data |
503 # else, select subsets of columns mapped by the fields | 478 # else, select subsets of columns mapped by the fields |
504 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) | 479 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) |
480 overlapping_fields = False | |
481 n_columns = 0 | |
505 for field_slice in self.fields.values(): | 482 for field_slice in self.fields.values(): |
506 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): | 483 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): |
484 n_columns += 1 | |
485 if columns_used[c]: overlapping_fields=True | |
507 columns_used[c]=True | 486 columns_used[c]=True |
508 # try to figure out if we can map all the slices into one slice: | 487 # try to figure out if we can map all the slices into one slice: |
509 mappable_to_one_slice = True | 488 mappable_to_one_slice = not overlapping_fields |
510 start=0 | 489 if not overlapping_fields: |
511 while start<len(columns_used) and not columns_used[start]: | 490 start=0 |
512 start+=1 | 491 while start<len(columns_used) and not columns_used[start]: |
513 stop=len(columns_used) | 492 start+=1 |
514 while stop>0 and not columns_used[stop-1]: | 493 stop=len(columns_used) |
515 stop-=1 | 494 while stop>0 and not columns_used[stop-1]: |
516 step=0 | 495 stop-=1 |
517 i=start | 496 step=0 |
518 while i<stop: | 497 i=start |
519 j=i+1 | 498 while i<stop: |
520 while j<stop and not columns_used[j]: | 499 j=i+1 |
521 j+=1 | 500 while j<stop and not columns_used[j]: |
522 if step: | 501 j+=1 |
523 if step!=j-i: | 502 if step: |
524 mappable_to_one_slice = False | 503 if step!=j-i: |
525 break | 504 mappable_to_one_slice = False |
526 else: | 505 break |
527 step = j-i | 506 else: |
528 i=j | 507 step = j-i |
508 i=j | |
529 if mappable_to_one_slice: | 509 if mappable_to_one_slice: |
530 return self.data[:,slice(start,stop,step)] | 510 return self.data[:,slice(start,stop,step)] |
531 # else make contiguous copy | 511 # else make contiguous copy (copying the overlapping columns) |
532 n_columns = sum(columns_used) | 512 result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) |
533 result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) | |
534 print result.shape | |
535 c=0 | 513 c=0 |
536 for field_slice in self.fields.values(): | 514 for field_slice in self.fields.values(): |
537 slice_width=field_slice.stop-field_slice.start/field_slice.step | 515 slice_width=(field_slice.stop-field_slice.start)/field_slice.step |
538 # copy the field here | 516 # copy the field here |
539 result[:,slice(c,slice_width)]=self.data[:,field_slice] | 517 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] |
540 c+=slice_width | 518 c+=slice_width |
541 return result | 519 return result |
542 | 520 |
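The slice-merging logic above can be seen at both extremes. A sketch assuming the method is exposed as asarray (the name comes from the commit message) and reusing the hypothetical data array:

```python
# Adjacent, non-overlapping fields merge into a single slice: no copy.
view_ds = ArrayDataSet(data, fields={'a': slice(0, 2, 1), 'b': slice(2, 4, 1)})
arr = view_ds.asarray()    # a view, data[:, 0:4:1]

# Columns 0, 1, 3 form no single arithmetic slice: the copy branch runs.
gap_ds = ArrayDataSet(data, fields={'a': slice(0, 2, 1), 'b': slice(3, 4, 1)})
arr2 = gap_ds.asarray()    # a freshly allocated contiguous copy
```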
543 class ApplyFunctionDataset(DataSet): | 521 class ApplyFunctionDataSet(DataSet): |
544 """ | 522 """ |
545 A dataset that contains as fields the results of applying | 523 A dataset that contains as fields the results of applying |
546 a given function (example-wise) to specified input_fields of a source | 524 a given function (example-wise) to specified input_fields of a source |
547 dataset. The function should return a sequence whose elements will be stored in | 525 dataset. The function should return a sequence whose elements will be stored in |
548 fields whose names are given in the output_fields list. If copy_inputs | 526 fields whose names are given in the output_fields list. If copy_inputs |
581 | 559 |
582 def minibatches(self, | 560 def minibatches(self, |
583 fieldnames = DataSet.minibatches_fieldnames, | 561 fieldnames = DataSet.minibatches_fieldnames, |
584 minibatch_size = DataSet.minibatches_minibatch_size, | 562 minibatch_size = DataSet.minibatches_minibatch_size, |
585 n_batches = DataSet.minibatches_n_batches): | 563 n_batches = DataSet.minibatches_n_batches): |
586 | 564 |
587 class Iterator(LookupList): | 565 class Iterator(LookupList): |
588 | 566 |
589 def __init__(self,dataset): | 567 def __init__(self,dataset): |
590 LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) | 568 if fieldnames is None: |
569 LookupList.__init__(self, [],[]) | |
570 else: | |
571 LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) | |
591 self.dataset=dataset | 572 self.dataset=dataset |
592 if dataset.copy_inputs: | 573 self.src_iterator=self.dataset.src.minibatches(list(set.union(set(fieldnames),set(self.dataset.input_fields))), |
593 src_fields=dataset.fieldNames() | 574 minibatch_size,n_batches) |
594 else: | |
595 src_fields=dataset.input_fields | |
596 self.src_iterator=self.src.minibatches(src_fields,minibatch_size,n_batches) | |
597 | 575 |
598 def __iter__(self): | 576 def __iter__(self): |
599 return self | 577 return self |
600 | 578 |
601 def next(self): | 579 def next(self): |
602 src_examples = self.src_iterator.next() | 580 src_examples = self.src_iterator.next() |
603 if self.dataset.copy_inputs: | 581 # the function always consumes exactly the input fields: |
604 function_inputs = src_examples | 582 function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields] |
605 else: | |
606 function_inputs = | |
607 [src_examples[field_name] for field_name in self.dataset.input_fields]) | 585 outputs = Example(self.dataset.output_fields,self.dataset.function(*function_inputs)) |
608 return self.dataset.function(*function_inputs) | 586 if self.dataset.copy_inputs: |
587 return src_examples + outputs | |
588 else: | |
589 return outputs | |
609 | 590 |
610 for fieldname in fieldnames: | 591 for fieldname in fieldnames: |
611 assert fieldname in self.input_fields | 592 assert fieldname in self.output_fields or self.src.hasFields(fieldname) |
612 return Iterator(self) | 593 return Iterator(self) |
613 | 594 |
614 | 595 |
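Finally, a hypothetical end-to-end use of applyFunction / ApplyFunctionDataSet as specified above; the normalize function, the field names, and the assumption that a concrete dataset's applyFunction constructs an ApplyFunctionDataSet are all illustrative:

```python
def normalize(x):
    # One input field in, a one-element sequence out, per the contract.
    return ((x - x.mean()) / (x.std() + 1e-8),)

nds = ds2.applyFunction(normalize, input_fields=['input'],
                        output_fields=['normalized_input'])
for inp, ninp in nds.zip('input', 'normalized_input'):
    print inp, ninp
```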