comparison dataset.py @ 20:266c68cb6136

Minor edits, plus adding an untested ApplyFunctionDataSet for the GradientLearner that is in the works.
author bengioy@bengiomac.local
date Mon, 07 Apr 2008 09:48:39 -0400
parents 57f4015e2e09
children b6b36f65664f
19:57f4015e2e09 20:266c68cb6136
8 """A virtual base class for datasets. 8 """A virtual base class for datasets.
9 9
10 A DataSet is a generator of iterators; these iterators can run through the 10 A DataSet is a generator of iterators; these iterators can run through the
11 examples in a variety of ways. A DataSet need not necessarily have a finite 11 examples in a variety of ways. A DataSet need not necessarily have a finite
12 or known length, so this class can be used to interface to a 'stream' which 12 or known length, so this class can be used to interface to a 'stream' which
13 feed on-line learning. 13 feeds on-line learning.
14 14
15 To iterate over examples, there are several possibilities: 15 To iterate over examples, there are several possibilities:
16 - for i in dataset.zip(field1, field2,field3, ...) 16 - for example in dataset.zip([field1, field2, field3, ...])
17 - for i in dataset.minibatches(N, field1, field2, ...) 17 - for val1, val2, val3 in dataset.zip([field1, field2, field3])
18 - for i in dataset 18 - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N)
19 - for example in dataset
19 Each of these is documented below. 20 Each of these is documented below.
20 21
21 Note: For a dataset of fixed and known length, which can implement item 22 Note: For a dataset of fixed and known length, which can implement item
22 random-access efficiently (e.g. indexing and slicing), and which can profit 23 random-access efficiently (e.g. indexing and slicing), and which can profit
23 from the FiniteDataSetIterator, consider using base class FiniteDataSet. 24 from the FiniteDataSetIterator, consider using base class FiniteDataSet.
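
For concreteness, the iteration styles listed above might be driven as follows. This is a minimal sketch following the call forms shown in the docstring, not tested code from the changeset; the dataset d and its field names 'x' and 'y' are hypothetical:

    # a minimal sketch, assuming a concrete DataSet subclass instance 'd'
    # with two fields named 'x' and 'y' (names are illustrative)
    for example in d.zip(['x', 'y']):
        print example[0], example[1]     # i[0] is the 'x' field, i[1] is 'y'

    for x, y in d.zip(['x', 'y']):
        print x, y                       # unpacking form of the same iterator

    for minibatch in d.minibatches(['x', 'y'], minibatch_size=16):
        print len(minibatch)             # one entry per requested field

    for example in d:                    # equivalent to zip over all fieldNames()
        print example["x"]
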
34 def __iter__(self): 35 def __iter__(self):
35 """Supports the syntax "for i in dataset: ..." 36 """Supports the syntax "for i in dataset: ..."
36 37
37 Using this syntax, "i" will be an Example instance (or equivalent) with 38 Using this syntax, "i" will be an Example instance (or equivalent) with
38 all the fields of DataSet self. Every field of "i" will give access to 39 all the fields of DataSet self. Every field of "i" will give access to
39 a the field of a single example. Fields should be accessible via 40 a field of a single example. Fields should be accessible via
40 i[identifier], but the derived class is free to accept any type of 41 i["fieldname"] or i[3] (in the fieldNames() order), but the derived class is free
41 identifier, and add extra functionality to the iterator. 42 to accept any type of identifier, and add extra functionality to the iterator.
42 """ 43 """
43 raise AbstractFunction() 44 return self.zip(*self.fieldNames())
44 45
45 def zip(self, *fieldnames): 46 def zip(self, *fieldnames):
46 """ 47 """
47 Supports two forms of syntax: 48 Supports two forms of syntax:
48 49
49 for i in dataset.zip(f1, f2, f3): ... 50 for i in dataset.zip([f1, f2, f3]): ...
50 51
51 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... 52 for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...
52 53
53 Using the first syntax, "i" will be an indexable object, such as a list, 54 Using the first syntax, "i" will be an indexable object, such as a list,
54 tuple, or Example instance, such that on every iteration, i[0] is the f1 55 tuple, or Example instance, such that on every iteration, i[0] is the f1
55 field of the current example, i[1] is the f2 field, and so on. 56 field of the current example, i[1] is the f2 field, and so on.
56 57
118 def fieldNames(self): 119 def fieldNames(self):
119 #Yoshua- 120 #Yoshua-
120 # This list may not be finite; what would make sense in the use you have 121 # This list may not be finite; what would make sense in the use you have
121 # in mind? 122 # in mind?
122 # -JB 123 # -JB
123 """Return the list of field names in the examples of this dataset.""" 124 #James-
125 # You are right. I had put this to be able to iterate over the fields
126 # but maybe an iterator mechanism (over fields rather than examples)
127 # would be more appropriate. Fieldnames are needed in general
128 # by the iterators over examples or minibatches, to construct
129 # examples or minibatches with the corresponding names as attributes.
130 # -YB
131 """
132 Return an iterator (an object with an __iter__ method) that
133 iterates over the names of the fields. As a special case,
134 a list or a tuple of field names can be returned.
135 """"
136 # Note that some datasets
137 # may have virtual fields and support a virtually infinite number
138 # of possible field names. In that case, fieldNames() should
139 # either raise an error or iterate over a particular set of
140 # names as appropriate. Another option would be to iterate
141 # over the sub-datasets comprising a single field at a time.
142 # I am not sure yet what is most appropriate.
143 # -YB
144 """
124 raise AbstractFunction() 145 raise AbstractFunction()
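
As an illustration of the contract settled on above (an iterator over names, with a list or tuple as a special case), a finite dataset with a fixed schema could simply return a list. The subclass below is hypothetical, not part of this changeset:

    class TwoFieldDataSet(DataSet):
        # hypothetical subclass, for illustration only
        def fieldNames(self):
            # a fixed, finite schema: returning a list satisfies the
            # "iterator over field names" contract described above
            return ['input', 'target']
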
125 146
126 def rename(self, *new_field_specifications): 147 def rename(self, *new_field_specifications):
127 #Yoshua- 148 #Yoshua-
128 # Do you mean for this to be a virtual method? 149 # Do you mean for this to be a virtual method?
129 # Wouldn't this functionality be easier to provide via a 150 # Wouldn't this functionality be easier to provide via a
130 # RenamingDataSet, such as the one I've written below? 151 # RenamingDataSet, such as the one I've written below?
131 # -JB 152 # -JB
153 # You are right. Whichever implementation we pick, however, we need a generic way to
154 # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics.
155 # -YB
132 """ 156 """
133 Return a new dataset that maps old fields (of self) to new fields (of the returned 157 Return a new dataset that maps old fields (of self) to new fields (of the returned
134 dataset). The minimal syntax that should be supported is the following: 158 dataset). The minimal syntax that should be supported is the following:
135 new_field_specifications = [new_field_spec1, new_field_spec2, ...] 159 new_field_specifications = [new_field_spec1, new_field_spec2, ...]
136 new_field_spec = ([old_field1, old_field2, ...], new_field) 160 new_field_spec = ([old_field1, old_field2, ...], new_field)
137 In general both old_field and new_field should be strings, but some datasets may also 161 In general both old_field and new_field should be strings, but some datasets may also
138 support additional indexing schemes within each field (e.g. column slice 162 support additional indexing schemes within each field (e.g. column slice
139 of a matrix-like field). 163 of a matrix-like field).
140 """ 164 """
141 raise AbstractFunction() 165 raise AbstractFunction()
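
Under the ([old_field1, old_field2, ...], new_field) semantics above, a call that concatenates two old fields into one new field and renames a third might look like this sketch; dataset and the field names are illustrative, and the surrounding code is only an assumed usage:

    # each positional argument is one new_field_spec
    renamed = dataset.rename((['x0', 'x1'], 'input'),
                             (['y'], 'target'))
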
166
167
168 def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
169 """
170 Return a dataset that contains as fields the results of applying
171 the given function (example-wise) to the specified input_fields. The
172 function should return a sequence whose elements will be stored in
173 fields whose names are given in the output_fields list. If copy_inputs
174 is True then the resulting dataset will also contain the fields of self.
175 If accept_minibatches, then the function may be called
176 with minibatches as arguments (what is returned by the minibatches
177 iterator). In any case, the computations may be delayed until the examples
178 of the resulting dataset are requested. If cache is True, then
179 once the output fields for some examples have been computed, they
180 are cached (to avoid recomputation if the same examples are again
181 requested).
182 """
183 return ApplyFunctionDataSet(self, function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
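
For instance, appending a squared-norm field to every example might look like the sketch below. The function squared_norm and the field names are assumptions, not part of the changeset, and it assumes the 'input' field is a numpy row; the computation may be deferred and cached as the docstring describes:

    def squared_norm(x):
        # example-wise function: receives the 'input' field of one example,
        # returns one value per name in output_fields
        return [(x * x).sum()]

    augmented = dataset.apply_function(squared_norm,
                                       input_fields=['input'],
                                       output_fields=['norm2'],
                                       copy_inputs=True,
                                       accept_minibatches=False,
                                       cache=True)
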
142 184
143 class RenamingDataSet(DataSet): 185 class RenamingDataSet(DataSet):
144 """A DataSet that wraps another one, and makes it look like the field names 186 """A DataSet that wraps another one, and makes it look like the field names
145 are different 187 are different
146 188
285 # - try to see if we can avoid the copy? 327 # - try to see if we can avoid the copy?
286 328
287 class ArrayDataSet(FiniteDataSet): 329 class ArrayDataSet(FiniteDataSet):
288 """ 330 """
289 An ArrayDataSet behaves like a numpy array but adds the notion of named fields 331 An ArrayDataSet behaves like a numpy array but adds the notion of named fields
290 from DataSet (and the ability to view multiple field values as an 'Example'). 332 from DataSet (and the ability to view the values of multiple fields as an 'Example').
291 It is a fixed-length and fixed-width dataset 333 It is a fixed-length and fixed-width dataset
292 in which each element is a numpy array or a number, hence the whole 334 in which each element is a fixed-dimension numpy array or a number, hence the whole
293 dataset corresponds to a numpy array. Fields 335 dataset corresponds to a numpy array. Fields
294 must correspond to a slice of array columns. If the dataset has fields, 336 must correspond to a slice of array columns. If the dataset has fields,
295 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. 337 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
296 Any dataset can also be converted to a numpy array (losing the notion of fields) 338 Any dataset can also be converted to a numpy array (losing the notion of fields)
297 by the numpy.array(dataset) call. 339 by the numpy.array(dataset) call.
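
A sketch of the column-slice convention follows. The fields keyword mirrors the fields=self.fields usage visible further down in this file, but the exact constructor arguments are an assumption:

    import numpy

    data = numpy.ones((100, 3))                 # 100 examples, 3 columns
    ds = ArrayDataSet(data,
                      fields={'input':  slice(0, 2),    # columns 0 and 1
                              'target': slice(2, 3)})   # column 2
    example = ds[0]                             # a one-row ArrayDataSet
    whole = numpy.array(ds)                     # field names are lost here
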
380 if not fieldslice.start or not fieldslice.step: 422 if not fieldslice.start or not fieldslice.step:
381 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) 423 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
382 # and coherent with the data array 424 # and coherent with the data array
383 assert fieldslice.start >= 0 and fieldslice.stop <= cols 425 assert fieldslice.start >= 0 and fieldslice.stop <= cols
384 426
385 def __iter__(self):
386 return self.zip(*self.fieldNames())
387
388 def minibatches(self, 427 def minibatches(self,
389 fieldnames = DataSet.minibatches_fieldnames, 428 fieldnames = DataSet.minibatches_fieldnames,
390 minibatch_size = DataSet.minibatches_minibatch_size, 429 minibatch_size = DataSet.minibatches_minibatch_size,
391 n_batches = DataSet.minibatches_n_batches): 430 n_batches = DataSet.minibatches_n_batches):
392 """ 431 """
444 def __getslice__(self,*args): 483 def __getslice__(self,*args):
445 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" 484 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
446 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) 485 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
447 486
448 def __array__(self): 487 def __array__(self):
449 """Return an view of this dataset which is an numpy.ndarray 488 """Return a view of this dataset which is an numpy.ndarray (i.e. losing
489 the identity and name of fields within the dataset).
450 490
451 Numpy uses this special function name to retrieve an ndarray view for 491 Numpy uses this special function name to retrieve an ndarray view for
452 functions such as numpy.sum, numpy.dot, numpy.asarray, etc. 492 functions such as numpy.sum, numpy.dot, numpy.asarray, etc.
453 493
454 If this dataset has no fields, then we simply return self.data, 494 If this dataset has no fields, then we simply return self.data,
455 otherwise things are complicated. 495 otherwise things are complicated.
456 - why do we want this behaviour when there are fields? (JB) 496 - why do we want this behaviour when there are fields? (JB)
497 - for convenience and completeness (but maybe it would make
498 more sense to implement this through a 'field-merging'
499 dataset). (YB)
457 """ 500 """
458 if not self.fields: 501 if not self.fields:
459 return self.data 502 return self.data
460 # else, select subsets of columns mapped by the fields 503 # else, select subsets of columns mapped by the fields
461 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) 504 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
495 # copy the field here 538 # copy the field here
496 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] 539 result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
497 c+=slice_width 540 c+=slice_width
498 return result 541 return result
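
Concretely, this hook is what lets numpy routines consume a fielded dataset directly; a sketch, reusing the hypothetical ds from the ArrayDataSet example above:

    total = numpy.sum(ds)       # numpy obtains an ndarray via __array__
    flat = numpy.asarray(ds)    # field columns are copied side by side
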
499 542
500 543 class ApplyFunctionDataSet(DataSet):
544 """
545 A dataset that contains as fields the results of applying
546 a given function (example-wise) to specified input_fields of a source
547 dataset. The function should return a sequence whose elements will be stored in
548 fields whose names are given in the output_fields list. If copy_inputs
549 is True then the resulting dataset will also contain the fields of the source
550 dataset. If accept_minibatches, then the function expects
551 minibatches as arguments (what is returned by the minibatches
552 iterator). In any case, the computations may be delayed until the examples
553 of self are requested. If cache is True, then
554 once the output fields for some examples have been computed, they
555 are cached (to avoid recomputation if the same examples are again requested).
556 """
557 def __init__(self, src, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
558 DataSet.__init__(self)
559 self.src=src
560 self.function=function
561 self.input_fields=input_fields
562 self.output_fields=output_fields
563 self.copy_inputs=copy_inputs
564 self.accept_minibatches=accept_minibatches
565 src_fieldnames = src.fieldNames()
566 if copy_inputs:
567 for src_field in src_fieldnames:
568 assert src_field not in output_fields
569 self.fieldnames=src_fieldnames+output_fields
570 else:
571 self.fieldnames=output_fields
572 for input_field in input_fields:
573 assert input_field in src_fieldnames
574 self.cache=cache
575 if cache:
576 # maybe a fixed-size array kind of structure would be more efficient than a list
577 # in the case where src is FiniteDataSet. -YB
578 self.cached_examples = []
579
580 def fieldNames(self): return self.fieldnames
581
582 def minibatches(self,
583 fieldnames = DataSet.minibatches_fieldnames,
584 minibatch_size = DataSet.minibatches_minibatch_size,
585 n_batches = DataSet.minibatches_n_batches):
586
587 class Iterator(LookupList):
588
589 def __init__(self,dataset):
590 LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
591 self.dataset=dataset
592 if dataset.copy_inputs:
593 src_fields=dataset.src.fieldNames()
594 else:
595 src_fields=dataset.input_fields
596 self.src_iterator=self.dataset.src.minibatches(src_fields,minibatch_size,n_batches)
597
598 def __iter__(self):
599 return self
600
601 def next(self):
602 src_examples = self.src_iterator.next()
603 if self.dataset.copy_inputs:
604 function_inputs = src_examples
605 else:
606 function_inputs = [src_examples[field_name]
607 for field_name in self.dataset.input_fields]
608 return self.dataset.function(*function_inputs)
609
610 for fieldname in fieldnames:
611 assert fieldname in self.fieldnames
612 return Iterator(self)
613
614
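
Since ApplyFunctionDataSet is flagged as untested and in the works, the sketch below only indicates how its minibatch path is presumably meant to be driven once complete, reusing the hypothetical squared_norm from the apply_function sketch above; all names are illustrative:

    transformed = ApplyFunctionDataSet(base_dataset, squared_norm,
                                       input_fields=['input'],
                                       output_fields=['norm2'])
    for outputs in transformed.minibatches(['norm2'], minibatch_size=20):
        pass   # each iteration yields the function outputs for 20 examples
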