pylearn: diff dataset.py @ 20:266c68cb6136
Minor edits, plus an untested ApplyFunctionDataset for the GradientLearner that is in the works.
| field | value |
|---|---|
| author | bengioy@bengiomac.local |
| date | Mon, 07 Apr 2008 09:48:39 -0400 |
| parents | 57f4015e2e09 |
| children | b6b36f65664f |
--- a/dataset.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/dataset.py	Mon Apr 07 09:48:39 2008 -0400
@@ -10,12 +10,13 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feed on-line learning.
+    feeds on-line learning.

     To iterate over examples, there are several possibilities:
-    - for i in dataset.zip(field1, field2,field3, ...)
-    - for i in dataset.minibatches(N, field1, field2, ...)
-    - for i in dataset
+    - for example in dataset.zip([field1, field2,field3, ...])
+    - for val1,val2,val3 in dataset.zip([field1, field2,field3])
+    - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N)
+    - for example in dataset
     Each of these is documented below.

     Note: For a dataset of fixed and known length, which can implement item
@@ -36,19 +37,19 @@
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self. Every field of "i" will give access to
-        a the field of a single example.  Fields should be accessible via
-        i[identifier], but the derived class is free to accept any type of
-        identifier, and add extra functionality to the iterator.
+        a field of a single example.  Fields should be accessible via
+        i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free
+        to accept any type of identifier, and add extra functionality to the iterator.
         """
-        raise AbstractFunction()
+        return self.zip(*self.fieldNames())

     def zip(self, *fieldnames):
         """
         Supports two forms of syntax:

-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.zip([f1, f2, f3]): ...

-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...

         Using the first syntax, "i" will be an indexable object, such as a list,
         tuple, or Example instance, such that on every iteration, i[0] is the f1
@@ -120,7 +121,27 @@
         # This list may not be finite; what would make sense in the use you have
         # in mind?
         # -JB
-        """Return the list of field names in the examples of this dataset."""
+        #James-
+        # You are right. I had put this to be able to iterate over the fields
+        # but maybe an iterator mechanism (over fields rather than examples)
+        # would be more appropriate. Fieldnames are needed in general
+        # by the iterators over examples or minibatches, to construct
+        # examples or minibatches with the corresponding names as attributes.
+        # -YB
+        """
+        Return an iterator (an object with an __iter__ method) that
+        iterates over the names of the fields. As a special cases,
+        a list or a tuple of field names can be returned.
+        """"
+        # Note that some datasets
+        # may have virtual fields and support a virtually infinite number
+        # of possible field names. In that case, fieldNames() should
+        # either raise an error or iterate over a particular set of
+        # names as appropriate. Another option would be to iterate
+        # over the sub-datasets comprising a single field at a time.
+        # I am not sure yet what is most appropriate.
+        # -YB
+        """
         raise AbstractFunction()

     def rename(*new_field_specifications):
@@ -129,6 +150,9 @@
         # Wouldn't this functionality be easier to provide via a
         # RenamingDataSet, such as the one I've written below?
         # -JB
+        # You are right. Whichever implementation, however, we need a generic way to
+        # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics.
+        # -YB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned
         dataset). The minimal syntax that should be supported is the following:
@@ -140,6 +164,24 @@
         """
         raise AbstractFunction()

+
+    def apply_function(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        """
+        Return a dataset that contains as fields the results of applying
+        the given function (example-wise) to the specified input_fields. The
+        function should return a sequence whose elements will be stored in
+        fields whose names are given in the output_fields list. If copy_inputs
+        is True then the resulting dataset will also contain the fields of self.
+        If accept_minibatches, then the function may be called
+        with minibatches as arguments (what is returned by the minibatches
+        iterator). In any case, the computations may be delayed until the examples
+        of the resulting dataset are requested. If cache is True, then
+        once the output fields for some examples have been computed, then
+        are cached (to avoid recomputation if the same examples are again
+        requested).
+        """
+        return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+
 class RenamingDataSet(DataSet):
     """A DataSet that wraps another one, and makes it look like the field names
     are different
@@ -287,9 +329,9 @@
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
-    from DataSet (and the ability to view multiple field values as an 'Example').
+    from DataSet (and the ability to view the values of multiple fields as an 'Example').
     It is a fixed-length and fixed-width dataset
-    in which each element is a numpy array or a number, hence the whole
+    in which each element is a fixed dimension numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
@@ -382,9 +424,6 @@
         # and coherent with the data array
         assert fieldslice.start >= 0 and fieldslice.stop <= cols
-    def __iter__(self):
-        return self.zip(*self.fieldNames())
-
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
@@ -446,7 +485,8 @@
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)

     def __array__(self):
-        """Return an view of this dataset which is an numpy.ndarray
+        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
+        the identity and name of fields within the dataset).

         Numpy uses this special function name to retrieve an ndarray view for
         function such as numpy.sum, numpy.dot, numpy.asarray, etc.
@@ -454,6 +494,9 @@
         If this dataset has no fields, then we simply return self.data,
         otherwise things are complicated.
         - why do we want this behaviour when there are fields? (JB)
+        - for convenience and completeness (but maybe it would make
+          more sense to implement this through a 'field-merging'
+          dataset). (YB)
         """
         if not self.fields:
             return self.data
@@ -497,4 +540,75 @@
             c+=slice_width
         return result

+class ApplyFunctionDataset(DataSet):
+    """
+    A dataset that contains as fields the results of applying
+    a given function (example-wise) to specified input_fields of a source
+    dataset. The function should return a sequence whose elements will be stored in
+    fields whose names are given in the output_fields list. If copy_inputs
+    is True then the resulting dataset will also contain the fields of the source.
+    dataset. If accept_minibatches, then the function expects
+    minibatches as arguments (what is returned by the minibatches
+    iterator). In any case, the computations may be delayed until the examples
+    of self are requested. If cache is True, then
+    once the output fields for some examples have been computed, then
+    are cached (to avoid recomputation if the same examples are again requested).
+    """
+    def __init__(src,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        DataSet.__init__(self)
+        self.src=src
+        self.function=function
+        self.input_fields=input_fields
+        self.output_fields=output_fields
+        self.copy_inputs=copy_inputs
+        self.accept_minibatches=accept_minibatches
+        src_fieldnames = src.fieldNames()
+        if copy_inputs:
+            for src_field in src_fieldnames:
+                assert src_field not in output_fields
+            self.fieldnames=src_fieldnames+output_fields
+        else:
+            self.fieldnames=output_fields
+        for input_field in input_fields:
+            assert input_field in src_fieldnames
+        self.cache=cache
+        if cache:
+            # maybe a fixed-size array kind of structure would be more efficient than a list
+            # in the case where src is FiniteDataSet. -YB
+            self.cached_examples = []
+    def fieldNames(self): return self.fieldnames
+
+    def minibatches(self,
+                    fieldnames = DataSet.minibatches_fieldnames,
+                    minibatch_size = DataSet.minibatches_minibatch_size,
+                    n_batches = DataSet.minibatches_n_batches):
+
+        class Iterator(LookupList):
+
+            def __init__(self,dataset):
+                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                self.dataset=dataset
+                if dataset.copy_inputs:
+                    src_fields=dataset.fieldNames()
+                else:
+                    src_fields=dataset.input_fields
+                self.src_iterator=self.src.minibatches(src_fields,minibatch_size,n_batches)
+
+            def __iter__(self):
+                return self
+
+            def next(self):
+                src_examples = self.src_iterator.next()
+                if self.dataset.copy_inputs:
+                    function_inputs =
+                        [src_examples[field_name] for field_name in self.dataset.input_fields])
+                return self.dataset.function(*function_inputs)
+
+        for fieldname in fieldnames:
+            assert fieldname in self.input_fields
+        return Iterator(self)
+
+
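
For readers skimming the patch, here is a rough sketch of the iteration style the updated DataSet docstrings describe. It is illustrative only: `dataset`, `'input'` and `'target'` are hypothetical names, and the new docstrings pass a list of field names to zip() and minibatches() while the zip() signature in this revision still reads `*fieldnames`, so the exact call form may differ from what is shown.

```python
def iterate_examples(dataset):
    """Sketch of the iteration idioms documented in this patch; not tested
    against this revision, and 'input'/'target' are illustrative field names."""
    # example-wise iteration over selected fields, list-of-fieldnames form
    for example in dataset.zip(['input', 'target']):
        x, y = example[0], example[1]   # indexable in fieldNames() order

    # the same iterator, unpacked directly into one variable per field
    for x, y in dataset.zip(['input', 'target']):
        pass

    # minibatch iteration: each entry holds minibatch_size values of one field
    for minibatch in dataset.minibatches(['input', 'target'], minibatch_size=16):
        x_batch, y_batch = minibatch[0], minibatch[1]

    # plain iteration over all fields; the patch now implements __iter__
    # as return self.zip(*self.fieldNames())
    for example in dataset:
        pass
```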
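The centrepiece of the commit is the apply_function() hook and the (explicitly untested) ApplyFunctionDataset behind it. A possible call, following the new docstring, might look like the sketch below; 'input' and 'normalized_input' are made-up field names, and the sketch assumes the obvious correction of adding `self` as the first parameter of apply_function(), which the committed signature omits.

```python
def add_normalized_field(dataset):
    """Sketch of using DataSet.apply_function() as documented in this patch.

    Derives a new dataset whose 'normalized_input' field is computed
    example-wise from 'input'; names and the normalization are illustrative.
    """
    def normalize(x):
        # the applied function returns a sequence with one element per
        # requested output field
        return (x / (1.0 + abs(x)),)

    return dataset.apply_function(normalize,
                                  input_fields=['input'],
                                  output_fields=['normalized_input'],
                                  copy_inputs=True,
                                  accept_minibatches=False,
                                  cache=True)
```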
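As committed, ApplyFunctionDataset.minibatches() cannot run: the outer `__init__` drops `self`, `next()` contains a dangling assignment and a stray closing parenthesis, and the inner Iterator reaches for `self.src` where the source dataset actually lives on the wrapping ApplyFunctionDataset. A stand-alone sketch of the intended minibatch logic is given below; it is not part of the commit, it replaces the nested Iterator class with a plain generator and drops the LookupList bookkeeping for brevity, and it assumes a dataset object with the attributes the patch defines (src, function, input_fields, copy_inputs).

```python
def apply_function_minibatches(dataset, fieldnames, minibatch_size, n_batches):
    """Hypothetical rewrite of the minibatch loop that
    ApplyFunctionDataset.minibatches() tries to build in this patch."""
    if dataset.copy_inputs:
        src_fields = dataset.fieldNames()
    else:
        src_fields = dataset.input_fields
    # the source dataset hangs off the wrapping ApplyFunctionDataset,
    # so iterate over dataset.src rather than self.src
    for src_examples in dataset.src.minibatches(src_fields, minibatch_size, n_batches):
        if dataset.copy_inputs:
            function_inputs = src_examples
        else:
            # keep only the declared input fields of the source minibatch
            function_inputs = [src_examples[name] for name in dataset.input_fields]
        yield dataset.function(*function_inputs)
```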