# HG changeset patch # User bengioy@grenat.iro.umontreal.ca # Date 1207943178 14400 # Node ID 46c5c90019c2b8c87c9dd9120da21e1782a318eb # Parent 541a273bc89f91bb459c988ad48ac038c9b5d1cb Changed apply_function so that it propagates methods of the source. diff -r 541a273bc89f -r 46c5c90019c2 _test_dataset.py --- a/_test_dataset.py Fri Apr 11 13:08:51 2008 -0400 +++ b/_test_dataset.py Fri Apr 11 15:46:18 2008 -0400 @@ -80,6 +80,17 @@ b=a.rename({'xx':'x','zz':'z'}) self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y')) +class T_applyfunctiondataset(unittest.TestCase): + def setUp(self): + numpy.random.seed(123456) + + def test_function(self): + n = numpy.random.rand(3,8) + a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)}) + b=a.apply_function(lambda x,y: (x+y,x+1), ['x','y'], ['x+y','x+1'], False,False,False) + print b.fieldNames() + print b('x+y') + if __name__ == '__main__': unittest.main() diff -r 541a273bc89f -r 46c5c90019c2 dataset.py --- a/dataset.py Fri Apr 11 13:08:51 2008 -0400 +++ b/dataset.py Fri Apr 11 15:46:18 2008 -0400 @@ -150,6 +150,7 @@ of the iterators). """ raise AbstractFunction() + def merge_fields(self,*specifications): """ @@ -182,7 +183,7 @@ def rename(self,rename_dict): """ - Return a new dataset that renames fields, using a dictionnary that maps old field + Changes a dataset into one that renames fields, using a dictionary that maps old field names to new field names. The only fields visible by the returned dataset are those whose names are keys of the rename_dict. 
""" @@ -194,9 +195,9 @@ SelfRenamingDataSet.__init__(self,self,rename_dict) return self - def applyFunction(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): + def apply_function(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): """ - Return a dataset that contains as fields the results of applying + Changes a dataset into one that contains as fields the results of applying the given function (example-wise) to the specified input_fields. The function should return a sequence whose elements will be stored in fields whose names are given in the output_fields list. If copy_inputs @@ -209,7 +210,13 @@ are cached (to avoid recomputation if the same examples are again requested). """ - return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache) + self_class = self.__class__ + class SelfApplyFunctionDataSet(ApplyFunctionDataSet,self_class): + pass + self.__class__ = SelfApplyFunctionDataSet + # set the required additional fields + ApplyFunctionDataSet.__init__(self,self,function, input_fields, output_fields, copy_inputs, accept_minibatches, cache) + return self class FiniteLengthDataSet(DataSet): @@ -223,7 +230,21 @@ def __len__(self): """len(dataset) returns the number of examples in the dataset.""" raise AbstractFunction() - + + def __call__(self,fieldname_or_fieldnames): + """ + Extract one or more fields. This may be an expensive operation when the + dataset is large. It is not the recommanded way to access individual values + (use the iterators instead). If the argument is a string fieldname, then the result + is a sequence (iterable object) of values for that field, for the whole dataset. If the + argument is a list of field names, then the result is a 'batch', i.e., an Example with keys + corresponding to the given field names and values being iterable objects over the + individual example values. 
+ """ + if type(fieldname_or_fieldnames) is string: + minibatch = self.minibatches([fieldname_or_fieldnames],len(self)).next() + return minibatch[fieldname_or_fieldnames] + return self.minibatches(fieldname_or_fieldnames,len(self)).next() class SliceableDataSet(DataSet): """ @@ -473,20 +494,6 @@ if n_batches is None: n_batches = len(self) / minibatch_size return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) - def __getattr__(self,fieldname): - """ - Return a numpy array with the content associated with the given field name. - If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension - than the dataset itself) is returned. - """ - if len(self.data)==1: - return self.data[0,self.fields[fieldname]] - return self.data[:,self.fields[fieldname]] - - def __call__(self,*fieldnames): - """Return a sub-dataset containing only the given fieldnames as fields.""" - return ArrayDataSet(self.data,fields=LookupList(fieldnames,[self.fields[fieldname] for fieldname in fieldnames])) - def fieldNames(self): """Return the list of field names that are supported by getattr and hasField.""" return self.fields.keys() @@ -560,7 +567,7 @@ i=j return slice(start,stop,step) -class ApplyFunctionDataSet(DataSet): +class ApplyFunctionDataSet(FiniteWidthDataSet): """ A dataset that contains as fields the results of applying a given function (example-wise) to specified input_fields of a source @@ -603,6 +610,11 @@ # in the case where src is FiniteDataSet. 
-YB self.cached_examples = [] + def fieldNames(self): + if self.copy_inputs: + return self.output_fields + self.src.fieldNames() + return self.output_fields + def minibatches(self, fieldnames = DataSet.minibatches_fieldnames, minibatch_size = DataSet.minibatches_minibatch_size, diff -r 541a273bc89f -r 46c5c90019c2 gradient_learner.py --- a/gradient_learner.py Fri Apr 11 13:08:51 2008 -0400 +++ b/gradient_learner.py Fri Apr 11 15:46:18 2008 -0400 @@ -59,7 +59,7 @@ self.use_function[use_function_key]=Function(input_variables,output_variables) use_function = self.use_functions[use_function_key] # return a dataset that computes the outputs - return input_dataset.applyFunction(use_function,input_fields,output_fields,copy_inputs,compute_now=True) + return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,compute_now=True) class StochasticGradientDescent(object):