# HG changeset patch
# User Yoshua Bengio
# Date 1210025672 14400
# Node ID c4726e19b8ec372d98ff14e93c47c0897d7e06db
# Parent aa9e786ee849e421c29dce429d25d3db50230879
Finished first draft of TLearner

diff -r aa9e786ee849 -r c4726e19b8ec learner.py
--- a/learner.py	Mon May 05 11:49:40 2008 -0400
+++ b/learner.py	Mon May 05 18:14:32 2008 -0400
@@ -57,7 +57,7 @@
         """
         raise NotImplementedError
 
-    def attribute_names(self):
+    def attributeNames(self):
         """
         A Learner may have attributes that it wishes to export to other objects. To automate
         such export, sub-classes should define here the names (list of strings) of these attributes.
@@ -85,7 +85,111 @@
       or by a stats collector.
     - defaultOutputFields(input_fields): return a list of default dataset output fields when
       None are provided by the caller of use.
-
-
-
+    - update_start(), update_end(), update_minibatch(minibatch): functions
+      executed at the beginning, the end, and in the middle
+      (for each minibatch) of the update method. This model only
+      works for 'online' or one-shot learning that requires
+      going only once through the training data. For more complicated
+      models, more specialized subclasses of TLearner should be used
+      or a learning-algorithm specific update method should be defined.
+
+    The following naming convention is assumed and important.
+    Attributes whose names are listed in attributeNames() can be of any type,
+    but those that can be referenced as input/output dataset fields or as
+    output attributes in 'use' or as input attributes in the stats collector
+    should be associated with a Theano Result variable. If the exported attribute
+    name is <name>, the corresponding Result name (an internal attribute of
+    the TLearner, created in the sub-class constructor) should be _<name>.
+    Typically <name> will be a numpy ndarray and _<name> will be the corresponding
+    Theano Tensor (for symbolic manipulation).
     """
+
+    def __init__(self):
+        Learner.__init__(self)
+        # cache of compiled use functions, keyed by (input_fields, output_fields)
+        self.use_functions_dictionary = {}
+
+    def _minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
+        """
+        Private helper function called by the generic TLearner.use. It returns a function
+        that can map the given input fields to the given output fields (along with the
+        attributes that the stats collector needs for its computation).
+        """
+        if not output_fields:
+            output_fields = self.defaultOutputFields(input_fields)
+        if stats_collector:
+            stats_collector_inputs = stats_collector.inputUpdateAttributes()
+            for attribute in stats_collector_inputs:
+                if attribute not in input_fields:
+                    output_fields.append(attribute)
+        key = (input_fields,output_fields)
+        if key not in self.use_functions_dictionary:
+            self.use_functions_dictionary[key]=Function(self._names2attributes(input_fields),
+                                                        self._names2attributes(output_fields))
+        return self.use_functions_dictionary[key]
+
+    def attributes(self,return_copy=False):
+        """
+        Return a list with the values of the learner's attributes (or optionally, a deep copy).
+        """
+        return self._names2attributes(self.attributeNames(),return_copy=return_copy)
+
+    def _names2attributes(self,names,return_Result=False, return_copy=False):
+        """
+        Private helper function that maps a list of attribute names to a list
+        of values (optionally deep copies) or of the Result objects that own these values.
+ """ + if return_Result: + if return_copy: + return [copy.deepcopy(self.__getattr__(name)) for name in names] + else: + return [self.__getattr__(name) for name in names] + else: + if return_copy: + return [copy.deepcopy(self.__getattr__(name).data) for name in names] + else: + return [self.__getattr__(name).data for name in names] + + def use(self,input_dataset,output_fieldnames=None,output_attributes=None, + test_stats_collector=None,copy_inputs=True): + """ + The learner tries to compute in the output dataset the output fields specified + """ + minibatchwise_use_function = _minibatchwise_use_functions(input_dataset.fieldNames(), + output_fieldnames, + test_stats_collector) + virtual_output_dataset = ApplyFunctionDataSet(input_dataset, + minibatchwise_use_function, + True,DataSet.numpy_vstack, + DataSet.numpy_hstack) + # actually force the computation + output_dataset = CachedDataSet(virtual_output_dataset,True) + if copy_inputs: + output_dataset = input_dataset | output_dataset + # copy the wanted attributes in the dataset + if output_attributes: + assert set(output_attributes) <= set(self.attributeNames()) + output_dataset.setAttributes(output_attributes, + self._names2attributes(output_attributes,return_copy=True)) + if test_stats_collector: + test_stats_collector.update(output_dataset) + output_dataset.setAttributes(test_stats_collector.attributeNames(), + test_stats_collector.attributes()) + return output_dataset + + def update_start(self): pass + def update_end(self): pass + def update_minibatch(self,minibatch): + raise AbstractFunction() + def update(self,training_set,train_stats_collector=None): + + self.update_start() + for minibatch in training_set.minibatches(self.training_set_input_fields, + minibatch_size=self.minibatch_size): + self.update_minibatch(minibatch) + if train_stats_collector: + minibatch_set = minibatch.examples() + minibatch_set.setAttributes(self.attributeNames(),self.attributes()) + train_stats_collector.update(minibatch_set) + self.update_end() + return self.use + diff -r aa9e786ee849 -r c4726e19b8ec linear_regression.py --- a/linear_regression.py Mon May 05 11:49:40 2008 -0400 +++ b/linear_regression.py Mon May 05 18:14:32 2008 -0400 @@ -11,7 +11,12 @@ Implement linear regression, with or without L2 regularization (the former is called Ridge Regression and the latter Ordinary Least Squares). - The predictor is obtained analytically. + The predictor parameters are obtained analytically from the training set. + Training can proceed sequentially (with multiple calls to update with + different disjoint subsets of the training sets). After each call to + update the predictor is ready to be used (and optimized for the union + of all the training sets passed to update since construction or since + the last call to forget). The L2 regularization coefficient is obtained analytically. 
     For each (input[t],output[t]) pair in a minibatch,::
 
@@ -45,22 +50,25 @@
-    - optional input attributes (optionally expected as input_dataset attributes)
-      - 'lambda' (only used by update)
-      - 'b' (only used by use)
-      - 'W' (only used by use)
-
-    - optional output attributes (available in self and optionally in output dataset)
-
-      - 'b' (only set by update)
-      - 'W' (only set by update)
-      - 'regularization_term' (only set by update)
-      - 'XtX' (only set by update)
-      - 'XtY' (only set by update)
+    - optional attributes (optionally expected as input_dataset attributes)
+      (warning, this may be dangerous, the 'use' method will use those provided in the
+      input_dataset rather than those learned during 'update'; currently no support
+      for providing these to update):
+      - 'lambda'
+      - 'b'
+      - 'W'
+      - 'regularization_term'
+      - 'XtX'
+      - 'XtY'
     """
 
+    def attributeNames(self):
+        return ["lambda","b","W","regularization_term","XtX","XtY"]
+
+
     # definitions specific to linear regression:
+
     def global_inputs(self):
         self.lambda = as_scalar(0.,'lambda')
         self.theta = t.matrix('theta')
 
@@ -107,63 +115,6 @@
     # general machinery based on these functions
 
-    def minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
-        if not output_fields:
-            output_fields = self.defaultOutputFields(input_fields)
-        if stats_collector:
-            stats_collector_inputs = stats_collector.inputUpdateAttributes()
-            for attribute in stats_collector_inputs:
-                if attribute not in input_fields:
-                    output_fields.append(attribute)
-        key = (input_fields,output_fields)
-        if key not in self.use_functions_dictionary:
-            self.use_functions_dictionary[key]=Function(self.names2attributes(input_fields),
-                                                        self.names2attributes(output_fields))
-        return self.use_functions_dictionary[key]
-
-    def attributes(self,return_copy=False):
-        return self.names2attributes(self.attributeNames())
-
-    def names2attributes(self,names,return_Result=False, return_copy=False):
-        if return_Result:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name)) for name in names]
-            else:
-                return [self.__getattr__(name) for name in names]
-        else:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name).data) for name in names]
-            else:
-                return [self.__getattr__(name).data for name in names]
-
-    def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True):
-        minibatchwise_use_function = minibatchwise_use_functions(input_dataset.fieldNames(),output_fieldnames,test_stats_collector)
-        virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
-                                                      minibatchwise_use_function,
-                                                      True,DataSet.numpy_vstack,
-                                                      DataSet.numpy_hstack)
-        # actually force the computation
-        output_dataset = CachedDataSet(virtual_output_dataset,True)
-        if copy_inputs:
-            output_dataset = input_dataset | output_dataset
-        # compute the attributes that should be copied in the dataset
-        output_dataset.setAttributes(self.attributeNames(),self.attributes(return_copy=True))
-        if test_stats_collector:
-            test_stats_collector.update(output_dataset)
-            for attribute in test_stats_collector.attributeNames():
-                output_dataset[attribute] = copy.deepcopy(test_stats_collector[attribute])
-        return output_dataset
-
-    def update(self,training_set,train_stats_collector=None):
-        self.update_start()
-        for minibatch in training_set.minibatches(self.training_set_input_fields, minibatch_size=self.minibatch_size):
-            self.update_minibatch(minibatch)
-            if train_stats_collector:
-                minibatch_set = minibatch.examples()
-                minibatch_set.setAttributes(self.attributeNames(),self.attributes())
-                train_stats_collector.update(minibatch_set)
-        self.update_end()
-        return self.use
 
     def __init__(self,lambda=0.,max_memory_use=500):
         """
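
For readers skimming the patch, the update/use protocol that TLearner now defines can be exercised roughly as follows. This is only a minimal sketch, not part of the changeset: MyTLearner, train_set and test_set are hypothetical placeholders standing in for a concrete subclass and for DataSet objects with the interface used above::

    # Hypothetical TLearner subclass: only the minibatch hook and the two
    # attributes that TLearner.update reads are filled in.
    class MyTLearner(TLearner):
        minibatch_size = 32
        training_set_input_fields = ["input", "target"]

        def update_minibatch(self, minibatch):
            # accumulate statistics / adjust the attributes named in attributeNames()
            ...

    learner = MyTLearner()
    predict = learner.update(train_set)    # update returns self.use, ready to apply
    output_dataset = predict(test_set, output_fieldnames=None)  # None -> defaultOutputFields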
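
The sequential, analytic training described in the new linear_regression.py docstring amounts to accumulating the sufficient statistics XtX and XtY over successive calls to update and re-solving the regularized normal equations each time. The following standalone numpy sketch illustrates that idea; it borrows the attribute names 'XtX', 'XtY', 'lambda' and 'W' from the docstring but is not the repository code (for brevity the bias is folded into W via an appended constant input, and lambda also penalizes that bias row)::

    import numpy as np

    class SequentialRidge(object):
        """Toy illustration of analytic, minibatch-wise ridge regression."""
        def __init__(self, n_inputs, n_outputs, lambda_=0.):
            self.lambda_ = lambda_
            # sufficient statistics, accumulated over all minibatches seen so far
            self.XtX = np.zeros((n_inputs + 1, n_inputs + 1))  # +1 for the bias column
            self.XtY = np.zeros((n_inputs + 1, n_outputs))

        def update(self, X, Y):
            # append a constant 1 column so the bias is the last row of W
            Xb = np.hstack([X, np.ones((X.shape[0], 1))])
            self.XtX += Xb.T @ Xb
            self.XtY += Xb.T @ Y
            # regularized normal equations: (XtX + lambda*I) W = XtY
            reg = self.lambda_ * np.eye(self.XtX.shape[0])
            self.W = np.linalg.solve(self.XtX + reg, self.XtY)
            return self.W  # predictor is usable after every update, as in the docstring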