# HG changeset patch # User bengioy@bengiomac.local # Date 1209994530 14400 # Node ID 3499918faa9db49c131674cd46473d472ff0eb99 # Parent 1e2bb5bad636c61130405a8e246e46b6e2652a68 In the middle of designing TLearner diff -r 1e2bb5bad636 -r 3499918faa9d dataset.py --- a/dataset.py Sun May 04 15:09:22 2008 -0400 +++ b/dataset.py Mon May 05 09:35:30 2008 -0400 @@ -88,6 +88,9 @@ the name . The following properties should be supported: - 'description': a textual description or name for the dataset - 'fieldtypes': a list of types (one per field) + A DataSet may have other attributes that it makes visible to other objects. These are + used to store information that is not example-wise but global to the dataset. + The list of names of these attributes is given by the attributeNames() method. Datasets can be concatenated either vertically (increasing the length) or horizontally (augmenting the set of fields), if they are compatible, using @@ -114,7 +117,7 @@ or other properties of the dataset or associated with the dataset or the result of a computation stored in a dataset. These can be accessed through the [key] syntax when key is a string (or more specifically, neither an integer, a slice, nor a list). - + A DataSet sub-class should always redefine the following methods: - __len__ if it is not a stream - fieldNames @@ -125,6 +128,11 @@ - hasFields - __getitem__ may not be feasible with some streams - __iter__ + A sub-class should also append attributes to self._attribute_names + (the default value returned by attributeNames()). + By convention, attributes not in attributeNames() should have a name + starting with an underscore. + @todo enforce/test that convention! 
""" numpy_vstack = lambda fieldname,values: return numpy.vstack(values) @@ -136,6 +144,15 @@ description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" self.description=description self.fieldtypes=fieldtypes + self._attribute_names = ["description"] + if fieldtypes: + self._attribute_names.append("fieldtypes") + + def attributeNames(self): return self._attribute_names + + def setAttributes(self,attribute_names,attribute_values): + for name,value in zip(attribute_names,attribute_values): + self.__setattr__(name,value) class MinibatchToSingleExampleIterator(object): """ diff -r 1e2bb5bad636 -r 3499918faa9d learner.py --- a/learner.py Sun May 04 15:09:22 2008 -0400 +++ b/learner.py Mon May 05 09:35:30 2008 -0400 @@ -57,4 +57,35 @@ """ raise NotImplementedError + def attribute_names(self): + """ + A Learner may have attributes that it wishes to export to other objects. To automate + such export, sub-classes should define here the names (list of strings) of these attributes. + """ + return [] +class TLearner(Learner): + """ + TLearner is a virtual class of Learners that attempts to factor out of the definition + of a learner the steps that are common to many implementations of learning algorithms, + so as to leave only "the equations" to define in particular sub-classes, using Theano. + + In the default implementations of use and update, it is assumed that the 'use' and 'update' methods + visit examples in the input dataset sequentially. In the 'use' method only one pass through the dataset is done, + whereas the sub-learner may wish to iterate over the examples multiple times. Subclasses where this + basic model is not appropriate can simply redefine update or use. + + Sub-classes must provide the following functions and functionalities: + - attributeNames(): defines all the names of attributes which can be used as fields or + attributes in input/output datasets or in stats collectors. 
+ All these attributes are expected to be theano.Result objects + (with a .data property and recognized by theano.Function for compilation). + The sub-class constructor defines the relations between + the Theano variables that may be used by 'use' and 'update' + or by a stats collector. + - defaultOutputFields(input_fields): return a list of default dataset output fields when + None are provided by the caller of use. + - + + """ + diff -r 1e2bb5bad636 -r 3499918faa9d linear_regression.py --- a/linear_regression.py Sun May 04 15:09:22 2008 -0400 +++ b/linear_regression.py Mon May 05 09:35:30 2008 -0400 @@ -96,10 +96,10 @@ self.output = t.dot(self.input,self.W.T) + self.b # (n_examples , n_outputs) matrix self.squared_error = t.sum_within_rows(t.sqr(self.output-self.target)) # (n_examples ) vector - def attribute_names(self): + def attributeNames(self): return ["lambda","b","W","regularization_term","XtX","XtY"] - def default_output_fields(self, input_fields): + def defaultOutputFields(self, input_fields): output_fields = ["output"] if "target" in input_fields: output_fields.append("squared_error") @@ -107,23 +107,37 @@ # poutine generale basee sur ces fonctions - def minibatchwise_use_functions(self, input_fields, output_fields): + def minibatchwise_use_functions(self, input_fields, output_fields, stats_collector): if not output_fields: - output_fields = self.default_output_fields(input_fields) + output_fields = self.defaultOutputFields(input_fields) + if stats_collector: + stats_collector_inputs = stats_collector.inputUpdateAttributes() + for attribute in stats_collector_inputs: + if attribute not in input_fields: + output_fields.append(attribute) key = (input_fields,output_fields) - if key not in use_functions_dictionary: - use_functions_dictionary[key]=Function(self.names2attributes(input_fields), + if key not in self.use_functions_dictionary: + self.use_functions_dictionary[key]=Function(self.names2attributes(input_fields), self.names2attributes(output_fields)) 
- return use_functions_dictionary[key] + return self.use_functions_dictionary[key] - def names2attributes(self,names,return_Result=True): + def attributes(self,return_copy=False): + return self.names2attributes(self.attributeNames()) + + def names2attributes(self,names,return_Result=False, return_copy=False): if return_Result: - return [self.__getattr__(name) for name in names] + if return_copy: + return [copy.deepcopy(self.__getattr__(name)) for name in names] + else: + return [self.__getattr__(name) for name in names] else: - return [self.__getattr__(name).data for name in names] + if return_copy: + return [copy.deepcopy(self.__getattr__(name).data) for name in names] + else: + return [self.__getattr__(name).data for name in names] def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True): - minibatchwise_use_function = use_functions(input_dataset.fieldNames(),output_fieldnames) + minibatchwise_use_function = minibatchwise_use_functions(input_dataset.fieldNames(),output_fieldnames,test_stats_collector) virtual_output_dataset = ApplyFunctionDataSet(input_dataset, minibatchwise_use_function, True,DataSet.numpy_vstack, @@ -133,17 +147,23 @@ if copy_inputs: output_dataset = input_dataset | output_dataset # compute the attributes that should be copied in the dataset - for attribute in self.attribute_names(): - # .data assumes that all attributes are Result objects - output_dataset.__setattr__(attribute) = copy.deepcopy(self.__getattr__(attribute).data) + output_dataset.setAttributes(self.attributeNames(),self.attributes(return_copy=True)) if test_stats_collector: test_stats_collector.update(output_dataset) - for attribute in test_stats_collector.attribute_names(): + for attribute in test_stats_collector.attributeNames(): output_dataset[attribute] = copy.deepcopy(test_stats_collector[attribute]) return output_dataset def update(self,training_set,train_stats_collector=None): - + self.update_start() + for minibatch in 
training_set.minibatches(self.training_set_input_fields, minibatch_size=self.minibatch_size): + self.update_minibatch(minibatch) + if train_stats_collector: + minibatch_set = minibatch.examples() + minibatch_set.setAttributes(self.attributeNames(),self.attributes()) + train_stats_collector.update(minibatch_set) + self.update_end() + return self.use def __init__(self,lambda=0.,max_memory_use=500): """