# HG changeset patch
# User bengioy@bengiomac.local
# Date 1207576119 14400
# Node ID 266c68cb6136800d347e6c9ce8ae0890b1898d77
# Parent 57f4015e2e094b0e0329dc9e2cc7a658056d9179
Minor edits, plus adding an untested ApplyFunctionDataSet, for the GradientLearner in the works.

diff -r 57f4015e2e09 -r 266c68cb6136 dataset.py
--- a/dataset.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/dataset.py	Mon Apr 07 09:48:39 2008 -0400
@@ -10,12 +10,13 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feed on-line learning.
+    feeds on-line learning.
 
     To iterate over examples, there are several possibilities:
-    - for i in dataset.zip(field1, field2,field3, ...)
-    - for i in dataset.minibatches(N, field1, field2, ...)
-    - for i in dataset
+    - for example in dataset.zip([field1, field2, field3, ...])
+    - for val1, val2, val3 in dataset.zip([field1, field2, field3])
+    - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N)
+    - for example in dataset
     Each of these is documented below.
 
     Note: For a dataset of fixed and known length, which can implement item
@@ -36,19 +37,19 @@
 
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self.  Every field of "i" will give access to
-        a the field of a single example.  Fields should be accessible via
-        i[identifier], but the derived class is free to accept any type of
-        identifier, and add extra functionality to the iterator.
+        a field of a single example.  Fields should be accessible via
+        i["fieldname"] or i[3] (in fieldNames() order), but the derived class is free
+        to accept any type of identifier, and to add extra functionality to the iterator.
         """
-        raise AbstractFunction()
+        return self.zip(*self.fieldNames())
 
     def zip(self, *fieldnames):
         """
         Supports two forms of syntax:
 
-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.zip([f1, f2, f3]): ...
 
-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...
 
         Using the first syntax, "i" will be an indexable object, such as a list,
         tuple, or Example instance, such that on every iteration, i[0] is the f1
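# Usage sketch of the three iteration styles documented above. This is illustrative
# only: `dataset` stands for any DataSet subclass instance, and the field names
# 'input' and 'target' are hypothetical.
for example in dataset:                          # iterate over complete examples
    x = example["input"]                         # fields accessible by name ...
    y = example[1]                               # ... or by fieldNames() position
for x, y in dataset.zip(["input", "target"]):    # unpack a chosen subset of fields
    pass
for minibatch in dataset.minibatches(["input", "target"], minibatch_size=16):
    inputs, targets = minibatch                  # each entry holds minibatch_size values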
@@ -120,7 +121,27 @@
     # This list may not be finite; what would make sense in the use you have
     # in mind?
     # -JB
-        """Return the list of field names in the examples of this dataset."""
+    # James-
+    # You are right. I had put this here to be able to iterate over the fields,
+    # but maybe an iterator mechanism (over fields rather than examples)
+    # would be more appropriate. Field names are needed in general
+    # by the iterators over examples or minibatches, to construct
+    # examples or minibatches with the corresponding names as attributes.
+    # -YB
+        """
+        Return an iterator (an object with an __iter__ method) that
+        iterates over the names of the fields. As a special case,
+        a list or a tuple of field names can be returned.
+        """
+        # Note that some datasets
+        # may have virtual fields and support a virtually infinite number
+        # of possible field names. In that case, fieldNames() should
+        # either raise an error or iterate over a particular set of
+        # names as appropriate. Another option would be to iterate
+        # over the sub-datasets comprising a single field at a time.
+        # I am not sure yet what is most appropriate.
+        # -YB
         raise AbstractFunction()
 
     def rename(*new_field_specifications):
@@ -129,6 +150,9 @@
     # Wouldn't this functionality be easier to provide via a
     # RenamingDataSet, such as the one I've written below?
     # -JB
+    # You are right. Whichever implementation we choose, however, we need a generic way to
+    # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics.
+    # -YB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned dataset).
         The minimal syntax that should be supported is the following:
@@ -140,6 +164,24 @@
         """
         raise AbstractFunction()
 
+
+    def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        """
+        Return a dataset that contains as fields the results of applying
+        the given function (example-wise) to the specified input_fields. The
+        function should return a sequence whose elements will be stored in
+        fields whose names are given in the output_fields list. If copy_inputs
+        is True then the resulting dataset will also contain the fields of self.
+        If accept_minibatches is True, then the function may be called
+        with minibatches as arguments (what is returned by the minibatches
+        iterator). In any case, the computations may be delayed until the examples
+        of the resulting dataset are requested. If cache is True, then
+        once the output fields for some examples have been computed, they
+        are cached (to avoid recomputation if the same examples are
+        requested again).
+        """
+        return ApplyFunctionDataSet(self, function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+
 
 class RenamingDataSet(DataSet):
     """A DataSet that wraps another one, and makes it look like the field names
     are different
@@ -287,9 +329,9 @@
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
-    from DataSet (and the ability to view multiple field values as an 'Example').
+    from DataSet (and the ability to view the values of multiple fields as an 'Example').
     It is a fixed-length and fixed-width dataset
-    in which each element is a numpy array or a number, hence the whole
+    in which each element is a fixed-dimension numpy array or a number, hence the whole
     dataset corresponds to a numpy array.  Fields must correspond to a slice of
     array columns.  If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
@@ -382,9 +424,6 @@
         # and coherent with the data array
         assert fieldslice.start >= 0 and fieldslice.stop <= cols
 
-    def __iter__(self):
-        return self.zip(*self.fieldNames())
-
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
@@ -446,7 +485,8 @@
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
     def __array__(self):
-        """Return an view of this dataset which is an numpy.ndarray
+        """Return a view of this dataset which is a numpy.ndarray (i.e. losing
+        the identity and name of fields within the dataset).
 
         Numpy uses this special function name to retrieve an ndarray view for
         function such as numpy.sum, numpy.dot, numpy.asarray, etc.
@@ -454,6 +494,9 @@
         If this dataset has no fields, then we simply return self.data,
         otherwise things are complicated.
         - why do we want this behaviour when there are fields? (JB)
+        - for convenience and completeness (but maybe it would make
+          more sense to implement this through a 'field-merging'
+          dataset). (YB)
         """
         if not self.fields:
             return self.data
@@ -497,4 +540,75 @@
             c+=slice_width
         return result
 
+class ApplyFunctionDataSet(DataSet):
+    """
+    A dataset that contains as fields the results of applying
+    a given function (example-wise) to specified input_fields of a source
+    dataset. The function should return a sequence whose elements will be stored in
+    fields whose names are given in the output_fields list. If copy_inputs
+    is True then the resulting dataset will also contain the fields of the source
+    dataset. If accept_minibatches is True, then the function expects
+    minibatches as arguments (what is returned by the minibatches
+    iterator). In any case, the computations may be delayed until the examples
+    of self are requested. If cache is True, then
+    once the output fields for some examples have been computed, they
+    are cached (to avoid recomputation if the same examples are requested again).
+    """
+    def __init__(self, src, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        DataSet.__init__(self)
+        self.src=src
+        self.function=function
+        self.input_fields=input_fields
+        self.output_fields=output_fields
+        self.copy_inputs=copy_inputs
+        self.accept_minibatches=accept_minibatches
+        src_fieldnames = src.fieldNames()
+        if copy_inputs:
+            for src_field in src_fieldnames:
+                assert src_field not in output_fields
+            self.fieldnames=src_fieldnames+output_fields
+        else:
+            self.fieldnames=output_fields
+        for input_field in input_fields:
+            assert input_field in src_fieldnames
+        self.cache=cache
+        if cache:
+            # maybe a fixed-size array kind of structure would be more efficient than a list
+            # in the case where src is a FiniteDataSet. -YB
+            self.cached_examples = []
+
+    def fieldNames(self): return self.fieldnames
+
+    def minibatches(self,
+                    fieldnames = DataSet.minibatches_fieldnames,
+                    minibatch_size = DataSet.minibatches_minibatch_size,
+                    n_batches = DataSet.minibatches_n_batches):
+
+        class Iterator(LookupList):
+
+            def __init__(self,dataset):
+                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                self.dataset=dataset
+                if dataset.copy_inputs:
+                    src_fields=dataset.src.fieldNames()
+                else:
+                    src_fields=dataset.input_fields
+                self.src_iterator=dataset.src.minibatches(src_fields,minibatch_size,n_batches)
+
+            def __iter__(self):
+                return self
+
+            def next(self):
+                src_examples = self.src_iterator.next()
+                if self.dataset.copy_inputs:
+                    function_inputs = src_examples
+                else:
+                    function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields]
+                return self.dataset.function(*function_inputs)
+
+        for fieldname in fieldnames:
+            assert fieldname in self.fieldnames
+        return Iterator(self)
+
+
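# Usage sketch for the apply_function/ApplyFunctionDataSet mechanism added above.
# Everything here is illustrative: `dataset` stands for any DataSet with a
# (hypothetical) numeric field 'x', and the function is an arbitrary example.
def square(x):
    # must return a sequence, with one element per output field
    return (x * x,)

# Lazily adds an 'x_squared' field computed from 'x'; with copy_inputs=True the
# original fields remain visible in the returned (virtual) dataset.
squared = dataset.apply_function(square, ['x'], ['x_squared'],
                                 copy_inputs=True, accept_minibatches=True, cache=True)
for x, x2 in squared.zip(['x', 'x_squared']):
    pass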
diff -r 57f4015e2e09 -r 266c68cb6136 gradient_learner.py
--- a/gradient_learner.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/gradient_learner.py	Mon Apr 07 09:48:39 2008 -0400
@@ -3,7 +3,6 @@
 from tensor import *
 import gradient
 from compile import Function
-from gradient_based_optimizer import *
 
 class GradientLearner(Learner):
     """
@@ -13,38 +12,49 @@
     The user provides a Theano formula that maps the fields of a training example
     and parameters to output fields (for the use function), one of which must be a cost
     that is the training criterion to be minimized. Subclasses implement
-    a training strategy that uses the function to compute gradients and
+    a training strategy that uses the Theano formula to compute gradients and
     to compute outputs in the update method.
     The inputs, parameters, and outputs are lists of Theano tensors,
     while the example_wise_cost and regularization_term are Theano tensors.
     The user can specify a regularization coefficient that multiplies the regularization term.
     The training algorithm looks for parameters that minimize
-       regularization_coefficienet * regularization_term(parameters) +
+       regularization_coefficient * regularization_term(parameters) +
       sum_{inputs in training_set} example_wise_cost(inputs,parameters)
     i.e. the regularization_term should not depend on the inputs, only on the parameters.
     The learned function can map a subset of inputs to a subset of outputs
     (as long as the inputs subset includes all the inputs required in the Theano expression
     for the selected outputs).
-    It is assumed that all the inputs are provided in the training set, but
-    not necessarily when using the learned function.
+    It is assumed that all the inputs are provided in the training set (as dataset fields
+    with the corresponding name), but not necessarily when using the learned function.
     """
     def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term,
-                 gradient_based_optimizer=StochasticGradientDescent(), regularization_coefficient = astensor(1.0)):
+                 regularization_coefficient = astensor(1.0)):
         self.inputs = inputs
         self.outputs = outputs
         self.parameters = parameters
         self.example_wise_cost = example_wise_cost
         self.regularization_term = regularization_term
-        self.gradient_based_optimizer = gradient_based_optimizer
         self.regularization_coefficient = regularization_coefficient
         self.parameters_example_wise_gradient = gradient.grad(example_wise_cost, parameters)
-        self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization, parameters)
+        self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization_term, parameters)
         if example_wise_cost not in outputs:
            outputs.append(example_wise_cost)
         if regularization_term not in outputs:
            outputs.append(regularization_term)
         self.example_wise_gradient_fn = Function(inputs + parameters, 
                        [self.parameters_example_wise_gradient + self.parameters_regularization_gradient])
-        self.use_functions = {frozenset([input.name for input in inputs]) : Function(inputs, outputs)}
+        self.use_functions = {frozenset([input.name for input in inputs]+[output.name for output in outputs])
+                              : Function(inputs, outputs)}
 
-    def update(self,training_set):
-        
+    def use(self,input_dataset,output_fields=None,copy_inputs=True):
+        # obtain the function that maps the desired inputs to the desired outputs
+        input_fields = input_dataset.fieldNames()
+        if output_fields is None:
+            output_fields = [output.name for output in self.outputs]
+        # handle special case of inputs that are directly copied into outputs
+        use_function_key = frozenset(list(input_fields)+list(output_fields))
+        if not self.use_functions.has_key(use_function_key):
+            # compile a function for this particular (inputs, outputs) combination
+            inputs = [input for input in self.inputs if input.name in input_fields]
+            outputs = [output for output in self.outputs if output.name in output_fields]
+            self.use_functions[use_function_key]=Function(inputs,outputs)
+        use_function = self.use_functions[use_function_key]
+        # return a virtual dataset that computes the outputs on demand
+        # (accept_minibatches was left undecided, as '???', in the original; True is assumed)
+        return input_dataset.apply_function(use_function,input_fields,output_fields,
+                                            copy_inputs,accept_minibatches=True)
+
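# Sketch of the intended train/use flow (datasets and field names are illustrative):
# calling the learner trains it from scratch, and use() then maps an input DataSet to a
# virtual DataSet whose output fields are computed on demand through apply_function.
learner(training_set)                 # forget() then update() on the training data
predictions = learner.use(test_set, output_fields=['output'], copy_inputs=True)
for inp, out in predictions.zip(['input', 'output']):
    pass    # e.g. compare out with a target field, accumulate statistics, etc.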
diff -r 57f4015e2e09 -r 266c68cb6136 learner.py
--- a/learner.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/learner.py	Mon Apr 07 09:48:39 2008 -0400
@@ -36,20 +36,21 @@
         return self.use
 
     # default behavior is 'non-adaptive', i.e. update does not do anything
 
-    def __call__(self,training_set):
+    def __call__(self,training_set,train_stats_collector=None):
         """
         Train a learner from scratch using the provided training set,
         and return the learned function.
         """
         self.forget()
-        return self.update(learning_task)
+        return self.update(training_set,train_stats_collector)
 
-
-    def use(self,input_dataset,output_fields=None):
+    def use(self,input_dataset,output_fields=None,copy_inputs=True):
         """Once a Learner has been trained by one or more call to 'update', it can be used
         with one or more calls to 'use'. The argument is a DataSet (possibly
         containing a single example) and the result is a DataSet of the same length.
         If output_fields is specified, it may be use to indicate which fields should
         be constructed in the output DataSet (for example ['output','classification_error']).
+        Optionally, if copy_inputs is True, the input fields (of the input_dataset) are also
+        made visible in the output DataSet returned by this function.
         """
         raise NotImplementedError

diff -r 57f4015e2e09 -r 266c68cb6136 lookup_list.py
--- a/lookup_list.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/lookup_list.py	Mon Apr 07 09:48:39 2008 -0400
@@ -55,16 +55,6 @@
         except KeyError, e:
             raise AttributeError(name)
 
-    if 0:
-        # This makes subclassing horrible, just call append_keyval if it's
-        # really what you want to do.
-        # -JB
-        def __setattr__(self,name,value):
-            if name in self._name2index:
-                self._values[self._name2index[name]]=value
-            else:
-                raise AttributeError(name)
-
     def append_keyval(self, key, value):
         self._name2index[key]=len(self)
         self._values.append(value)
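# Sketch of the LookupList behaviour referred to in the hunk above. The
# LookupList(names, values) constructor form is the one used by the Iterator in
# dataset.py; attribute assignment is deliberately unsupported, and append_keyval
# is the way to add a new named value.
ll = LookupList(['x', 'y'], [1, 2])
print ll.x                    # read access goes through __getattr__
ll.append_keyval('z', 3)      # adds a new name/value pair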