# HG changeset patch # User bengioy@grenat.iro.umontreal.ca # Date 1207926969 14400 # Node ID e6c550cb2896af72e4993ce0e8dfcf5272af0074 # Parent 672fe4b23032b376b61887785e3c0e25155d4d12# Parent b63e8c0bf21b753baabe90808cae390588ec5cff Merging? what? diff -r b63e8c0bf21b -r e6c550cb2896 _test_dataset.py --- a/_test_dataset.py Thu Apr 10 20:53:44 2008 -0400 +++ b/_test_dataset.py Fri Apr 11 11:16:09 2008 -0400 @@ -78,6 +78,19 @@ for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)): self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2])) + +class T_renamingdataset(unittest.TestCase): + def setUp(self): + numpy.random.seed(123456) + + + def test_hasfield(self): + n = numpy.random.rand(3,8) + a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)}) + b=a.rename({'xx':'x','zz':'z'}) + self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y')) + + if __name__ == '__main__': unittest.main() diff -r b63e8c0bf21b -r e6c550cb2896 dataset.py --- a/dataset.py Thu Apr 10 20:53:44 2008 -0400 +++ b/dataset.py Fri Apr 11 11:16:09 2008 -0400 @@ -1,6 +1,7 @@ from lookup_list import LookupList Example = LookupList +import copy class AbstractFunction (Exception): """Derived class must override this function""" @@ -142,7 +143,7 @@ """ raise AbstractFunction() - def hasFields(*fieldnames): + def hasFields(self,*fieldnames): """ Return true if the given field name (or field names, if multiple arguments are given) is recognized by the DataSet (i.e. can be used as a field name in one @@ -150,7 +151,7 @@ """ raise AbstractFunction() - def merge_fields(*specifications): + def merge_fields(self,*specifications): """ Return a new dataset that maps old fields (of self) to new fields (of the returned dataset). The minimal syntax that should be supported is the following: @@ -162,7 +163,7 @@ """ raise AbstractFunction() - def merge_field_values(*field_value_pairs) + def merge_field_values(self,*field_value_pairs): """ Return the value that corresponds to merging the values of several fields, given as arguments (field_name, field_value) pairs with self.hasField(field_name). @@ -172,22 +173,28 @@ fieldnames,fieldvalues = zip(*field_value_pairs) raise ValueError("Unable to merge values of these fields:"+repr(fieldnames)) - def examples2minibatch(examples): + def examples2minibatch(self,examples): """ Combine a list of Examples into a minibatch. A minibatch is an Example whose fields are iterable over the examples of the minibatch. """ raise AbstractFunction() - def rename(rename_dict): + def rename(self,rename_dict): """ Return a new dataset that renames fields, using a dictionnary that maps old field names to new field names. The only fields visible by the returned dataset are those whose names are keys of the rename_dict. """ - return RenamingDataSet(self,rename_dict) - - def applyFunction(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): + self_class = self.__class__ + class SelfRenamingDataSet(RenamingDataSet,self_class): + pass + self.__class__ = SelfRenamingDataSet + # set the rename_dict and src fields + SelfRenamingDataSet.__init__(self,self,rename_dict) + return self + + def applyFunction(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): """ Return a dataset that contains as fields the results of applying the given function (example-wise) to the specified input_fields. The @@ -204,25 +211,6 @@ """ return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache) -class RenamingDataSet(DataSet): - """A DataSet that wraps another one, and makes it look like the field names - are different - - Renaming is done by a dictionary that maps new names to the old ones used in - self.src. - """ - def __init__(self, src, rename_dct): - DataSet.__init__(self) - self.src = src - self.rename_dct = copy.copy(rename_dct) - - def minibatches(self, - fieldnames = DataSet.minibatches_fieldnames, - minibatch_size = DataSet.minibatches_minibatch_size, - n_batches = DataSet.minibatches_n_batches): - dct = self.rename_dct - new_fieldnames = [dct.get(f, f) for f in fieldnames] - return self.src.minibatches(new_fieldnames, minibatches_size, n_batches) class FiniteLengthDataSet(DataSet): """ @@ -278,10 +266,11 @@ def __init__(self): DataSet.__init__(self) - def hasFields(*fieldnames): + def hasFields(self,*fields): has_fields=True - for fieldname in fieldnames: - if fieldname not in self.fields.keys(): + fieldnames = self.fieldNames() + for name in fields: + if name not in fieldnames: has_fields=False return has_fields @@ -291,6 +280,30 @@ raise AbstractFunction() +class RenamingDataSet(FiniteWidthDataSet): + """A DataSet that wraps another one, and makes it look like the field names + are different + + Renaming is done by a dictionary that maps new names to the old ones used in + self.src. + """ + def __init__(self, src, rename_dct): + DataSet.__init__(self) + self.src = src + self.rename_dct = copy.copy(rename_dct) + + def fieldNames(self): + return self.rename_dct.keys() + + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + dct = self.rename_dct + new_fieldnames = [dct.get(f, f) for f in fieldnames] + return self.src.minibatches(new_fieldnames, minibatches_size, n_batches) + + # we may want ArrayDataSet defined in another python file import numpy @@ -548,19 +561,6 @@ c+=slice_width return result - def rename(*new_field_specifications): - """ - Return a new dataset that maps old fields (of self) to new fields (of the returned - dataset). The minimal syntax that should be supported is the following: - new_field_specifications = [new_field_spec1, new_field_spec2, ...] - new_field_spec = ([old_field1, old_field2, ...], new_field) - In general both old_field and new_field should be strings, but some datasets may also - support additional indexing schemes within each field (e.g. column slice - of a matrix-like field). - """ - # if all old fields of each spec are - raise NotImplementedError() - class ApplyFunctionDataSet(DataSet): """ A dataset that contains as fields the results of applying @@ -599,7 +599,7 @@ else: # compute a list with one tuple per example, with the function outputs self.cached_examples = [ function(input) for input in src.zip(input_fields) ] - else if cache: + elif cache: # maybe a fixed-size array kind of structure would be more efficient than a list # in the case where src is FiniteDataSet. -YB self.cached_examples = [] diff -r b63e8c0bf21b -r e6c550cb2896 gradient_learner.py --- a/gradient_learner.py Thu Apr 10 20:53:44 2008 -0400 +++ b/gradient_learner.py Fri Apr 11 11:16:09 2008 -0400 @@ -26,7 +26,7 @@ It is assumed that all the inputs are provided in the training set (as dataset fields with the corresponding name), but not necessarily when using the learned function. """ - def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term, + def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), regularization_coefficient = astensor(1.0)): self.inputs = inputs self.outputs = outputs @@ -48,13 +48,24 @@ def use(self,input_dataset,output_fields=None,copy_inputs=True): # obtain the function that maps the desired inputs to desired outputs input_fields = input_dataset.fieldNames() + # map names of input fields to Theano tensors in self.inputs + input_variables = ??? if output_fields is None: output_fields = [output.name for output in outputs] # handle special case of inputs that are directly copied into outputs - + # map names of output fields to Theano tensors in self.outputs + output_variables = ??? use_function_key = input_fields+output_fields if not self.use_functions.has_key(use_function_key): - self.use_function[use_function_key]=Function(input_fields,output_fields) + self.use_function[use_function_key]=Function(input_variables,output_variables) use_function = self.use_functions[use_function_key] # return a dataset that computes the outputs return input_dataset.applyFunction(use_function,input_fields,output_fields,copy_inputs,compute_now=True) + +class StochasticGradientDescent(object): + def update_parameters(self): + +class StochasticGradientLearner(GradientLearner,StochasticGradientDescent): + def __init__(self,inputs, parameters, outputs, example_wise_cost, regularization_term=astensor(0.0), + regularization_coefficient = astensor(1.0),) + def update()