pylearn changeset 31:2d6c49ec5749 ("merged")

author:   bergstrj@iro.umontreal.ca
date:     Fri, 11 Apr 2008 21:42:07 -0400
parents:  bf0145fa73e8, 46c5c90019c2
children: 039c0f249859
diffstat: 3 files changed, 110 insertions(+), 98 deletions(-)
--- a/_test_dataset.py  Fri Apr 11 21:41:09 2008 -0400
+++ b/_test_dataset.py  Fri Apr 11 21:42:07 2008 -0400
@@ -48,16 +48,6 @@
         a_y = a.y
         self.failUnless(numpy.all( a_y == arr[:,1:4]))
 
-    def test_asarray(self):
-        arr = numpy.random.rand(3,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(2,4)})
-        a_arr = numpy.asarray(a)
-        self.failUnless(a_arr.shape[1] == 2 + 2)
-        self.failUnless(numpy.sum(numpy.square(a_arr-a.data))==0)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        a_arr = numpy.asarray(a)
-        self.failUnless(a_arr.shape[1] == 2 + 3)
-
     def test_minibatch_wraparound_even(self):
         arr = numpy.random.rand(10,4)
         arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
@@ -90,6 +80,17 @@
         b=a.rename({'xx':'x','zz':'z'})
         self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y'))
 
+class T_applyfunctiondataset(unittest.TestCase):
+    def setUp(self):
+        numpy.random.seed(123456)
+
+    def test_function(self):
+        n = numpy.random.rand(3,8)
+        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
+        b=a.apply_function(lambda x,y: (x+y,x+1), ['x','y'], ['x+y','x+1'], False,False,False)
+        print b.fieldNames()
+        print b('x+y')
+
 if __name__ == '__main__':
     unittest.main()
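The field specifications in the new test are ordinary Python slices over the columns of the data array, with 'x' and 'y' deliberately overlapping on column 1. A minimal standalone sketch of just that slicing, using plain numpy and none of the pylearn classes (shapes shown are what the fields select):

    import numpy

    numpy.random.seed(123456)
    n = numpy.random.rand(3, 8)      # 3 examples, 8 columns
    x = n[:, slice(2)]               # slice(2) == slice(None, 2, None): columns 0-1
    y = n[:, slice(1, 4)]            # columns 1-3, overlapping 'x' on column 1
    z = n[:, slice(4, 6)]            # columns 4-5
    print x.shape, y.shape, z.shape  # (3, 2) (3, 3) (3, 2)

The ArrayDataSet constructor changes in dataset.py below additionally admit a list of column indices in place of a slice.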
--- a/dataset.py  Fri Apr 11 21:41:09 2008 -0400
+++ b/dataset.py  Fri Apr 11 21:42:07 2008 -0400
@@ -37,8 +37,8 @@
     Datasets of finite length should be sub-classes of FiniteLengthDataSet.
 
-    Datasets whose elements can be indexed and sub-datasets of consecutive
-    examples (i.e. slices) can be extracted from should be sub-classes of
+    Datasets whose elements can be indexed and whose sub-datasets (with a subset
+    of examples) can be extracted should be sub-classes of
     SliceableDataSet.
 
     Datasets with a finite number of fields should be sub-classes of
@@ -150,6 +150,7 @@
         of the iterators).
         """
         raise AbstractFunction()
+
 
     def merge_fields(self,*specifications):
         """
@@ -182,7 +183,7 @@
 
     def rename(self,rename_dict):
         """
-        Return a new dataset that renames fields, using a dictionnary that maps old field
+        Changes the dataset into one that renames fields, using a dictionary that maps old field
         names to new field names. The only fields visible by the returned dataset are those
         whose names are keys of the rename_dict.
         """
@@ -194,9 +195,9 @@
             SelfRenamingDataSet.__init__(self,self,rename_dict)
         return self
 
-    def applyFunction(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+    def apply_function(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
         """
-        Return a dataset that contains as fields the results of applying
+        Changes the dataset into one that contains as fields the results of applying
         the given function (example-wise) to the specified input_fields. The
         function should return a sequence whose elements will be stored in
         fields whose names are given in the output_fields list. If copy_inputs
@@ -209,7 +210,13 @@
         are cached (to avoid recomputation if the same examples are again
         requested).
         """
-        return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+        self_class = self.__class__
+        class SelfApplyFunctionDataSet(ApplyFunctionDataSet,self_class):
+            pass
+        self.__class__ = SelfApplyFunctionDataSet
+        # set the required additional fields
+        ApplyFunctionDataSet.__init__(self,self,function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+        return self
 
 class FiniteLengthDataSet(DataSet):
@@ -223,15 +230,31 @@
 
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise AbstractFunction()
-
+
+    def __call__(self,fieldname_or_fieldnames):
+        """
+        Extract one or more fields. This may be an expensive operation when the
+        dataset is large. It is not the recommended way to access individual values
+        (use the iterators instead). If the argument is a string fieldname, then the result
+        is a sequence (iterable object) of values for that field, for the whole dataset. If the
+        argument is a list of field names, then the result is a 'batch', i.e., an Example with keys
+        corresponding to the given field names and values being iterable objects over the
+        individual example values.
+ """ + if type(fieldname_or_fieldnames) is string: + minibatch = self.minibatches([fieldname_or_fieldnames],len(self)).next() + return minibatch[fieldname_or_fieldnames] + return self.minibatches(fieldname_or_fieldnames,len(self)).next() class SliceableDataSet(DataSet): """ Virtual interface, a subclass of DataSet for datasets which are sliceable and whose individual elements can be accessed, generally respecting the python semantics for [spec], where spec is either a non-negative integer - (for selecting one example), or a python slice (for selecting a sub-dataset - comprising the specified examples). This is useful for obtaining + (for selecting one example), a python slice(start,stop,step) for selecting a regular + sub-dataset comprising examples start,start+step,start+2*step,...,n (with n<stop), or a + sequence (e.g. a list) of integers [i1,i2,...,in] for selecting + an arbitrary subset of examples. This is useful for obtaining sub-datasets, e.g. for splitting a dataset into training and test sets. """ def __init__(self): @@ -250,11 +273,19 @@ return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) def __getitem__(self,i): - """dataset[i] returns the (i+1)-th example of the dataset.""" + """ + dataset[i] returns the (i+1)-th example of the dataset. + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. + """ raise AbstractFunction() def __getslice__(self,*slice_args): - """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" + """ + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + """ raise AbstractFunction() @@ -348,7 +379,8 @@ It is a fixed-length and fixed-width dataset in which each element is a fixed dimension numpy array or a number, hence the whole dataset corresponds to a numpy array. Fields - must correspond to a slice of array columns. If the dataset has fields, + must correspond to a slice of array columns or to a list of column numbers. + If the dataset has fields, each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. Any dataset can also be converted to a numpy array (losing the notion of fields by the numpy.array(dataset) call. @@ -396,7 +428,7 @@ if self.next_count == self.next_max: raise StopIteration - #determine the first and last elements of the slice we'll return + #determine the first and last elements of the minibatch slice we'll return n_rows = self.dataset.data.shape[0] self.current = self.next_index() upper = self.current + self.minibatch_size @@ -423,7 +455,7 @@ There are two ways to construct an ArrayDataSet: (1) from an existing dataset (which may result in a copy of the data in a numpy array), or (2) from a numpy.array (the data argument), along with an optional description - of the fields (a LookupList of column slices indexed by field names). + of the fields (a LookupList of column slices (or column lists) indexed by field names). 
""" self.data=data self.fields=fields @@ -431,17 +463,22 @@ if fields: for fieldname,fieldslice in fields.items(): - # make sure fieldslice.start and fieldslice.step are defined - start=fieldslice.start - step=fieldslice.step - if not start: - start=0 - if not step: - step=1 - if not fieldslice.start or not fieldslice.step: - fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) - # and coherent with the data array - assert fieldslice.start >= 0 and fieldslice.stop <= cols + assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__") + if hasattr(fieldslice,"__iter__"): # is a sequence + for i in fieldslice: + assert type(i) is int + elif isinstance(fieldslice,slice): + # make sure fieldslice.start and fieldslice.step are defined + start=fieldslice.start + step=fieldslice.step + if not start: + start=0 + if not step: + step=1 + if not fieldslice.start or not fieldslice.step: + fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) + # and coherent with the data array + assert fieldslice.start >= 0 and fieldslice.stop <= cols def minibatches(self, fieldnames = DataSet.minibatches_fieldnames, @@ -457,28 +494,6 @@ if n_batches is None: n_batches = len(self) / minibatch_size return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) - def __getattr__(self,fieldname): - """ - Return a numpy array with the content associated with the given field name. - If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension - than the dataset itself) is returned. - """ - if len(self.data)==1: - return self.data[0,self.fields[fieldname]] - return self.data[:,self.fields[fieldname]] - - def __call__(self,*fieldnames): - """Return a sub-dataset containing only the given fieldnames as fields.""" - min_col=self.data.shape[1] - max_col=0 - for field_slice in self.fields.values(): - min_col=min(min_col,field_slice.start) - max_col=max(max_col,field_slice.stop) - new_fields=LookupList() - for fieldname,fieldslice in self.fields.items(): - new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) - return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) - def fieldNames(self): """Return the list of field names that are supported by getattr and hasField.""" return self.fields.keys() @@ -489,8 +504,11 @@ def __getitem__(self,i): """ - dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields - the result is just a numpy array (for the i-th row of the dataset data matrix). + dataset[i] returns the (i+1)-th Example of the dataset. + If there are no fields the result is just a numpy array (for the i-th row of the dataset data matrix). + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. """ if self.fields: fieldnames,fieldslices=zip(*self.fields.items()) @@ -499,36 +517,34 @@ return self.data[i] def __getslice__(self,*args): - """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" + """ + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + """ return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) - def __array__(self): - """Return a view of this dataset which is an numpy.ndarray (i.e. losing - the identity and name of fields within the dataset). 
-
-        Numpy uses this special function name to retrieve an ndarray view for
-        function such as numpy.sum, numpy.dot, numpy.asarray, etc.
-
-        If this dataset has no fields, then we simply return self.data,
-        otherwise things are complicated.
-        - why do we want this behaviour when there are fields? (JB)
-        - for convenience and completeness (but maybe it would make
-          more sense to implement this through a 'field-merging'
-          dataset). (YB)
+    def indices_of_unique_columns_used(self):
+        """
+        Return the unique indices of the columns actually used by the fields, and a boolean
+        that signals (if True) that the used columns overlap. If they do, the
+        indices are not repeated in the result.
         """
-        if not self.fields:
-            return self.data
-        # else, select subsets of columns mapped by the fields
         columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
-        overlapping_fields = False
-        n_columns = 0
+        overlapping_columns = False
         for field_slice in self.fields.values():
-            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
-                n_columns += 1
-                if columns_used[c]: overlapping_fields=True
-                columns_used[c]=True
-        # try to figure out if we can map all the slices into one slice:
-        mappable_to_one_slice = not overlapping_fields
+            if sum(columns_used[field_slice])>0: overlapping_columns=True
+            columns_used[field_slice]=True
+        return [i for i,used in enumerate(columns_used) if used],overlapping_columns
+
+    def slice_of_unique_columns_used(self):
+        """
+        Return None if the indices_of_unique_columns_used do not form a slice. If they do,
+        return that slice. It means that the columns used can be extracted
+        from the data array without making a copy. If the fields overlap
+        but their unique columns used form a slice, still return that slice.
+        """
+        indices,overlapping_columns = self.indices_of_unique_columns_used()
+        # rebuild the boolean mask over columns from the list of used indices
+        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        columns_used[indices] = True
+        mappable_to_one_slice = True
         start=0
         while start<len(columns_used) and not columns_used[start]:
             start+=1
@@ -549,19 +565,9 @@
             else:
                 step = j-i
             i=j
-        if mappable_to_one_slice:
-            return self.data[:,slice(start,stop,step)]
-        # else make contiguous copy (copying the overlapping columns)
-        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-        c=0
-        for field_slice in self.fields.values():
-            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
-            # copy the field here
-            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
-            c+=slice_width
-        return result
-
-class ApplyFunctionDataSet(DataSet):
+        if not mappable_to_one_slice:
+            return None
+        return slice(start,stop,step)
+
+class ApplyFunctionDataSet(FiniteWidthDataSet):
     """
     A dataset that contains as fields the results of applying
     a given function (example-wise) to specified input_fields of a source
@@ -604,6 +610,11 @@
         # in the case where src is FiniteDataSet. -YB
         self.cached_examples = []
 
+    def fieldNames(self):
+        if self.copy_inputs:
+            return self.output_fields + self.src.fieldNames()
+        return self.output_fields
+
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
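Both rename and apply_function now rely on the same in-place reclassing idiom: build, at call time, a class that inherits from both the wrapper behaviour and the instance's current class, then rebind self.__class__. A minimal standalone sketch of the idiom, with illustrative names that are not from the patch:

    class Base(object):
        def __init__(self, data):
            self.data = data

    class FieldMixin(object):
        def init_mixin(self, tag):
            # set the required additional fields, as apply_function does
            self.tag = tag
        def describe(self):
            return "%s: %r" % (self.tag, self.data)

    def extend_in_place(obj, tag):
        # build a class combining the mixin with obj's *current* class, then
        # rebind obj.__class__: the same object gains the mixin's methods
        self_class = obj.__class__
        class Combined(FieldMixin, self_class):
            pass
        obj.__class__ = Combined
        obj.init_mixin(tag)
        return obj

    d = Base([1, 2, 3])
    assert extend_in_place(d, "demo") is d   # mutated in place, not wrapped
    print d.describe()                       # prints: demo: [1, 2, 3]

The design choice is that the dataset is changed rather than wrapped, so every existing reference to the object sees the new fields; the trade-off is that the previous behaviour of returning an independent dataset is lost.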
--- a/gradient_learner.py  Fri Apr 11 21:41:09 2008 -0400
+++ b/gradient_learner.py  Fri Apr 11 21:42:07 2008 -0400
@@ -59,7 +59,7 @@
             self.use_function[use_function_key]=Function(input_variables,output_variables)
         use_function = self.use_functions[use_function_key]
         # return a dataset that computes the outputs
-        return input_dataset.applyFunction(use_function,input_fields,output_fields,copy_inputs,compute_now=True)
+        return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,compute_now=True)
 
 class StochasticGradientDescent(object):
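A note for callers of the renamed method: since apply_function now reclasses the receiver and returns self, the call above mutates input_dataset rather than producing an independent dataset. A one-line sketch of what a caller may now assume (f and raw are hypothetical):

    outputs = raw.apply_function(f, ['x'], ['y'])  # hypothetical function and dataset
    assert outputs is raw                          # the receiver itself was reclassed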