# HG changeset patch
# User James Bergstra
# Date 1212619730 14400
# Node ID 3f1cd8897fdae1439664d36ed1714985ba5cbdee
# Parent  4dad41215967b23dccbb273b341a1e2907488899
reverting dataset

diff -r 4dad41215967 -r 3f1cd8897fda dataset.py
--- a/dataset.py	Wed Jun 04 17:49:28 2008 -0400
+++ b/dataset.py	Wed Jun 04 18:48:50 2008 -0400
@@ -109,6 +109,10 @@
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
+     - dataset[fieldname] returns an iterable over the values of the field fieldname across
+     the dataset (the iterable is obtained by default by calling valuesVStack
+     over the values for individual examples).
+
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
          - 'description': a textual description or name for the dataset
@@ -158,14 +162,10 @@
 
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
-    """
-    if 0:
-        # removed by James June 4... these aren't used anywhere according to
-        # grep
-        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
 
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,11 +277,9 @@
                 # first get the beginning of our minibatch (top of dataset)
                 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-
-                blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
-                        for name in self.fieldnames]
-                print type(self.dataset), blah
-                minibatch = Example(self.fieldnames,blah)
+                minibatch = Example(self.fieldnames,
+                                    [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                     for name in self.fieldnames])
             self.next_row=upper
             self.n_batches_done+=1
             if upper >= self.L and self.n_batches:
@@ -462,16 +460,12 @@
                                     for fieldname,field_values in zip(self.fieldNames(),fields_values)]),
                            self.valuesVStack,self.valuesHStack)
-
-        raise TypeError(i)
-        if 0:
-            # else check for a fieldname
-            #after talk with Yoshua June 4, this is disabled.
-            if self.hasFields(i):
-                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-            # else we are trying to access a property of the dataset
-            assert i in self.__dict__ # else it means we are trying to access a non-existing property
-            return self.__dict__[i]
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -497,20 +491,21 @@
 
     def valuesVStack(self,fieldname,values):
         """
-        @param fieldname: the name of the field from which the values were taken
-        @type fieldname: any type
-
-        @param values: bits near the beginning or end of the dataset
-        @type values: list of minibatches (returned by minibatch_nowrap)
-
-        @return: the concatenation (stacking) of the values
-        @rtype: something suitable as a minibatch field
-
+        Return a value that corresponds to concatenating (vertically) several values of the
+        same field.
+        This can be important to build a minibatch out of individual examples. This
+        is likely to involve a copy of the original values. When the values are numpy arrays, the
+        result should be numpy.vstack(values).
+        The default is to use numpy.vstack for numpy.ndarray values, and a list
+        pointing to the original values for other data types.
         """
-        rval = []
-        for sub_batch in values:
-            rval.extend(sub_batch)
-        return rval
+        all_numpy=True
+        for value in values:
+            if not type(value) is numpy.ndarray:
+                all_numpy=False
+        if all_numpy:
+            return numpy.vstack(values)
+        # the default implementation of vertical stacking is to put values in a list
+        return values
 
     def __or__(self,other):
         """
@@ -958,29 +953,16 @@
     Virtual super-class of datasets whose field values are numpy arrays,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        #print len(values)
-        for v in values:
-            if not isinstance(v, numpy.ndarray):
-                raise TypeError(v, type(v))
-
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        dtype = values[0].dtype
-        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
+        return numpy.vstack(values)
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -1005,7 +987,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]

-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class CacheIterator(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.current=offset
-                self.all_fields = self.dataset.fieldNames()==fieldnames
-            def __iter__(self): return self
-            def next(self):
-                upper = self.current+minibatch_size
-                cache_len = len(self.dataset.cached_examples)
-                if upper>cache_len:
-                    # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    for example in self.dataset.source_dataset[cache_len:upper]:
-                        self.dataset.cached_examples.append(example)
-
-                next_range = slice(self.current, self.current+minibatch_size)
-                blah = self.dataset.cached_examples[next_range]
-                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
-                self.current+=minibatch_size
-
-                #little optimization to avoid second Example computation if
-                #possible.
-                if self.all_fields:
-                    return all_fields_minibatch
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class CacheIterator(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.current=offset
+                self.all_fields = self.dataset.fieldNames()==fieldnames
+            def __iter__(self): return self
+            def next(self):
+                upper = self.current+minibatch_size
+                cache_len = len(self.dataset.cached_examples)
+                if upper>cache_len: # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    for example in self.dataset.source_dataset[cache_len:upper]:
+                        self.dataset.cached_examples.append(example)
+                all_fields_minibatch = Example(self.dataset.fieldNames(),
+                                               zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
+                self.current+=minibatch_size
+                if self.all_fields:
+                    return all_fields_minibatch
+                return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+        return CacheIterator(self)
-                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-                return rval
-        return CacheIterator(self)
 
+    def __getitem__(self,i):
+        if type(i)==int and len(self.cached_examples)>i:
+            return self.cached_examples[i]
+        else:
+            return self.source_dataset[i]
+
+    def __iter__(self):
+        class CacheIteratorIter(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.l = len(dataset)
+                self.current = 0
+                self.fieldnames = self.dataset.fieldNames()
+                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+            def __iter__(self): return self
+            def next(self):
+                if self.current>=self.l:
+                    raise StopIteration
+                cache_len = len(self.dataset.cached_examples)
+                if self.current>=cache_len: # the example is not already in the cache
+                    # cache this example before returning it
+                    self.dataset.cached_examples.append(
+                        self.dataset.source_dataset[self.current])
+                self.example._values = self.dataset.cached_examples[self.current]
+                self.current+=1
+                return self.example
-    def __getitem__(self,i):
-        if type(i)==int and len(self.cached_examples)>i:
-            return self.cached_examples[i]
-        else:
-            return self.source_dataset[i]
-
-    def __iter__(self):
-        class CacheIteratorIter(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.l = len(dataset)
-                self.current = 0
-                self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-            def __iter__(self): return self
-            def next(self):
-                if self.current>=self.l:
-                    raise StopIteration
-                cache_len = len(self.dataset.cached_examples)
-                if self.current>=cache_len: # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    self.dataset.cached_examples.append(
-                        self.dataset.source_dataset[self.current])
-                self.example._values = self.dataset.cached_examples[self.current]
-                self.current+=1
-                return self.example
-
-        return CacheIteratorIter(self)
+        return CacheIteratorIter(self)
 
 class ApplyFunctionDataSet(DataSet):
-    """
-    A L{DataSet} that contains as fields the results of applying a
-    given function example-wise or minibatch-wise to all the fields of
-    an input dataset. The output of the function should be an iterable
-    (e.g. a list or a LookupList) over the resulting values.
+
+    The function takes as input the fields of the dataset, not the examples.
-    In minibatch mode, the function is expected to work on minibatches
-    (takes a minibatch in input and returns a minibatch in output). More
-    precisely, it means that each element of the input or output list
-    should be iterable and indexable over the individual example values
-    (typically these elements will be numpy arrays). All of the elements
-    in the input and output lists should have the same length, which is
-    the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
-    The function is applied each time an example or a minibatch is accessed.
-    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
-    If the values_{h,v}stack functions are not provided, then
-    the input_dataset.values{H,V}Stack functions are used by default.
-    """
-    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-                 values_hstack=None,values_vstack=None,
-                 description=None,fieldtypes=None):
-        """
-        Constructor takes an input dataset that has as many fields as the function
-        expects as inputs. The resulting dataset has as many fields as the function
-        produces as outputs, and that should correspond to the number of output names
-        (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).
-        Note that the expected semantics of the function differs in minibatch mode
-        (it takes minibatches of inputs and produces minibatches of outputs, as
-        documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
-        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
-        (for the new dataset created)?
-        """
-        self.input_dataset=input_dataset
-        self.function=function
-        self.output_names=output_names
-        self.minibatch_mode=minibatch_mode
-        DataSet.__init__(self,description,fieldtypes)
-        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+ """ + self.input_dataset=input_dataset + self.function=function + self.output_names=output_names + self.minibatch_mode=minibatch_mode + DataSet.__init__(self,description,fieldtypes) + self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack + self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - def __len__(self): - return len(self.input_dataset) + def __len__(self): + return len(self.input_dataset) - def fieldNames(self): - return self.output_names + def fieldNames(self): + return self.output_names - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class ApplyFunctionIterator(object): - def __init__(self,output_dataset): - self.input_dataset=output_dataset.input_dataset - self.output_dataset=output_dataset - self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, - n_batches=n_batches,offset=offset).__iter__() + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class ApplyFunctionIterator(object): + def __init__(self,output_dataset): + self.input_dataset=output_dataset.input_dataset + self.output_dataset=output_dataset + self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, + n_batches=n_batches,offset=offset).__iter__() - def __iter__(self): return self + def __iter__(self): return self - def next(self): - function_inputs = self.input_iterator.next() - all_output_names = self.output_dataset.output_names - if self.output_dataset.minibatch_mode: - function_outputs = self.output_dataset.function(*function_inputs) - else: - input_examples = zip(*function_inputs) - output_examples = [self.output_dataset.function(*input_example) - for input_example in input_examples] - function_outputs = [self.output_dataset.valuesVStack(name,values) - for name,values in zip(all_output_names, - zip(*output_examples))] - all_outputs = Example(all_output_names,function_outputs) - if fieldnames==all_output_names: - return all_outputs - return Example(fieldnames,[all_outputs[name] for name in fieldnames]) + def next(self): + function_inputs = self.input_iterator.next() + all_output_names = self.output_dataset.output_names + if self.output_dataset.minibatch_mode: + function_outputs = self.output_dataset.function(*function_inputs) + else: + input_examples = zip(*function_inputs) + output_examples = [self.output_dataset.function(*input_example) + for input_example in input_examples] + function_outputs = [self.output_dataset.valuesVStack(name,values) + for name,values in zip(all_output_names, + zip(*output_examples))] + all_outputs = Example(all_output_names,function_outputs) + if fieldnames==all_output_names: + return all_outputs + return Example(fieldnames,[all_outputs[name] for name in fieldnames]) - return ApplyFunctionIterator(self) + return ApplyFunctionIterator(self) - def __iter__(self): # only implemented for increased efficiency - class ApplyFunctionSingleExampleIterator(object): - def __init__(self,output_dataset): - self.current=0 - self.output_dataset=output_dataset - self.input_iterator=output_dataset.input_dataset.__iter__() - def __iter__(self): return self - def next(self): - if self.output_dataset.minibatch_mode: - function_inputs = [[input] for input in self.input_iterator.next()] - outputs = self.output_dataset.function(*function_inputs) - assert all([hasattr(output,'__iter__') for output in outputs]) - function_outputs = [output[0] for output in outputs] - else: - function_inputs = self.input_iterator.next() - function_outputs = 
-                return Example(self.output_dataset.output_names,function_outputs)
-        return ApplyFunctionSingleExampleIterator(self)
-
+    def __iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
diff -r 4dad41215967 -r 3f1cd8897fda test_dataset.py
--- a/test_dataset.py	Wed Jun 04 17:49:28 2008 -0400
+++ b/test_dataset.py	Wed Jun 04 18:48:50 2008 -0400
@@ -421,7 +421,7 @@
 
     test_all(a2,ds)
 
-    del a2, ds #removes from list of active objects in debugger
+    del a2, ds
 
 def test_LookupList():
     #test only the example in the doc???
@@ -544,8 +544,6 @@
     f_array_iter(array)
 
     f_ds_index(ds)
-    f_ds_index(ds)
-    f_ds_iter(ds)
     f_ds_iter(ds)
 
     f_ds_mb1(ds,10)
@@ -558,92 +556,8 @@
 
     f_ds_mb2(ds,10000)
 
-
-
-
-
-#****************************************************************
-# dummy tests, less powerful than the previous tests, but can work with any new weird dataset.
-# Basically, emphasis is put on consistency, but it never checks the actual values.
-# To be used as a checklist, or a first test, when creating a new dataset
-
-def dummytest_all(ds) :
-    """ Launches all the dummytests with a given dataset. """
""" - - dummytest1_basicstats(ds) - dummytest2_slicing(ds) - dummytest3_fields_iterator_consistency(ds) - - -def dummytest1_basicstats(ds) : - """print basics stats on a dataset, like length""" - - print 'len(ds) = ',len(ds) - print 'num fields = ', len(ds.fieldNames()) - print 'types of field: ', - for k in ds.fieldNames() : - print type(ds[0](k)[0]), - print '' - -def dummytest2_slicing(ds) : - """test if slicing seems to works properly""" - print 'testing slicing...', - sys.stdout.flush() - - middle = len(ds) / 2 - tenpercent = int(len(ds) * .1) - set1 = ds[:middle+tenpercent] - set2 = ds[middle-tenpercent:] - for k in range(tenpercent + tenpercent -1): - for k2 in ds.fieldNames() : - if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray : - for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) : - assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3] - else : - assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0] - assert tenpercent > 1 - set3 = ds[middle-tenpercent:middle+tenpercent:2] - for k2 in ds.fieldNames() : - if type(set2[2](k2)[0]) == N.ndarray : - for k3 in range(len(set2[2](k2)[0])) : - assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3] - else : - assert set2[2](k2)[0] == set3[1](k2)[0] - - print 'done' - - -def dummytest3_fields_iterator_consistency(ds) : - """test if the number of iterator corresponds to the number of fields, also do it for minibatches""" - print 'testing fields/iterator consistency...', - sys.stdout.flush() - - # basic test - maxsize = min(len(ds)-1,100) - for iter in ds[:maxsize] : - assert len(iter) == len(ds.fieldNames()) - if len(ds.fieldNames()) == 1 : - print 'done' - return - - # with minibatches iterator - ds2 = ds[:maxsize].minibatches([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2) - for iter in ds2 : - assert len(iter) == 2 - - print 'done' - - - - - - - - - if __name__=='__main__': - if 0: - test1() + test1() test_LookupList() test_ArrayDataSet() test_CachedDataSet()