diff dataset.py @ 268:3f1cd8897fda
reverting dataset
author    James Bergstra <bergstrj@iro.umontreal.ca>
date      Wed, 04 Jun 2008 18:48:50 -0400
parents   6e69fb91f3c0
children  fdce496c3b56
--- a/dataset.py	Wed Jun 04 17:49:28 2008 -0400
+++ b/dataset.py	Wed Jun 04 18:48:50 2008 -0400
@@ -109,6 +109,10 @@
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
+     - dataset[fieldname] an iterable over the values of the field fieldname across
+     the dataset (the iterable is obtained by default by calling valuesVStack
+     over the values for individual examples).
+
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
@@ -158,14 +162,10 @@
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
-    """
-    if 0:
-        # removed by James June 4... these aren't used anywhere according to
-        # grep
-        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)

     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,11 +277,9 @@
                 # first get the beginning of our minibatch (top of dataset)
                 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-
-                blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
-                        for name in self.fieldnames]
-                print type(self.dataset), blah
-                minibatch = Example(self.fieldnames,blah)
+                minibatch = Example(self.fieldnames,
+                                    [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                     for name in self.fieldnames])
         self.next_row=upper
         self.n_batches_done+=1
         if upper >= self.L and self.n_batches:
@@ -462,16 +460,12 @@
                                              for fieldname,field_values in zip(self.fieldNames(),fields_values)]),
                                     self.valuesVStack,self.valuesHStack)
-
-        raise TypeError(i)
-        if 0:
-            # else check for a fieldname
-            #after talk with Yoshua June 4, this is disabled.
-            if self.hasFields(i):
-                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-            # else we are trying to access a property of the dataset
-            assert i in self.__dict__ # else it means we are trying to access a non-existing property
-            return self.__dict__[i]
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]

     def valuesHStack(self,fieldnames,fieldvalues):
         """
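The lookup order restored above (integer, slice and example-list indexing first, then field names, then dataset properties) can be illustrated with a small self-contained sketch. TinyDataSet and its fields below are hypothetical stand-ins for illustration, not pylearn code:

import numpy

class TinyDataSet(object):
    """Hypothetical stand-in with two fields stored as numpy arrays."""
    def __init__(self, x, y):
        self._fields = {'input': x, 'target': y}
        self.description = 'toy dataset'
    def hasFields(self, *names):
        return all(n in self._fields for n in names)
    def __getitem__(self, i):
        if type(i) in (int, slice):
            # integer or slice: return the example(s)
            return dict((name, v[i]) for name, v in self._fields.items())
        if self.hasFields(i):
            # field name: every value of that field
            return self._fields[i]
        # otherwise it must be a dataset property
        assert i in self.__dict__
        return self.__dict__[i]

d = TinyDataSet(numpy.arange(6).reshape(3, 2), numpy.array([0, 1, 0]))
print d[0]               # one example, as a {fieldname: value} mapping
print d['input']         # all values of the 'input' field
print d['description']   # falls through to the dataset property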
@@ -497,20 +491,21 @@

     def valuesVStack(self,fieldname,values):
         """
-        @param fieldname: the name of the field from which the values were taken
-        @type fieldname: any type
-
-        @param values: bits near the beginning or end of the dataset
-        @type values: list of minibatches (returned by minibatch_nowrap)
-
-        @return: the concatenation (stacking) of the values
-        @rtype: something suitable as a minibatch field
-
+        Return a value that corresponds to concatenating (vertically) several values of the
+        same field. This can be important to build a minibatch out of individual examples. This
+        is likely to involve a copy of the original values. When the values are numpy arrays, the
+        result should be numpy.vstack(values).
+        The default is to use numpy.vstack for numpy.ndarray values, and a list
+        pointing to the original values for other data types.
         """
-        rval = []
-        for sub_batch in values:
-            rval.extend(sub_batch)
-        return rval
+        all_numpy=True
+        for value in values:
+            if not type(value) is numpy.ndarray:
+                all_numpy=False
+        if all_numpy:
+            return numpy.vstack(values)
+        # the default implementation of vertical stacking is to put values in a list
+        return values

     def __or__(self,other):
         """
@@ -958,29 +953,16 @@
     Virtual super-class of datasets whose field values are numpy array,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        #print len(values)
-        for v in values:
-            if not isinstance(v, numpy.ndarray):
-                raise TypeError(v, type(v))
-
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        dtype = values[0].dtype
-        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
+        return numpy.vstack(values)

 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -1005,7 +987,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                if 0:
+                if 1:
                     #I changed this because it didn't make sense to me,
                     # and it made it more difficult to write my learner.
                     # If it breaks stuff, let's talk about it.
@@ -1037,7 +1019,7 @@
         values=self.fields_columns.values()
         if type(key) is int:
             return Example(fieldnames,
-                           [numpy.asarray(self.data[key,col]) for col in values])
+                           [self.data[key,col] for col in values])
         if type(key) is slice:
             return MinibatchDataSet(Example(fieldnames,
                                             [self.data[key,col] for col in values]))
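The default valuesVStack restored above, together with the numpy-only version in ArrayFieldsDataSet, boils down to the following behaviour. The helper below is a self-contained illustration with a made-up name, not code from the changeset:

import numpy

def default_values_vstack(fieldname, values):
    # numpy arrays are stacked into one array; anything else is returned as a list
    if all(type(v) is numpy.ndarray for v in values):
        return numpy.vstack(values)
    return list(values)

print default_values_vstack('x', [numpy.ones((1, 3)), numpy.zeros((2, 3))]).shape  # (3, 3)
print default_values_vstack('label', ['cat', 'dog'])                               # ['cat', 'dog']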
@@ -1115,207 +1097,198 @@

 class CachedDataSet(DataSet):
-    """
-    Wrap a L{DataSet} whose values are computationally expensive to obtain
-    (e.g. because they involve some computation, or disk access),
-    so that repeated accesses to the same example are done cheaply,
-    by caching every example value that has been accessed at least once.
+    """
+    Wrap a L{DataSet} whose values are computationally expensive to obtain
+    (e.g. because they involve some computation, or disk access),
+    so that repeated accesses to the same example are done cheaply,
+    by caching every example value that has been accessed at least once.

-    Optionally, for finite-length dataset, all the values can be computed
-    (and cached) upon construction of the CachedDataSet, rather at the
-    first access.
+    Optionally, for finite-length dataset, all the values can be computed
+    (and cached) upon construction of the CachedDataSet, rather at the
+    first access.

-    @todo: when cache_all_upon_construction create mini-batches that are as
-    large as possible but not so large as to fill up memory.
-
-    @todo: add disk-buffering capability, so that when the cache becomes too
-    big for memory, we cache things on disk, trying to keep in memory only
-    the record most likely to be accessed next.
-    """
-    def __init__(self,source_dataset,cache_all_upon_construction=False):
-        self.source_dataset=source_dataset
-        self.cache_all_upon_construction=cache_all_upon_construction
-        self.cached_examples = [] #a list of LookupList (copies)
-        if cache_all_upon_construction:
-            # this potentially brings all the source examples
-            # into memory at once, which may be too much
-            # the work could possibly be done by minibatches
-            # that are as large as possible but no more than what memory allows.
-            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-            assert all([len(self)==len(fval) for fval in fields_values])
-            for example in fields_values.examples():
-                dup = copy.copy(example)
-                self.cached_examples.append(dup)
+    @todo: when cache_all_upon_construction create mini-batches that are as
+    large as possible but not so large as to fill up memory.
+
+    @todo: add disk-buffering capability, so that when the cache becomes too
+    big for memory, we cache things on disk, trying to keep in memory only
+    the record most likely to be accessed next.
+    """
+    def __init__(self,source_dataset,cache_all_upon_construction=False):
+        self.source_dataset=source_dataset
+        self.cache_all_upon_construction=cache_all_upon_construction
+        self.cached_examples = []
+        if cache_all_upon_construction:
+            # this potentially brings all the source examples
+            # into memory at once, which may be too much
+            # the work could possibly be done by minibatches
+            # that are as large as possible but no more than what memory allows.
+            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+            assert all([len(self)==len(field_values) for field_values in fields_values])
+            for example in fields_values.examples():
+                self.cached_examples.append(copy.copy(example))

-        self.fieldNames = source_dataset.fieldNames
-        self.hasFields = source_dataset.hasFields
-        self.valuesHStack = source_dataset.valuesHStack
-        self.valuesVStack = source_dataset.valuesVStack
+        self.fieldNames = source_dataset.fieldNames
+        self.hasFields = source_dataset.hasFields
+        self.valuesHStack = source_dataset.valuesHStack
+        self.valuesVStack = source_dataset.valuesVStack

-    def __len__(self):
-        return len(self.source_dataset)
+    def __len__(self):
+        return len(self.source_dataset)

-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class CacheIterator(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.current=offset
-                self.all_fields = self.dataset.fieldNames()==fieldnames
-            def __iter__(self): return self
-            def next(self):
-                upper = self.current+minibatch_size
-                cache_len = len(self.dataset.cached_examples)
-                if upper>cache_len:
-                    # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    for example in self.dataset.source_dataset[cache_len:upper]:
-                        self.dataset.cached_examples.append(example)
-
-                next_range = slice(self.current, self.current+minibatch_size)
-                blah = self.dataset.cached_examples[next_range]
-                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
-                self.current+=minibatch_size
-
-                #little optimization to avoid second Example computation if
-                #possible.
-                if self.all_fields:
-                    return all_fields_minibatch
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class CacheIterator(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.current=offset
+                self.all_fields = self.dataset.fieldNames()==fieldnames
+            def __iter__(self): return self
+            def next(self):
+                upper = self.current+minibatch_size
+                cache_len = len(self.dataset.cached_examples)
+                if upper>cache_len: # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    for example in self.dataset.source_dataset[cache_len:upper]:
+                        self.dataset.cached_examples.append(example)
+                all_fields_minibatch = Example(self.dataset.fieldNames(),
+                                               zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
+                self.current+=minibatch_size
+                if self.all_fields:
+                    return all_fields_minibatch
+                return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+        return CacheIterator(self)

-                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-                return rval
-        return CacheIterator(self)
+    def __getitem__(self,i):
+        if type(i)==int and len(self.cached_examples)>i:
+            return self.cached_examples[i]
+        else:
+            return self.source_dataset[i]
+
+    def __iter__(self):
+        class CacheIteratorIter(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.l = len(dataset)
+                self.current = 0
+                self.fieldnames = self.dataset.fieldNames()
+                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+            def __iter__(self): return self
+            def next(self):
+                if self.current>=self.l:
+                    raise StopIteration
+                cache_len = len(self.dataset.cached_examples)
+                if self.current>=cache_len: # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    self.dataset.cached_examples.append(
+                        self.dataset.source_dataset[self.current])
+                self.example._values = self.dataset.cached_examples[self.current]
+                self.current+=1
+                return self.example

-    def __getitem__(self,i):
-        if type(i)==int and len(self.cached_examples)>i:
-            return self.cached_examples[i]
-        else:
-            return self.source_dataset[i]
-
-    def __iter__(self):
-        class CacheIteratorIter(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.l = len(dataset)
-                self.current = 0
-                self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-            def __iter__(self): return self
-            def next(self):
-                if self.current>=self.l:
-                    raise StopIteration
-                cache_len = len(self.dataset.cached_examples)
-                if self.current>=cache_len: # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    self.dataset.cached_examples.append(
-                        self.dataset.source_dataset[self.current])
-                self.example._values = self.dataset.cached_examples[self.current]
-                self.current+=1
-                return self.example
-
-        return CacheIteratorIter(self)
+        return CacheIteratorIter(self)
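The caching pattern CachedDataSet relies on (examples already seen are served from cached_examples, and only indices past the cached prefix go back to the expensive source) can be sketched in a few self-contained lines. TinyCache is a hypothetical stand-in, not pylearn code:

class TinyCache(object):
    def __init__(self, source):
        self.source = source          # any indexable source of (expensive) examples
        self.cached_examples = []
    def __getitem__(self, i):
        # grow the cached prefix until it covers example i, then serve from the cache
        while len(self.cached_examples) <= i:
            self.cached_examples.append(self.source[len(self.cached_examples)])
        return self.cached_examples[i]

expensive_source = [x * x for x in range(10)]   # stand-in for costly computation
cache = TinyCache(expensive_source)
print cache[3], len(cache.cached_examples)   # 9 4   (examples 0..3 computed and cached)
print cache[2], len(cache.cached_examples)   # 4 4   (served from the cache, no new work)

Per the constructor above, passing cache_all_upon_construction=True instead fills the whole cache up front, which is only reasonable when the source dataset fits in memory.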

 class ApplyFunctionDataSet(DataSet):
-    """
-    A L{DataSet} that contains as fields the results of applying a
-    given function example-wise or minibatch-wise to all the fields of
-    an input dataset. The output of the function should be an iterable
-    (e.g. a list or a LookupList) over the resulting values.
-
-    The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset. The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
+
+    The function take as input the fields of the dataset, not the examples.

-    In minibatch mode, the function is expected to work on minibatches
-    (takes a minibatch in input and returns a minibatch in output). More
-    precisely, it means that each element of the input or output list
-    should be iterable and indexable over the individual example values
-    (typically these elements will be numpy arrays). All of the elements
-    in the input and output lists should have the same length, which is
-    the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.

-    The function is applied each time an example or a minibatch is accessed.
-    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.

-    If the values_{h,v}stack functions are not provided, then
-    the input_dataset.values{H,V}Stack functions are used by default.
-    """
-    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-                 values_hstack=None,values_vstack=None,
-                 description=None,fieldtypes=None):
-        """
-        Constructor takes an input dataset that has as many fields as the function
-        expects as inputs. The resulting dataset has as many fields as the function
-        produces as outputs, and that should correspond to the number of output names
-        (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).

-        Note that the expected semantics of the function differs in minibatch mode
-        (it takes minibatches of inputs and produces minibatches of outputs, as
-        documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).

-        TBM: are filedtypes the old field types (from input_dataset) or the new ones
-        (for the new dataset created)?
-        """
-        self.input_dataset=input_dataset
-        self.function=function
-        self.output_names=output_names
-        self.minibatch_mode=minibatch_mode
-        DataSet.__init__(self,description,fieldtypes)
-        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+        TBM: are filedtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+ """ + self.input_dataset=input_dataset + self.function=function + self.output_names=output_names + self.minibatch_mode=minibatch_mode + DataSet.__init__(self,description,fieldtypes) + self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack + self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - def __len__(self): - return len(self.input_dataset) + def __len__(self): + return len(self.input_dataset) - def fieldNames(self): - return self.output_names + def fieldNames(self): + return self.output_names - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class ApplyFunctionIterator(object): - def __init__(self,output_dataset): - self.input_dataset=output_dataset.input_dataset - self.output_dataset=output_dataset - self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, - n_batches=n_batches,offset=offset).__iter__() + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class ApplyFunctionIterator(object): + def __init__(self,output_dataset): + self.input_dataset=output_dataset.input_dataset + self.output_dataset=output_dataset + self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, + n_batches=n_batches,offset=offset).__iter__() - def __iter__(self): return self + def __iter__(self): return self - def next(self): - function_inputs = self.input_iterator.next() - all_output_names = self.output_dataset.output_names - if self.output_dataset.minibatch_mode: - function_outputs = self.output_dataset.function(*function_inputs) - else: - input_examples = zip(*function_inputs) - output_examples = [self.output_dataset.function(*input_example) - for input_example in input_examples] - function_outputs = [self.output_dataset.valuesVStack(name,values) - for name,values in zip(all_output_names, - zip(*output_examples))] - all_outputs = Example(all_output_names,function_outputs) - if fieldnames==all_output_names: - return all_outputs - return Example(fieldnames,[all_outputs[name] for name in fieldnames]) + def next(self): + function_inputs = self.input_iterator.next() + all_output_names = self.output_dataset.output_names + if self.output_dataset.minibatch_mode: + function_outputs = self.output_dataset.function(*function_inputs) + else: + input_examples = zip(*function_inputs) + output_examples = [self.output_dataset.function(*input_example) + for input_example in input_examples] + function_outputs = [self.output_dataset.valuesVStack(name,values) + for name,values in zip(all_output_names, + zip(*output_examples))] + all_outputs = Example(all_output_names,function_outputs) + if fieldnames==all_output_names: + return all_outputs + return Example(fieldnames,[all_outputs[name] for name in fieldnames]) - return ApplyFunctionIterator(self) + return ApplyFunctionIterator(self) - def __iter__(self): # only implemented for increased efficiency - class ApplyFunctionSingleExampleIterator(object): - def __init__(self,output_dataset): - self.current=0 - self.output_dataset=output_dataset - self.input_iterator=output_dataset.input_dataset.__iter__() - def __iter__(self): return self - def next(self): - if self.output_dataset.minibatch_mode: - function_inputs = [[input] for input in self.input_iterator.next()] - outputs = self.output_dataset.function(*function_inputs) - assert all([hasattr(output,'__iter__') for output in outputs]) - function_outputs = [output[0] for output in outputs] - else: - function_inputs = self.input_iterator.next() - function_outputs = 
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                return Example(self.output_dataset.output_names,function_outputs)
-        return ApplyFunctionSingleExampleIterator(self)
-
+    def __iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
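The two calling conventions that ApplyFunctionDataSet documents (in minibatch mode the function maps whole field minibatches to whole field minibatches; otherwise it maps one example's field values to one example's outputs, returning one value per output field) can be illustrated with a self-contained sketch. The function names below are hypothetical:

import numpy

def normalize_batch(x_batch):
    # minibatch mode: x_batch holds many examples (rows) at once
    return (x_batch - x_batch.mean(axis=0),)

def normalize_example(x_row):
    # example mode: x_row is a single example's field value
    return (x_row / numpy.linalg.norm(x_row),)

x = numpy.array([[1., 2.], [3., 4.]])
print normalize_batch(x)[0]        # whole minibatch in, whole minibatch out
print normalize_example(x[0])[0]   # one example in, one example out

# With a real input DataSet `base` (construction elided), the wrapper would be built
# roughly as follows; the field name is made up for the example:
#   ApplyFunctionDataSet(base, normalize_batch, ['normalized_input'], minibatch_mode=True)
# and, as the docstring suggests, it can be wrapped in a CachedDataSet to avoid recomputation.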