comparison dataset.py @ 268:3f1cd8897fda

reverting dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 04 Jun 2008 18:48:50 -0400
parents 6e69fb91f3c0
children fdce496c3b56
comparing 267:4dad41215967 to 268:3f1cd8897fda
@@ -107,10 +107,14 @@
 
     - dataset[i] returns an Example.
 
     - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
+    - dataset[fieldname] returns an iterable over the values of the field fieldname across
+      the dataset (the iterable is obtained by default by calling valuesVStack
+      over the values for individual examples).
+
     - dataset.<property> returns the value of a property associated with
       the name <property>. The following properties should be supported:
         - 'description': a textual description or name for the dataset
         - 'fieldtypes': a list of types (one per field)
     A DataSet may have other attributes that it makes visible to other objects. These are
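For context, a minimal sketch of the indexing semantics documented above;
'ds' is a hypothetical DataSet with fields 'x' and 'y':

    ex   = ds[3]            # an Example holding the field values of example 3
    sub  = ds[[0,2,5]]      # a dataset restricted to examples 0, 2 and 5
    xs   = ds['x']          # iterable over all values of field 'x',
                            # vstacked across examples by default
    desc = ds.description   # a dataset property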
@@ -156,18 +160,14 @@
     A sub-class should also append attributes to self._attribute_names
     (the default value returned by attributeNames()).
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
-
     """
 
-    if 0:
-        # removed by James June 4... these aren't used anywhere according to
-        # grep
-        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
 
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
             description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
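For reference, the two restored class-level helpers simply delegate to numpy
and ignore the field-name argument; a small sketch of what they compute:

    import numpy
    # vertical stacking: per-example values become the rows of a minibatch
    numpy.vstack([numpy.array([1.,2.]), numpy.array([3.,4.])])  # shape (2,2)
    # horizontal stacking: field values are concatenated side by side
    numpy.hstack([numpy.ones((2,3)), numpy.zeros((2,1))])       # shape (2,4)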
@@ -275,15 +275,13 @@
             else:
                 # we must concatenate (vstack) the bottom and top parts of our minibatch
                 # first get the beginning of our minibatch (top of dataset)
                 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-
-                blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
-                        for name in self.fieldnames]
-                print type(self.dataset), blah
-                minibatch = Example(self.fieldnames,blah)
+                minibatch = Example(self.fieldnames,
+                                    [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                     for name in self.fieldnames])
             self.next_row=upper
             self.n_batches_done+=1
             if upper >= self.L and self.n_batches:
                 self.next_row -= self.L
                 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
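To make the wrap-around case above concrete, a small worked example (the
numbers are hypothetical):

    # dataset length L=5, minibatch_size=4, starting at next_row=3
    L, next_row, minibatch_size = 5, 3, 4
    upper = next_row + minibatch_size    # 7: runs past the end of the dataset
    n_first  = L - next_row              # 2 examples from the end (rows 3,4)
    n_second = upper - L                 # 2 examples from the start (rows 0,1)
    # valuesVStack then stitches first_part and second_part into one
    # minibatch per field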
@@ -460,20 +458,16 @@
         return MinibatchDataSet(
                     Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
                                                 for fieldname,field_values
                                                 in zip(self.fieldNames(),fields_values)]),
                     self.valuesVStack,self.valuesHStack)
-
-        raise TypeError(i)
-        if 0:
-            # else check for a fieldname
-            #after talk with Yoshua June 4, this is disabled.
-            if self.hasFields(i):
-                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-            # else we are trying to access a property of the dataset
-            assert i in self.__dict__ # else it means we are trying to access a non-existing property
-            return self.__dict__[i]
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
         Return a value that corresponds to concatenating (horizontally) several field values.
         This can be useful to merge some fields. The implementation of this operation is likely
@@ -495,24 +489,25 @@
         return fieldvalues
 
 
     def valuesVStack(self,fieldname,values):
         """
-        @param fieldname: the name of the field from which the values were taken
-        @type fieldname: any type
-
-        @param values: bits near the beginning or end of the dataset
-        @type values: list of minibatches (returned by minibatch_nowrap)
-
-        @return: the concatenation (stacking) of the values
-        @rtype: something suitable as a minibatch field
-
-        """
-        rval = []
-        for sub_batch in values:
-            rval.extend(sub_batch)
-        return rval
+        Return a value that corresponds to concatenating (vertically) several values of the
+        same field. This can be important to build a minibatch out of individual examples. This
+        is likely to involve a copy of the original values. When the values are numpy arrays, the
+        result should be numpy.vstack(values).
+        The default is to use numpy.vstack for numpy.ndarray values, and a list
+        pointing to the original values for other data types.
+        """
+        all_numpy=True
+        for value in values:
+            if not type(value) is numpy.ndarray:
+                all_numpy=False
+        if all_numpy:
+            return numpy.vstack(values)
+        # the default implementation of vertical stacking is to put values in a list
+        return values
 
     def __or__(self,other):
         """
         dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
         fields of the argument datasets. This only works if they all have the same length.
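The effect of the new default valuesVStack above, sketched on two kinds of
field values ('ds' is a hypothetical DataSet instance):

    import numpy
    # all values are ndarrays: they get stacked into a single array
    ds.valuesVStack('x', [numpy.ones((1,3)), numpy.zeros((1,3))])  # 2x3 array
    # any non-numpy value present: the list of values is returned unchanged
    ds.valuesVStack('label', ['cat', 'dog'])                       # ['cat', 'dog']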
@@ -956,33 +951,20 @@
 class ArrayFieldsDataSet(DataSet):
     """
     Virtual super-class of datasets whose field values are numpy arrays,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        #print len(values)
-        for v in values:
-            if not isinstance(v, numpy.ndarray):
-                raise TypeError(v, type(v))
-
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        dtype = values[0].dtype
-        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
+        return numpy.vstack(values)
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
     An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
     whose first axis iterates over examples, second axis determines fields.
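A construction sketch, assuming ArrayDataSet takes a 2-D array plus the
fields_columns mapping that the consistency checks in the next hunk operate
on (the field layout here is hypothetical):

    import numpy
    data = numpy.random.random_sample((4,3))
    # columns 0-1 form the 'input' field, column 2 the 'target' field
    ds = ArrayDataSet(data, {'input': slice(0,2), 'target': 2})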
@@ -1003,11 +985,11 @@
 
         # check consistency and complete slices definitions
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                if 0:
+                if 1:
                     #I changed this because it didn't make sense to me,
                     # and it made it more difficult to write my learner.
                     # If it breaks stuff, let's talk about it.
                     # - James 22/05/2008
                     self.fields_columns[fieldname]=[fieldcolumns]
1035 """More efficient implementation than the default __getitem__""" 1017 """More efficient implementation than the default __getitem__"""
1036 fieldnames=self.fields_columns.keys() 1018 fieldnames=self.fields_columns.keys()
1037 values=self.fields_columns.values() 1019 values=self.fields_columns.values()
1038 if type(key) is int: 1020 if type(key) is int:
1039 return Example(fieldnames, 1021 return Example(fieldnames,
1040 [numpy.asarray(self.data[key,col]) for col in values]) 1022 [self.data[key,col] for col in values])
1041 if type(key) is slice: 1023 if type(key) is slice:
1042 return MinibatchDataSet(Example(fieldnames, 1024 return MinibatchDataSet(Example(fieldnames,
1043 [self.data[key,col] for col in values])) 1025 [self.data[key,col] for col in values]))
1044 if type(key) is list: 1026 if type(key) is list:
1045 for i in range(len(key)): 1027 for i in range(len(key)):
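Continuing that sketch, the three key types this __getitem__ handles:

    ex   = ds[0]       # int   -> a single Example
    sub  = ds[1:3]     # slice -> a MinibatchDataSet over those rows
    pick = ds[[0,3]]   # list  -> the selected examples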
@@ -1113,104 +1095,95 @@
 
         return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
 
 
 class CachedDataSet(DataSet):
     """
     Wrap a L{DataSet} whose values are computationally expensive to obtain
     (e.g. because they involve some computation, or disk access),
     so that repeated accesses to the same example are done cheaply,
     by caching every example value that has been accessed at least once.
 
     Optionally, for finite-length datasets, all the values can be computed
     (and cached) upon construction of the CachedDataSet, rather than at the
     first access.
 
     @todo: when cache_all_upon_construction create mini-batches that are as
     large as possible but not so large as to fill up memory.
 
     @todo: add disk-buffering capability, so that when the cache becomes too
     big for memory, we cache things on disk, trying to keep in memory only
     the record most likely to be accessed next.
     """
     def __init__(self,source_dataset,cache_all_upon_construction=False):
         self.source_dataset=source_dataset
         self.cache_all_upon_construction=cache_all_upon_construction
-        self.cached_examples = [] #a list of LookupList (copies)
+        self.cached_examples = []
         if cache_all_upon_construction:
             # this potentially brings all the source examples
             # into memory at once, which may be too much
             # the work could possibly be done by minibatches
             # that are as large as possible but no more than what memory allows.
             fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-            assert all([len(self)==len(fval) for fval in fields_values])
+            assert all([len(self)==len(field_values) for field_values in fields_values])
             for example in fields_values.examples():
-                dup = copy.copy(example)
-                self.cached_examples.append(dup)
+                self.cached_examples.append(copy.copy(example))
 
         self.fieldNames = source_dataset.fieldNames
         self.hasFields = source_dataset.hasFields
         self.valuesHStack = source_dataset.valuesHStack
         self.valuesVStack = source_dataset.valuesVStack
 
     def __len__(self):
         return len(self.source_dataset)
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         class CacheIterator(object):
             def __init__(self,dataset):
                 self.dataset=dataset
                 self.current=offset
                 self.all_fields = self.dataset.fieldNames()==fieldnames
             def __iter__(self): return self
             def next(self):
                 upper = self.current+minibatch_size
                 cache_len = len(self.dataset.cached_examples)
-                if upper>cache_len:
-                    # whole minibatch is not already in cache
+                if upper>cache_len: # whole minibatch is not already in cache
                     # cache everything from current length to upper
                     for example in self.dataset.source_dataset[cache_len:upper]:
                         self.dataset.cached_examples.append(example)
-
-                next_range = slice(self.current, self.current+minibatch_size)
-                blah = self.dataset.cached_examples[next_range]
-                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
+                all_fields_minibatch = Example(self.dataset.fieldNames(),
+                        zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
                 self.current+=minibatch_size
-
-                #little optimization to avoid second Example computation if
-                #possible.
                 if self.all_fields:
                     return all_fields_minibatch
-
-                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-                return rval
+                return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
         return CacheIterator(self)
 
     def __getitem__(self,i):
         if type(i)==int and len(self.cached_examples)>i:
             return self.cached_examples[i]
         else:
             return self.source_dataset[i]
 
     def __iter__(self):
         class CacheIteratorIter(object):
             def __init__(self,dataset):
                 self.dataset=dataset
                 self.l = len(dataset)
                 self.current = 0
                 self.fieldnames = self.dataset.fieldNames()
                 self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
             def __iter__(self): return self
             def next(self):
                 if self.current>=self.l:
                     raise StopIteration
                 cache_len = len(self.dataset.cached_examples)
                 if self.current>=cache_len: # whole minibatch is not already in cache
                     # cache everything from current length to upper
                     self.dataset.cached_examples.append(
                         self.dataset.source_dataset[self.current])
                 self.example._values = self.dataset.cached_examples[self.current]
                 self.current+=1
                 return self.example
 
         return CacheIteratorIter(self)
 
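A usage sketch for CachedDataSet ('expensive_ds' is a hypothetical DataSet
whose examples are costly to compute):

    cached = CachedDataSet(expensive_ds)  # cache filled lazily, on access
    eager  = CachedDataSet(expensive_ds, cache_all_upon_construction=True)
    for example in cached:   # first pass computes and caches each example
        pass
    for example in cached:   # second pass is served from the cache
        pass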
@@ -1217,103 +1190,103 @@
 class ApplyFunctionDataSet(DataSet):
     """
     A L{DataSet} that contains as fields the results of applying a
     given function example-wise or minibatch-wise to all the fields of
     an input dataset. The output of the function should be an iterable
     (e.g. a list or a LookupList) over the resulting values.
 
     The function takes as input the fields of the dataset, not the examples.
 
     In minibatch mode, the function is expected to work on minibatches
     (takes a minibatch in input and returns a minibatch in output). More
     precisely, it means that each element of the input or output list
     should be iterable and indexable over the individual example values
     (typically these elements will be numpy arrays). All of the elements
     in the input and output lists should have the same length, which is
     the length of the minibatch.
 
     The function is applied each time an example or a minibatch is accessed.
     To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
     If the values_{h,v}stack functions are not provided, then
     the input_dataset.values{H,V}Stack functions are used by default.
     """
     def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
                  values_hstack=None,values_vstack=None,
                  description=None,fieldtypes=None):
         """
         Constructor takes an input dataset that has as many fields as the function
         expects as inputs. The resulting dataset has as many fields as the function
         produces as outputs, and these should correspond to the number of output names
         (provided in a list).
 
         Note that the expected semantics of the function differs in minibatch mode
         (it takes minibatches of inputs and produces minibatches of outputs, as
         documented in the class comment).
 
         TBM: are fieldtypes the old field types (from input_dataset) or the new ones
         (for the new dataset created)?
         """
         self.input_dataset=input_dataset
         self.function=function
         self.output_names=output_names
         self.minibatch_mode=minibatch_mode
         DataSet.__init__(self,description,fieldtypes)
         self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
         self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
     def __len__(self):
         return len(self.input_dataset)
 
     def fieldNames(self):
         return self.output_names
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         class ApplyFunctionIterator(object):
             def __init__(self,output_dataset):
                 self.input_dataset=output_dataset.input_dataset
                 self.output_dataset=output_dataset
                 self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
                                                                    n_batches=n_batches,offset=offset).__iter__()
 
             def __iter__(self): return self
 
             def next(self):
                 function_inputs = self.input_iterator.next()
                 all_output_names = self.output_dataset.output_names
                 if self.output_dataset.minibatch_mode:
                     function_outputs = self.output_dataset.function(*function_inputs)
                 else:
                     input_examples = zip(*function_inputs)
                     output_examples = [self.output_dataset.function(*input_example)
                                        for input_example in input_examples]
                     function_outputs = [self.output_dataset.valuesVStack(name,values)
                                         for name,values in zip(all_output_names,
                                                                zip(*output_examples))]
                 all_outputs = Example(all_output_names,function_outputs)
                 if fieldnames==all_output_names:
                     return all_outputs
                 return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
 
         return ApplyFunctionIterator(self)
 
     def __iter__(self): # only implemented for increased efficiency
         class ApplyFunctionSingleExampleIterator(object):
             def __init__(self,output_dataset):
                 self.current=0
                 self.output_dataset=output_dataset
                 self.input_iterator=output_dataset.input_dataset.__iter__()
             def __iter__(self): return self
             def next(self):
                 if self.output_dataset.minibatch_mode:
                     function_inputs = [[input] for input in self.input_iterator.next()]
                     outputs = self.output_dataset.function(*function_inputs)
                     assert all([hasattr(output,'__iter__') for output in outputs])
                     function_outputs = [output[0] for output in outputs]
                 else:
                     function_inputs = self.input_iterator.next()
                     function_outputs = self.output_dataset.function(*function_inputs)
                 return Example(self.output_dataset.output_names,function_outputs)
         return ApplyFunctionSingleExampleIterator(self)
 
 
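A usage sketch for ApplyFunctionDataSet in minibatch mode ('ds' is a
hypothetical dataset with fields 'x' and 'y'); the function receives whole
fields and must return an iterable of output fields:

    f = lambda x, y: (x + y,)   # one output field, computed minibatch-wise
    sums = ApplyFunctionDataSet(ds, f, ['x_plus_y'], minibatch_mode=True)
    sums = CachedDataSet(sums)  # as the docstring advises, cache to avoid
                                # recomputing on repeated access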
@@ -1320,4 +1293,4 @@
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary L{DataSet} into one for supervised learning tasks
     by forcing the user to define a set of fields as the 'input' field