comparison dataset.py @ 290:9b533cc7874a

trying to get default implementations to work
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 05 Jun 2008 18:38:42 -0400
parents ed70580f2324
children 174374d59405
276:271a16d42072 290:9b533cc7874a
1 1
2 from lookup_list import LookupList 2 from lookup_list import LookupList as Example
3 Example = LookupList
4 from misc import unique_elements_list_intersection 3 from misc import unique_elements_list_intersection
5 from string import join 4 from string import join
6 from sys import maxint 5 from sys import maxint
7 import numpy, copy 6 import numpy, copy
8 7
35 attribute_names = self.attributeNames() 34 attribute_names = self.attributeNames()
36 if return_copy: 35 if return_copy:
37 return [copy.copy(self.__getattribute__(name)) for name in attribute_names] 36 return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
38 else: 37 else:
39 return [self.__getattribute__(name) for name in attribute_names] 38 return [self.__getattribute__(name) for name in attribute_names]
40
41 39
42 class DataSet(AttributesHolder): 40 class DataSet(AttributesHolder):
43 """A virtual base class for datasets. 41 """A virtual base class for datasets.
44 42
45 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction 43 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
232 self.fieldnames=fieldnames 230 self.fieldnames=fieldnames
233 self.minibatch_size=minibatch_size 231 self.minibatch_size=minibatch_size
234 self.n_batches=n_batches 232 self.n_batches=n_batches
235 self.n_batches_done=0 233 self.n_batches_done=0
236 self.next_row=offset 234 self.next_row=offset
237 self.offset=offset
238 self.L=len(dataset) 235 self.L=len(dataset)
239 assert offset+minibatch_size<=self.L 236 self.offset=offset % self.L
240 ds_nbatches = (self.L-self.next_row)/self.minibatch_size 237 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
241 if n_batches is not None: 238 if n_batches is not None:
242 ds_nbatches = min(n_batches,ds_nbatches) 239 ds_nbatches = min(n_batches,ds_nbatches)
243 if fieldnames: 240 if fieldnames:
244 assert dataset.hasFields(*fieldnames) 241 assert dataset.hasFields(*fieldnames)
245 else: 242 else:
246 self.fieldnames=dataset.fieldNames() 243 self.fieldnames=dataset.fieldNames()
247 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, 244 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
248 ds_nbatches,self.next_row)
249 245
250 def __iter__(self): 246 def __iter__(self):
251 return self 247 return self
252 248
253 def next_index(self): 249 def next_index(self):
316 Using the first syntax, all the fields will be returned in "i". 312 Using the first syntax, all the fields will be returned in "i".
317 Using the third syntax, i1, i2, i3 will be list-like containers of the 313 Using the third syntax, i1, i2, i3 will be list-like containers of the
318 f1, f2, and f3 fields of a batch of examples on each loop iteration. 314 f1, f2, and f3 fields of a batch of examples on each loop iteration.
319 315
320 The minibatches iterator is expected to return upon each call to next() 316 The minibatches iterator is expected to return upon each call to next()
321 a DataSetFields object, which is a LookupList (indexed by the field names) whose 317 a DataSetFields object, which is an Example (indexed by the field names) whose
322 elements are iterable and indexable over the minibatch examples, and which keeps a pointer to 318 elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
323 a sub-dataset that can be used to iterate over the individual examples 319 a sub-dataset that can be used to iterate over the individual examples
324 in the minibatch. Hence a minibatch can be converted back to a regular 320 in the minibatch. Hence a minibatch can be converted back to a regular
325 dataset or its fields can be looked at individually (and possibly iterated over). 321 dataset or its fields can be looked at individually (and possibly iterated over).
326 322
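
For concreteness, a minimal sketch of the iteration styles described above; `my_dataset` and the field names f1, f2, f3 are hypothetical:

    # first syntax: every field of the dataset comes back in one object
    for i in my_dataset.minibatches(minibatch_size=10):
        print i['f1']                  # fields are looked up by name
    # third syntax: named fields unpack directly into loop variables
    for i1, i2, i3 in my_dataset.minibatches(['f1', 'f2', 'f3'], minibatch_size=10):
        print len(i1)                  # each holds one minibatch of values
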
422 assert i in self.__dict__ # else it means we are trying to access a non-existing property 418 assert i in self.__dict__ # else it means we are trying to access a non-existing property
423 return self.__dict__[i] 419 return self.__dict__[i]
424 420
425 def __getitem__(self,i): 421 def __getitem__(self,i):
426 """ 422 """
427 dataset[i] returns the (i+1)-th example of the dataset. 423 @rtype: Example
428 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. 424 @returns: single or multiple examples
429 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j). 425
430 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. 426 @type i: integer or slice or <iterable> of integers
431 dataset['key'] returns a property associated with the given 'key' string. 427 @param i:
432 If 'key' is a fieldname, then the VStacked field values (iterable over 428 dataset[i] returns the (i+1)-th example of the dataset.
433 field values) for that field is returned. Other keys may be supported 429 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
434 by different dataset subclasses. The following key names are encouraged: 430 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j).
435 - 'description': a textual description or name for the dataset 431 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
436 - '<fieldname>.type': a type name or value for a given <fieldname> 432
437 433 @note:
438 Note that some stream datasets may be unable to implement random access, i.e. 434 Some stream datasets may be unable to implement random access, i.e.
439 arbitrary slicing/indexing 435 arbitrary slicing/indexing because they can only iterate through
440 because they can only iterate through examples one or a minibatch at a time 436 examples one or a minibatch at a time and do not actually store or keep
441 and do not actually store or keep past (or future) examples. 437 past (or future) examples.
442 438
443 The default implementation of getitem uses the minibatches iterator 439 The default implementation of getitem uses the minibatches iterator
444 to obtain one example, one slice, or a list of examples. It may not 440 to obtain one example, one slice, or a list of examples. It may not
445 always be the most efficient way to obtain the result, especially if 441 always be the most efficient way to obtain the result, especially if
446 the data are actually stored in a memory array. 442 the data are actually stored in a memory array.
447 """ 443 """
448 # check for an index 444
449 if type(i) is int: 445 if type(i) is int:
450 return DataSet.MinibatchToSingleExampleIterator( 446 #TODO: consider asserting that i >= 0
451 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next() 447 i_batch = self.minibatches_nowrap(self.fieldNames(),
452 rows=None 448 minibatch_size=1, n_batches=1, offset=i % len(self))
453 # or a slice 449 return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
450
451 #if i is a contiguous slice
452 if type(i) is slice and (i.step in (None, 1)):
453 offset = 0 if i.start is None else i.start
454 upper_bound = len(self) if i.stop is None else i.stop
455 return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
456 minibatch_size=upper_bound - offset,
457 n_batches=1,
458 offset=offset).next())
459
460 # if slice has a step param, convert it to list and handle it with the
461 # list code
454 if type(i) is slice: 462 if type(i) is slice:
455 #print 'i=',i 463 offset = 0 if i.start is None else i.start
456 if not i.start: i=slice(0,i.stop,i.step) 464 upper_bound = len(self) if i.stop is None else i.stop
457 if not i.stop: i=slice(i.start,len(self),i.step) 465 i = list(range(offset, upper_bound, i.step))
458 if not i.step: i=slice(i.start,i.stop,1) 466
459 if i.step is 1: 467 # handle tuples, arrays, lists
460 return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples() 468 if hasattr(i, '__getitem__'):
461 rows = range(i.start,i.stop,i.step) 469 for idx in i:
462 # or a list of indices 470 #dis-allow nested slices
463 elif type(i) is list: 471 if not isinstance(idx, int):
464 rows = i 472 raise TypeError(idx)
465 if rows is not None: 473 # fetch each requested example via a size-1 minibatch
466 examples = [self[row] for row in rows] 474 examples = [self.minibatches_nowrap(self.fieldNames(),
467 fields_values = zip(*examples) 475 minibatch_size=1, n_batches=1, offset=ii%len(self)).next()
468 return MinibatchDataSet( 476 for ii in i]
469 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) 477 # re-index the fields in each example by field instead of by example
470 for fieldname,field_values 478 field_values = [[] for blah in self.fieldNames()]
471 in zip(self.fieldNames(),fields_values)]), 479 for e in examples:
472 self.valuesVStack,self.valuesHStack) 480 for f,v in zip(field_values, e):
481 f.append(v)
482 #build them into a LookupList (a.k.a. Example)
483 zz = zip(self.fieldNames(),field_values)
484 vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
485 example = Example(self.fieldNames(), vst)
486 return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
473 raise TypeError(i, type(i)) 487 raise TypeError(i, type(i))
474 488
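
A quick sketch of the indexing forms handled above (`ds` is a hypothetical dataset with at least ten examples):

    ex = ds[3]              # int: a single Example (the 4th example)
    sub = ds[2:8]           # contiguous slice: examples 2..7 as a sub-dataset
    strided = ds[2:8:2]     # stepped slice: converted to the list [2, 4, 6]
    picked = ds[[1, 5, 9]]  # explicit list of example indices
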
475 def valuesHStack(self,fieldnames,fieldvalues): 489 def valuesHStack(self,fieldnames,fieldvalues):
476 """ 490 """
477 Return a value that corresponds to concatenating (horizontally) several field values. 491 Return a value that corresponds to concatenating (horizontally) several field values.
491 if all_numpy: 505 if all_numpy:
492 return numpy.hstack(fieldvalues) 506 return numpy.hstack(fieldvalues)
493 # the default implementation of horizontal stacking is to put values in a list 507 # the default implementation of horizontal stacking is to put values in a list
494 return fieldvalues 508 return fieldvalues
495 509
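
A sketch of the two outcomes of the default valuesHStack (values are illustrative):

    import numpy
    a = numpy.ones((5, 2))
    b = numpy.zeros((5, 3))
    numpy.hstack([a, b]).shape   # (5, 5): all-numpy values widen the matrix
    # any non-numpy value makes the method return the plain list of values
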
496
497 def valuesVStack(self,fieldname,values): 510 def valuesVStack(self,fieldname,values):
498 """ 511 """
499 Return a value that corresponds to concatenating (vertically) several values of the 512 @param fieldname: the name of the field from which the values were taken
500 same field. This can be important to build a minibatch out of individual examples. This 513 @type fieldname: any type
501 is likely to involve a copy of the original values. When the values are numpy arrays, the 514
502 result should be numpy.vstack(values). 515 @param values: the values of that field, taken from several examples or minibatches
503 The default is to use numpy.vstack for numpy.ndarray values, and a list 516 @type values: list of minibatches (returned by minibatches_nowrap)
504 pointing to the original values for other data types. 517
505 """ 518 @return: the concatenation (stacking) of the values
506 all_numpy=True 519 @rtype: something suitable as a minibatch field
507 for value in values: 520 """
508 if not type(value) is numpy.ndarray: 521 rval = []
509 all_numpy=False 522 for v in values:
510 if all_numpy: 523 rval.extend(v)
511 return numpy.vstack(values) 524 return rval
512 # the default implementation of vertical stacking is to put values in a list
513 return values
514 525
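
The new default simply flattens the per-minibatch values into one list; a sketch with illustrative values:

    values = [[1, 2], [3, 4, 5]]   # one field's values from two minibatches
    rval = []
    for v in values:
        rval.extend(v)
    assert rval == [1, 2, 3, 4, 5]

Note that, unlike the deleted numpy.vstack branch, extending a list with a 2-D array yields a list of 1-D row arrays rather than a stacked array.
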
515 def __or__(self,other): 526 def __or__(self,other):
516 """ 527 """
517 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of 528 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
518 fields of the argument datasets. This only works if they all have the same length. 529 fields of the argument datasets. This only works if they all have the same length.
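
A hedged usage sketch (ds1 and ds2 are hypothetical equal-length datasets with disjoint field names):

    combined = ds1 | ds2
    # combined.fieldNames() is the concatenation of both field-name lists
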
584 return FieldsSubsetIterator(self) 595 return FieldsSubsetIterator(self)
585 596
586 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): 597 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
587 assert self.hasFields(*fieldnames) 598 assert self.hasFields(*fieldnames)
588 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) 599 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
589 def __getitem__(self,i): 600 def dontuse__getitem__(self,i):
590 return FieldsSubsetDataSet(self.src[i],self.fieldnames) 601 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
591 602
592 603
593 class DataSetFields(LookupList): 604 class DataSetFields(Example):
594 """ 605 """
595 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated 606 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
596 DataSetFields iterates over fields (like columns of a matrix), and can be understood 607 DataSetFields iterates over fields (like columns of a matrix), and can be understood
597 as a transpose of the associated dataset. 608 as a transpose of the associated dataset.
598 609
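
A sketch of the transpose relationship (`ds` is hypothetical):

    fields = DataSetFields(ds, ds.fieldNames())   # column-wise view
    ds_again = fields.examples()                  # back to the row-wise view
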
626 dataset = FieldsSubsetDataSet(dataset,fieldnames) 637 dataset = FieldsSubsetDataSet(dataset,fieldnames)
627 assert dataset.hasFields(*fieldnames) 638 assert dataset.hasFields(*fieldnames)
628 self.dataset=dataset 639 self.dataset=dataset
629 640
630 if isinstance(dataset,MinibatchDataSet): 641 if isinstance(dataset,MinibatchDataSet):
631 LookupList.__init__(self,fieldnames,list(dataset._fields)) 642 Example.__init__(self,fieldnames,list(dataset._fields))
632 elif isinstance(original_dataset,MinibatchDataSet): 643 elif isinstance(original_dataset,MinibatchDataSet):
633 LookupList.__init__(self,fieldnames, 644 Example.__init__(self,fieldnames,
634 [original_dataset._fields[field] 645 [original_dataset._fields[field]
635 for field in fieldnames]) 646 for field in fieldnames])
636 else: 647 else:
637 minibatch_iterator = dataset.minibatches(fieldnames, 648 minibatch_iterator = dataset.minibatches(fieldnames,
638 minibatch_size=len(dataset), 649 minibatch_size=len(dataset),
639 n_batches=1) 650 n_batches=1)
640 minibatch=minibatch_iterator.next() 651 minibatch=minibatch_iterator.next()
641 LookupList.__init__(self,fieldnames,minibatch) 652 Example.__init__(self,fieldnames,minibatch)
642 653
643 def examples(self): 654 def examples(self):
644 return self.dataset 655 return self.dataset
645 656
646 def __or__(self,other): 657 def __or__(self,other):
658 return (self.examples() | other.examples()).fields() 669 return (self.examples() | other.examples()).fields()
659 670
660 671
661 class MinibatchDataSet(DataSet): 672 class MinibatchDataSet(DataSet):
662 """ 673 """
663 Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset. 674 Turn an L{Example} of same-length (iterable) fields into an example-iterable dataset.
664 Each element of the lookup-list should be an iterable and sliceable, all of the same length. 675 Each element of the lookup-list should be an iterable and sliceable, all of the same length.
665 """ 676 """
666 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, 677 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
667 values_hstack=DataSet().valuesHStack): 678 values_hstack=DataSet().valuesHStack):
668 """ 679 """
678 if self.length != len(field) : 689 if self.length != len(field) :
679 print 'self.length = ',self.length 690 print 'self.length = ',self.length
680 print 'len(field) = ', len(field) 691 print 'len(field) = ', len(field)
681 print 'self._fields.keys() = ', self._fields.keys() 692 print 'self._fields.keys() = ', self._fields.keys()
682 print 'field=',field 693 print 'field=',field
694 print 'fields_lookuplist=', fields_lookuplist
683 assert self.length==len(field) 695 assert self.length==len(field)
684 self.values_vstack=values_vstack 696 self.valuesVStack=values_vstack
685 self.values_hstack=values_hstack 697 self.valuesHStack=values_hstack
686 698
687 def __len__(self): 699 def __len__(self):
688 return self.length 700 return self.length
689 701
690 def __getitem__(self,i): 702 def dontuse__getitem__(self,i):
691 if type(i) in (slice,list): 703 if type(i) in (slice,list):
692 return DataSetFields(MinibatchDataSet( 704 return DataSetFields(MinibatchDataSet(
693 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) 705 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
694 if type(i) is int: 706 if type(i) is int:
695 return Example(self._fields.keys(),[field[i] for field in self._fields]) 707 return Example(self._fields.keys(),[field[i] for field in self._fields])
715 if fieldnames is None: fieldnames = ds._fields.keys() 727 if fieldnames is None: fieldnames = ds._fields.keys()
716 self.fieldnames = fieldnames 728 self.fieldnames = fieldnames
717 729
718 self.ds=ds 730 self.ds=ds
719 self.next_example=offset 731 self.next_example=offset
720 assert minibatch_size > 0 732 assert minibatch_size >= 0
721 if offset+minibatch_size > ds.length: 733 if offset+minibatch_size > ds.length:
722 raise NotImplementedError() 734 raise NotImplementedError()
723 def __iter__(self): 735 def __iter__(self):
724 return self 736 return self
725 def next(self): 737 def next(self):
739 return minibatch 751 return minibatch
740 752
741 # tbm: added fieldnames to handle subset of fieldnames 753 # tbm: added fieldnames to handle subset of fieldnames
742 return Iterator(self,fieldnames) 754 return Iterator(self,fieldnames)
743 755
744 def valuesVStack(self,fieldname,fieldvalues):
745 return self.values_vstack(fieldname,fieldvalues)
746
747 def valuesHStack(self,fieldnames,fieldvalues):
748 return self.values_hstack(fieldnames,fieldvalues)
749
750 class HStackedDataSet(DataSet): 756 class HStackedDataSet(DataSet):
751 """ 757 """
752 A L{DataSet} that wraps several datasets and shows a view that includes all their fields, 758 A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
753 i.e. whose list of fields is the concatenation of their lists of fields. 759 i.e. whose list of fields is the concatenation of their lists of fields.
754 760
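
A hedged usage sketch; the constructor is assumed to take the list of datasets to stack (its definition falls outside this hunk):

    hs = HStackedDataSet([inputs_ds, targets_ds])   # hypothetical datasets
    # hs.fieldNames() spans both datasets' fields; lengths must match
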
808 self.iterators=iterators 814 self.iterators=iterators
809 def __iter__(self): 815 def __iter__(self):
810 return self 816 return self
811 def next(self): 817 def next(self):
812 # concatenate all the fields of the minibatches 818 # concatenate all the fields of the minibatches
813 l=LookupList() 819 l=Example()
814 for iter in self.iterators: 820 for iter in self.iterators:
815 l.append_lookuplist(iter.next()) 821 l.append_lookuplist(iter.next())
816 return l 822 return l
817 823
818 assert self.hasFields(*fieldnames) 824 assert self.hasFields(*fieldnames)
832 datasets=self.datasets 838 datasets=self.datasets
833 iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets] 839 iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets]
834 return HStackedIterator(self,iterators) 840 return HStackedIterator(self,iterators)
835 841
836 842
837 def valuesVStack(self,fieldname,fieldvalues): 843 def untested_valuesVStack(self,fieldname,fieldvalues):
838 return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) 844 return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)
839 845
840 def valuesHStack(self,fieldnames,fieldvalues): 846 def untested_valuesHStack(self,fieldnames,fieldvalues):
841 """ 847 """
842 We will use the sub-dataset associated with the first fieldname in the fieldnames list 848 We will use the sub-dataset associated with the first fieldname in the fieldnames list
843 to do the work, hoping that it can cope with the other values (i.e. won't care 849 to do the work, hoping that it can cope with the other values (i.e. won't care
844 about the incompatible fieldnames). Hence this heuristic will always work if 850 about the incompatible fieldnames). Hence this heuristic will always work if
845 all the fieldnames are of the same sub-dataset. 851 all the fieldnames are of the same sub-dataset.
959 Virtual super-class of datasets whose field values are numpy arrays, 965 Virtual super-class of datasets whose field values are numpy arrays,
960 thus defining valuesHStack and valuesVStack for sub-classes. 966 thus defining valuesHStack and valuesVStack for sub-classes.
961 """ 967 """
962 def __init__(self,description=None,field_types=None): 968 def __init__(self,description=None,field_types=None):
963 DataSet.__init__(self,description,field_types) 969 DataSet.__init__(self,description,field_types)
964 def valuesHStack(self,fieldnames,fieldvalues): 970 def untested_valuesHStack(self,fieldnames,fieldvalues):
965 """Concatenate field values horizontally, e.g. two vectors 971 """Concatenate field values horizontally, e.g. two vectors
966 become a longer vector, two matrices become a wider matrix, etc.""" 972 become a longer vector, two matrices become a wider matrix, etc."""
967 return numpy.hstack(fieldvalues) 973 return numpy.hstack(fieldvalues)
968 def valuesVStack(self,fieldname,values): 974 def untested_valuesVStack(self,fieldname,values):
969 """Concatenate field values vertically, e.g. two vectors 975 """Concatenate field values vertically, e.g. two vectors
970 become a two-row matrix, two matrices become a longer matrix, etc.""" 976 become a two-row matrix, two matrices become a longer matrix, etc."""
971 return numpy.vstack(values) 977 return numpy.vstack(values)
972 978
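
The numpy semantics these helpers rely on, shown concretely:

    import numpy
    a = numpy.ones((3, 2))
    numpy.hstack([a, numpy.zeros((3, 4))]).shape   # (3, 6): wider matrix
    numpy.vstack([a, a]).shape                     # (6, 2): longer matrix
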
973 class ArrayDataSet(ArrayFieldsDataSet): 979 class ArrayDataSet(ArrayFieldsDataSet):
1017 return self.fields_columns.keys() 1023 return self.fields_columns.keys()
1018 1024
1019 def __len__(self): 1025 def __len__(self):
1020 return len(self.data) 1026 return len(self.data)
1021 1027
1022 def __getitem__(self,key): 1028 def dontuse__getitem__(self,key):
1023 """More efficient implementation than the default __getitem__""" 1029 """More efficient implementation than the default __getitem__"""
1024 fieldnames=self.fields_columns.keys() 1030 fieldnames=self.fields_columns.keys()
1025 values=self.fields_columns.values() 1031 values=self.fields_columns.values()
1026 if type(key) is int: 1032 if type(key) is int:
1027 return Example(fieldnames, 1033 return Example(fieldnames,
1049 return self.data[:,self.fields_columns[key]] 1055 return self.data[:,self.fields_columns[key]]
1050 # else we are trying to access a property of the dataset 1056 # else we are trying to access a property of the dataset
1051 assert key in self.__dict__ # else it means we are trying to access a non-existing property 1057 assert key in self.__dict__ # else it means we are trying to access a non-existing property
1052 return self.__dict__[key] 1058 return self.__dict__[key]
1053 1059
1054 def __iter__(self): 1060 def dontuse__iter__(self):
1055 class ArrayDataSetIteratorIter(object): 1061 class ArrayDataSetIteratorIter(object):
1056 def __init__(self,dataset,fieldnames): 1062 def __init__(self,dataset,fieldnames):
1057 if fieldnames is None: fieldnames = dataset.fieldNames() 1063 if fieldnames is None: fieldnames = dataset.fieldNames()
1058 # store the resulting minibatch in a lookup-list of values 1064 # store the resulting minibatch in a lookup-list of values
1059 self.minibatch = LookupList(fieldnames,[0]*len(fieldnames)) 1065 self.minibatch = Example(fieldnames,[0]*len(fieldnames))
1060 self.dataset=dataset 1066 self.dataset=dataset
1061 self.current=0 1067 self.current=0
1062 self.columns = [self.dataset.fields_columns[f] 1068 self.columns = [self.dataset.fields_columns[f]
1063 for f in self.minibatch._names] 1069 for f in self.minibatch._names]
1064 self.l = self.dataset.data.shape[0] 1070 self.l = self.dataset.data.shape[0]
1076 return self.minibatch 1082 return self.minibatch
1077 1083
1078 return ArrayDataSetIteratorIter(self,self.fieldNames()) 1084 return ArrayDataSetIteratorIter(self,self.fieldNames())
1079 1085
1080 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): 1086 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1081 class ArrayDataSetIterator(object): 1087 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
1082 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): 1088 cursor = Example(fieldnames,[0]*len(fieldnames))
1083 if fieldnames is None: fieldnames = dataset.fieldNames() 1089 for n in xrange(n_batches):
1084 # store the resulting minibatch in a lookup-list of values 1090 if offset == len(self):
1085 self.minibatch = LookupList(fieldnames,[0]*len(fieldnames)) 1091 break
1086 self.dataset=dataset 1092 sub_data = self.data[offset : offset+minibatch_size]
1087 self.minibatch_size=minibatch_size 1093 offset += len(sub_data) #can be less than minibatch_size at end
1088 assert offset>=0 and offset<len(dataset.data) 1094 cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
1089 assert offset+minibatch_size<=len(dataset.data) 1095 yield cursor
1090 self.current=offset 1096
1091 def __iter__(self): 1097 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
1092 return self
1093 def next(self):
1094 #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
1095 sub_data = self.dataset.data[self.current:self.current+self.minibatch_size]
1096 self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
1097 self.current+=self.minibatch_size
1098 return self.minibatch
1099
1100 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
1101 1098
1102 1099
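
Driving the generator form of minibatches_nowrap directly might look like this (`ads` and `train_step` are hypothetical). Note that the same `cursor` Example is re-yielded on every iteration, so callers must copy any values they want to keep:

    for mb in ads.minibatches_nowrap(['x', 'y'], minibatch_size=8,
                                     n_batches=4, offset=0):
        train_step(mb['x'], mb['y'])
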
1103 class CachedDataSet(DataSet): 1100 class CachedDataSet(DataSet):
1104 """ 1101 """
1105 Wrap a L{DataSet} whose values are computationally expensive to obtain 1102 Wrap a L{DataSet} whose values are computationally expensive to obtain
1160 if self.all_fields: 1157 if self.all_fields:
1161 return all_fields_minibatch 1158 return all_fields_minibatch
1162 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames]) 1159 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
1163 return CacheIterator(self) 1160 return CacheIterator(self)
1164 1161
1165 def __getitem__(self,i): 1162 def dontuse__getitem__(self,i):
1166 if type(i)==int and len(self.cached_examples)>i: 1163 if type(i)==int and len(self.cached_examples)>i:
1167 return self.cached_examples[i] 1164 return self.cached_examples[i]
1168 else: 1165 else:
1169 return self.source_dataset[i] 1166 return self.source_dataset[i]
1170 1167
1173 def __init__(self,dataset): 1170 def __init__(self,dataset):
1174 self.dataset=dataset 1171 self.dataset=dataset
1175 self.l = len(dataset) 1172 self.l = len(dataset)
1176 self.current = 0 1173 self.current = 0
1177 self.fieldnames = self.dataset.fieldNames() 1174 self.fieldnames = self.dataset.fieldNames()
1178 self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames)) 1175 self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
1179 def __iter__(self): return self 1176 def __iter__(self): return self
1180 def next(self): 1177 def next(self):
1181 if self.current>=self.l: 1178 if self.current>=self.l:
1182 raise StopIteration 1179 raise StopIteration
1183 cache_len = len(self.dataset.cached_examples) 1180 cache_len = len(self.dataset.cached_examples)
1190 return self.example 1187 return self.example
1191 1188
1192 return CacheIteratorIter(self) 1189 return CacheIteratorIter(self)
1193 1190
1194 class ApplyFunctionDataSet(DataSet): 1191 class ApplyFunctionDataSet(DataSet):
1195 """ 1192 """
1196 A L{DataSet} that contains as fields the results of applying a 1193 A L{DataSet} that contains as fields the results of applying a
1197 given function example-wise or minibatch-wise to all the fields of 1194 given function example-wise or minibatch-wise to all the fields of
1198 an input dataset. The output of the function should be an iterable 1195 an input dataset. The output of the function should be an iterable
1199 (e.g. a list or a LookupList) over the resulting values. 1196 (e.g. a list or an Example) over the resulting values.
1200 1197
1201 The function takes as input the fields of the dataset, not the examples. 1198 The function takes as input the fields of the dataset, not the examples.
1202 1199
1203 In minibatch mode, the function is expected to work on minibatches 1200 In minibatch mode, the function is expected to work on minibatches
1204 (takes a minibatch in input and returns a minibatch in output). More 1201 (takes a minibatch in input and returns a minibatch in output). More
1205 precisely, it means that each element of the input or output list 1202 precisely, it means that each element of the input or output list
1206 should be iterable and indexable over the individual example values 1203 should be iterable and indexable over the individual example values
1207 (typically these elements will be numpy arrays). All of the elements 1204 (typically these elements will be numpy arrays). All of the elements
1208 in the input and output lists should have the same length, which is 1205 in the input and output lists should have the same length, which is
1209 the length of the minibatch. 1206 the length of the minibatch.
1210 1207
1211 The function is applied each time an example or a minibatch is accessed. 1208 The function is applied each time an example or a minibatch is accessed.
1212 To avoid re-doing computation, wrap this dataset inside a CachedDataSet. 1209 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
1213 1210
1214 If the values_{h,v}stack functions are not provided, then 1211 If the values_{h,v}stack functions are not provided, then
1215 the input_dataset.values{H,V}Stack functions are used by default. 1212 the input_dataset.values{H,V}Stack functions are used by default.
1216 """ 1213 """
1217 def __init__(self,input_dataset,function,output_names,minibatch_mode=True, 1214 def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
1218 values_hstack=None,values_vstack=None, 1215 values_hstack=None,values_vstack=None,
1219 description=None,fieldtypes=None): 1216 description=None,fieldtypes=None):
1220 """ 1217 """
1221 Constructor takes an input dataset that has as many fields as the function 1218 Constructor takes an input dataset that has as many fields as the function
1222 expects as inputs. The resulting dataset has as many fields as the function 1219 expects as inputs. The resulting dataset has as many fields as the function
1223 produces as outputs, which should correspond to the number of output names 1220 produces as outputs, which should correspond to the number of output names
1224 (provided in a list). 1221 (provided in a list).
1225 1222
1226 Note that the expected semantics of the function differs in minibatch mode 1223 Note that the expected semantics of the function differs in minibatch mode
1227 (it takes minibatches of inputs and produces minibatches of outputs, as 1224 (it takes minibatches of inputs and produces minibatches of outputs, as
1228 documented in the class comment). 1225 documented in the class comment).
1229 1226
1230 TBM: are fieldtypes the old field types (from input_dataset) or the new ones 1227 TBM: are fieldtypes the old field types (from input_dataset) or the new ones
1231 (for the new dataset created)? 1228 (for the new dataset created)?
1232 """ 1229 """
1233 self.input_dataset=input_dataset 1230 self.input_dataset=input_dataset
1234 self.function=function 1231 self.function=function
1235 self.output_names=output_names 1232 self.output_names=output_names
1236 self.minibatch_mode=minibatch_mode 1233 self.minibatch_mode=minibatch_mode
1237 DataSet.__init__(self,description,fieldtypes) 1234 DataSet.__init__(self,description,fieldtypes)
1238 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack 1235 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
1239 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack 1236 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
1240 1237
1241 def __len__(self): 1238 def __len__(self):
1242 return len(self.input_dataset) 1239 return len(self.input_dataset)
1243 1240
1244 def fieldNames(self): 1241 def fieldNames(self):
1245 return self.output_names 1242 return self.output_names
1246 1243
1247 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): 1244 def minibatches_nowrap(self, fieldnames, *args, **kwargs):
1248 class ApplyFunctionIterator(object): 1245 for fields in self.input_dataset.minibatches_nowrap(fieldnames, *args, **kwargs):
1249 def __init__(self,output_dataset): 1246
1250 self.input_dataset=output_dataset.input_dataset 1247 #function_inputs = self.input_iterator.next()
1251 self.output_dataset=output_dataset 1248 if self.minibatch_mode:
1252 self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, 1249 function_outputs = self.function(*fields)
1253 n_batches=n_batches,offset=offset).__iter__() 1250 else:
1254 1251 input_examples = zip(*fields)
1255 def __iter__(self): return self 1252 output_examples = [self.function(*input_example)
1256 1253 for input_example in input_examples]
1257 def next(self): 1254 function_outputs = [self.valuesVStack(name,values)
1258 function_inputs = self.input_iterator.next() 1255 for name,values in zip(self.output_names,
1259 all_output_names = self.output_dataset.output_names 1256 zip(*output_examples))]
1260 if self.output_dataset.minibatch_mode: 1257 all_outputs = Example(self.output_names, function_outputs)
1261 function_outputs = self.output_dataset.function(*function_inputs) 1258 print fields
1262 else: 1259 print all_outputs
1263 input_examples = zip(*function_inputs) 1260 print '--------'
1264 output_examples = [self.output_dataset.function(*input_example) 1261 if fieldnames==self.output_names:
1265 for input_example in input_examples] 1262 yield all_outputs
1266 function_outputs = [self.output_dataset.valuesVStack(name,values) 1263 else:
1267 for name,values in zip(all_output_names, 1264 yield Example(fieldnames,[all_outputs[name] for name in fieldnames])
1268 zip(*output_examples))] 1265
1269 all_outputs = Example(all_output_names,function_outputs) 1266 def untested__iter__(self): # only implemented for increased efficiency
1270 if fieldnames==all_output_names: 1267 class ApplyFunctionSingleExampleIterator(object):
1271 return all_outputs 1268 def __init__(self,output_dataset):
1272 return Example(fieldnames,[all_outputs[name] for name in fieldnames]) 1269 self.current=0
1273 1270 self.output_dataset=output_dataset
1274 1271 self.input_iterator=output_dataset.input_dataset.__iter__()
1275 return ApplyFunctionIterator(self) 1272 def __iter__(self): return self
1276 1273 def next(self):
1277 def __iter__(self): # only implemented for increased efficiency 1274 if self.output_dataset.minibatch_mode:
1278 class ApplyFunctionSingleExampleIterator(object): 1275 function_inputs = [[input] for input in self.input_iterator.next()]
1279 def __init__(self,output_dataset): 1276 outputs = self.output_dataset.function(*function_inputs)
1280 self.current=0 1277 assert all([hasattr(output,'__iter__') for output in outputs])
1281 self.output_dataset=output_dataset 1278 function_outputs = [output[0] for output in outputs]
1282 self.input_iterator=output_dataset.input_dataset.__iter__() 1279 else:
1283 def __iter__(self): return self 1280 function_inputs = self.input_iterator.next()
1284 def next(self): 1281 function_outputs = self.output_dataset.function(*function_inputs)
1285 if self.output_dataset.minibatch_mode: 1282 return Example(self.output_dataset.output_names,function_outputs)
1286 function_inputs = [[input] for input in self.input_iterator.next()] 1283 return ApplyFunctionSingleExampleIterator(self)
1287 outputs = self.output_dataset.function(*function_inputs) 1284
1288 assert all([hasattr(output,'__iter__') for output in outputs])
1289 function_outputs = [output[0] for output in outputs]
1290 else:
1291 function_inputs = self.input_iterator.next()
1292 function_outputs = self.output_dataset.function(*function_inputs)
1293 return Example(self.output_dataset.output_names,function_outputs)
1294 return ApplyFunctionSingleExampleIterator(self)
1295
1296 1285
1297 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): 1286 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
1298 """ 1287 """
1299 Wraps an arbitrary L{DataSet} into one for supervised learning tasks 1288 Wraps an arbitrary L{DataSet} into one for supervised learning tasks
1300 by forcing the user to define a set of fields as the 'input' field 1289 by forcing the user to define a set of fields as the 'input' field