Mercurial > pylearn

comparison pylearn/old_dataset/dataset.py @ 537:b054271b2504
new file structure layout, factories, etc.

author   James Bergstra <bergstrj@iro.umontreal.ca>
date     Wed, 12 Nov 2008 21:57:54 -0500
parents  dataset.py@fb62f0e4bcfe
children (none)
comparing 518:4aa7f74ea93f with 537:b054271b2504

from lookup_list import LookupList as Example
from common.misc import unique_elements_list_intersection
from string import join
from sys import maxint
import numpy, copy

from exceptions import *

class AttributesHolder(object):
    def __init__(self): pass

    def attributeNames(self):
        raise AbstractFunction()

    def setAttributes(self,attribute_names,attribute_values,make_copies=False):
        """
        Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1.
        """
        if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ):
            attribute_values = [attribute_values]
        if make_copies:
            for name,value in zip(attribute_names,attribute_values):
                self.__setattr__(name,copy.deepcopy(value))
        else:
            for name,value in zip(attribute_names,attribute_values):
                self.__setattr__(name,value)

    def getAttributes(self,attribute_names=None, return_copy=False):
        """
        Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes.
        """
        if attribute_names is None:
            attribute_names = self.attributeNames()
        if return_copy:
            return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
        else:
            return [self.__getattribute__(name) for name in attribute_names]

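# Illustrative usage of AttributesHolder (a sketch only, not part of the
# original file; 'holder' stands for any AttributesHolder subclass instance):
#   holder.setAttributes(['description'], 'my dataset')  # a single value is OK for one name
#   holder.getAttributes(['description'])                # -> ['my dataset']
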
class DataSet(AttributesHolder):
    """A virtual base class for datasets.

    A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
    with learning algorithms (for training and testing them): rows/records are called examples, and
    columns/attributes are called fields. The field value for a particular example can be an arbitrary
    python object, which depends on the particular dataset.

    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
    should return sys.maxint).

    A DataSet is a generator of iterators; these iterators can run through the
    examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning (however, as noted below, some operations are not
    feasible or not recommended on streams).

    To iterate over examples, there are several possibilities:
      - for example in dataset:
      - for val1,val2,... in dataset:
      - for example in dataset(field1, field2, field3, ...):
      - for val1,val2,val3 in dataset(field1, field2, field3):
      - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N):
      - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
    Each of these is documented below. All of these iterators are expected
    to provide, in addition to the usual 'next()' method, a 'next_index()' method
    which returns a non-negative integer pointing to the position of the next
    example that will be returned by 'next()' (or of the first example in the
    next minibatch returned). This is important because these iterators
    can wrap around the dataset in order to do multiple passes through it,
    in possibly irregular ways if the minibatch size is not a divisor of the
    dataset length.

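    For instance, with a dataset that has fields 'input' and 'target'
    (the names are illustrative only)::

        for input, target in dataset('input', 'target'):
            ...
        for minibatch in dataset.minibatches(['input','target'], minibatch_size=10):
            inputs = minibatch['input']   # a container of 10 input values
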
    To iterate over fields, one can do
      - for field in dataset.fields():
            for field_value in field: # iterate over the values associated to that field for all the dataset examples
      - for field in dataset(field1,field2,...).fields() to select a subset of fields
      - for field in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
      - for field_examples in dataset.fields():
            for example_value in field_examples:
                ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a L{DataSetFields} object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method::

        dataset2 = dataset1.fields().examples()

    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type. Field values can also be 'missing'
    (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
    fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value;
    for non-numeric values, None plays that role.

    Dataset elements can be indexed and sub-datasets (with a subset
    of examples) can be extracted. These operations are not supported
    by default in the case of streams.

      - dataset[:n] returns an Example with the n first examples.

      - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...,i2-s.

      - dataset[i] returns an Example.

      - dataset[[i1,i2,...,in]] returns an Example with examples i1,i2,...,in.

    A similar command gives you a DataSet instead of Examples:

      - dataset.subset[:n] returns a DataSet with the n first examples.

      - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...,i2-s.

      - dataset.subset[i] returns a DataSet.

      - dataset.subset[[i1,i2,...,in]] returns a DataSet with examples i1,i2,...,in.

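    For instance (a sketch only, 'ds' being any finite dataset)::

        example = ds[3]           # an Example (LookupList) of field values
        first10 = ds.subset[:10]  # a DataSet holding the first 10 examples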

      - dataset.<property> returns the value of a property associated with
        the name <property>. The following properties should be supported:
          - 'description': a textual description or name for the dataset
          - 'fieldtypes': a list of types (one per field)

    A DataSet may have other attributes that it makes visible to other objects. These are
    used to store information that is not example-wise but global to the dataset.
    The list of names of these attributes is given by the attributeNames() method.

    Datasets can be concatenated either vertically (increasing the length) or
    horizontally (augmenting the set of fields), if they are compatible, using
    the following operations (with the same basic semantics as numpy.hstack
    and numpy.vstack):

      - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])

        creates a new dataset whose list of fields is the concatenation of the lists of
        fields of the argument datasets. This only works if they all have the same length.

      - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])

        creates a new dataset that concatenates the examples from the argument datasets
        (and whose length is the sum of the lengths of the argument datasets). This only
        works if they all have the same fields.

    According to the same logic, and viewing a DataSetFields object associated to
    a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
    a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
    examples.

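    For instance (a sketch only, ds1 and ds2 being compatible datasets)::

        wider  = ds1 | ds2   # same length, union of the fields
        longer = ds1 & ds2   # same fields, examples of ds1 then of ds2
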
    A dataset can hold arbitrary key-value pairs that may be used to access meta-data
    or other properties of the dataset or associated with the dataset or the result
    of a computation stored in a dataset. These can be accessed through the [key] syntax
    when key is a string (or more specifically, neither an integer, a slice, nor a list).

    A DataSet sub-class should always redefine the following methods:
      - __len__ if it is not a stream
      - fieldNames
      - minibatches_nowrap (called by DataSet.minibatches())
    For efficiency of implementation, a sub-class might also want to redefine
      - valuesHStack
      - valuesVStack
      - hasFields
      - __getitem__ may not be feasible with some streams
      - __iter__
    A sub-class should also append attributes to self._attribute_names
    (the default value returned by attributeNames()).
    By convention, attributes not in attributeNames() should have a name
    starting with an underscore.
    @todo enforce/test that convention!
    """

    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)

    def __init__(self, description=None, fieldnames=None, fieldtypes=None):
        """
        @type fieldnames: list of strings
        @type fieldtypes: list of python types, same length as fieldnames
        @type description: string
        @param description: description/name for this dataset
        """
        def default_desc():
            return type(self).__name__ \
                    + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"

        #self.fieldnames = fieldnames

        self.fieldtypes = fieldtypes if fieldtypes is not None \
                else [None]*1 #len(fieldnames)

        self.description = default_desc() if description is None \
                else description
        self._attribute_names = ["description"]

    def attributeNames(self):
        return copy.copy(self._attribute_names)

    def __contains__(self, fieldname):
        return (fieldname in self.fieldNames()) \
                or (fieldname in self.attributeNames())

    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self. Every field of "i" will give access to
        a field of a single example. Fields should be accessible via
        i["fieldname"] or i[3] (in the order defined by the elements of the
        Example returned by this iterator), but the derived class is free
        to accept any type of identifier, and add extra functionality to the iterator.

        The default implementation calls the minibatches iterator and extracts the first example of each field.
        """
        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))

    def __len__(self):
        """
        len(dataset) returns the number of examples in the dataset.
        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
        Sub-classes which implement finite-length datasets should redefine this method.
        Some methods only make sense for finite-length datasets.
        """
        from sys import maxint
        return maxint


    class MinibatchToSingleExampleIterator(object):
        """
        Converts the result of a minibatch iterator with minibatch_size==1 into
        single-example values in the result. Therefore the result of
        iterating on the dataset itself gives a sequence of single examples
        (whereas the result of iterating over minibatches gives in each
        Example field an iterable object over the individual examples in
        the minibatch).
        """
        def __init__(self, minibatch_iterator):
            self.minibatch_iterator = minibatch_iterator
            self.minibatch = None
        def __iter__(self): #makes for loop work
            return self
        def next(self):
            size1_minibatch = self.minibatch_iterator.next()
            if not self.minibatch:
                names = size1_minibatch.keys()
                # the next lines are a hack, but there was a problem when we were getting [array(327)] for instance
                try:
                    values = [value[0] for value in size1_minibatch.values()]
                except:
                    values = [value for value in size1_minibatch.values()]
                self.minibatch = Example(names,values)
            else:
                self.minibatch._values = [value[0] for value in size1_minibatch.values()]
            return self.minibatch

        def next_index(self):
            return self.minibatch_iterator.next_index()

    class MinibatchWrapAroundIterator(object):
        """
        An iterator for minibatches that handles the case where we need to wrap around the
        dataset because n_batches*minibatch_size > len(dataset). It is constructed from
        a dataset that provides a minibatch iterator that does not need to handle that problem.
        This class is a utility for dataset subclass writers, so that they do not have to handle
        this issue multiple times, nor check that fieldnames are valid, nor handle the
        empty fieldnames (meaning 'use all the fields').
        """
        def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
            self.dataset=dataset
            self.fieldnames=fieldnames
            self.minibatch_size=minibatch_size
            self.n_batches=n_batches
            self.n_batches_done=0
            self.next_row=offset
            self.L=len(dataset)
            self.offset=offset % self.L
            ds_nbatches = (self.L-self.next_row)/self.minibatch_size
            if n_batches is not None:
                ds_nbatches = min(n_batches,ds_nbatches)
            if fieldnames:
                assert dataset.hasFields(*fieldnames)
            else:
                self.fieldnames=dataset.fieldNames()
            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,ds_nbatches,self.next_row)

        def __iter__(self):
            return self

        def next_index(self):
            return self.next_row

        def next(self):
            if self.n_batches and self.n_batches_done==self.n_batches:
                raise StopIteration
            elif not self.n_batches and self.next_row==self.L:
                raise StopIteration
            upper = self.next_row+self.minibatch_size
            if upper <= self.L:
                minibatch = self.iterator.next()
            else:
                if not self.n_batches:
                    upper = min(upper, self.L)
                    # if there is not a fixed number of batches, we continue to the end of the dataset;
                    # this can create a minibatch that is smaller than the minibatch_size
                    assert (self.L-self.next_row)<=self.minibatch_size
                    minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                else:
                    # we must concatenate (vstack) the bottom and top parts of our minibatch
                    # first get the beginning of our minibatch (top of dataset)
                    first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                    second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
                    minibatch = Example(self.fieldnames,
                                        [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
                                         for name in self.fieldnames])
            self.next_row=upper
            self.n_batches_done+=1
            if upper >= self.L and self.n_batches:
                self.next_row -= self.L
                ds_nbatches = (self.L-self.next_row)/self.minibatch_size
                if self.n_batches is not None:
                    ds_nbatches = min(self.n_batches,ds_nbatches)
                self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
                                                                ds_nbatches,self.next_row)
            return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
                                                  self.dataset.valuesHStack),
                                 minibatch.keys())


    minibatches_fieldnames = None
    minibatches_minibatch_size = 1
    minibatches_n_batches = None
    def minibatches(self,
                    fieldnames = minibatches_fieldnames,
                    minibatch_size = minibatches_minibatch_size,
                    n_batches = minibatches_n_batches,
                    offset = 0):
        """
        Return an iterator that supports three forms of syntax:

            for i in dataset.minibatches(None,**kwargs): ...

            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

        Using the first two syntaxes, "i" will be an indexable object, such as a list,
        tuple, or Example instance. In both cases, i[k] is a list-like container
        of a batch of current examples. In the second case, i[0] is a
        list-like container of the f1 field of a batch of current examples, i[1] is
        a list-like container of the f2 field, etc.

        Using the first syntax, all the fields will be returned in "i".
        Using the third syntax, i1, i2, i3 will be list-like containers of the
        f1, f2, and f3 fields of a batch of examples on each loop iteration.

        The minibatches iterator is expected to return upon each call to next()
        a DataSetFields object, which is an Example (indexed by the field names) whose
        elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
        a sub-dataset that can be used to iterate over the individual examples
        in the minibatch. Hence a minibatch can be converted back to a regular
        dataset or its fields can be looked at individually (and possibly iterated over).

        PARAMETERS
        - fieldnames (list of any type, default None):
          The loop variables i1, i2, i3 (in the example above) should contain the
          f1, f2, and f3 fields of the current batch of examples. If None, the
          derived class can choose a default, e.g. all fields.

        - minibatch_size (integer, default 1)
          On every iteration, the variables i1, i2, i3 will have
          exactly minibatch_size elements. e.g. len(i1) == minibatch_size

        @DEPRECATED n_batches : not used anywhere
        - n_batches (integer, default None)
          The iterator will loop exactly this many times, and then stop. If None,
          the derived class can choose a default. If (-1), then the returned
          iterator should support looping indefinitely.

        - offset (integer, default 0)
          The iterator will start at example 'offset' in the dataset, rather than the default.

        Note: A list-like container is something like a tuple, list, numpy.ndarray or
        any other object that supports integer indexing and slicing.

        @ATTENTION: minibatches now delegates to minibatches_nowrap, which is supposed to
        return complete batches only, and to raise StopIteration otherwise.
        @ATTENTION: minibatches returns a LookupList; we can't iterate over examples on it.

        """
        #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset)
        assert offset >= 0
        assert offset < len(self)
        assert offset + minibatch_size -1 < len(self)
        if fieldnames == None:
            fieldnames = self.fieldNames()
        return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        This is the minibatches iterator generator that sub-classes must define.
        It does not need to worry about wrapping around multiple times across the dataset,
        as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called.
        The next() method of the returned iterator does not even need to worry about
        the termination condition (as StopIteration will be raised by DataSet.minibatches
        before an improper call to minibatches_nowrap's next() is made).
        That next() method can assert that its next row will always be within [0,len(dataset)).
        The iterator returned by minibatches_nowrap does not need to implement
        a next_index() method either, as this will be provided by MinibatchWrapAroundIterator.
        """
        raise AbstractFunction()

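    # A minimal sketch of what a subclass's minibatches_nowrap could look like,
    # for a dataset backed by a hypothetical dict '_data' of equal-length lists
    # (illustrative only, not part of the original file):
    #
    #   def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    #       while offset+minibatch_size <= len(self):   # complete batches only
    #           yield Example(fieldnames,
    #                         [self._data[f][offset:offset+minibatch_size]
    #                          for f in fieldnames])
    #           offset += minibatch_size
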
    def is_unbounded(self):
        """
        Tests whether a dataset is unbounded (e.g. a stream).
        """
        return len(self)==maxint

    def hasFields(self,*fieldnames):
        """
        Return true if the given field name (or field names, if multiple arguments are
        given) is recognized by the DataSet (i.e. can be used as a field name in one
        of the iterators).

        The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames()
        method. Many datasets may store their field names in a dictionary, which would allow more efficiency.
        """
        # all the given names must be known to this dataset
        return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))==len(fieldnames)

    def fieldNames(self):
        """
        Return the list of field names that are supported by the iterators,
        and for which hasFields(fieldname) would return True.
        """
        raise AbstractFunction()

    def __call__(self,*fieldnames):
        """
        Return a dataset that sees only the fields whose names are specified.
        """
        assert self.hasFields(*fieldnames)
        #return self.fields(*fieldnames).examples()
        fieldnames_list = list(fieldnames)
        return FieldsSubsetDataSet(self,fieldnames_list)

    def cached_fields_subset(self,*fieldnames):
        """
        Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached.
        @see: dataset.__call__
        """
        assert self.hasFields(*fieldnames)
        return self.fields(*fieldnames).examples()

    def fields(self,*fieldnames):
        """
        Return a DataSetFields object associated with this dataset.
        """
        return DataSetFields(self,fieldnames)

    def getitem_key(self, fieldname):
        """A not-so-well thought-out place to put code that used to be in
        getitem.
        """
        #removing as per discussion June 4. --JSB

        i = fieldname
        # else check for a fieldname
        if self.hasFields(i):
            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
        # else we are trying to access a property of the dataset
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def __getitem__(self,i):
        """
        @rtype: Example
        @returns: single or multiple examples

        @type i: integer or slice or <iterable> of integers
        @param i:
            dataset[i] returns the (i+1)-th example of the dataset.
            dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
            dataset[i:j:s] returns a LookupList with examples i,i+s,i+2*s,...,j-s.
            dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.

        @note:
        Some stream datasets may be unable to implement random access, i.e.
        arbitrary slicing/indexing, because they can only iterate through
        examples one or a minibatch at a time and do not actually store or keep
        past (or future) examples.

        The default implementation of getitem uses the minibatches iterator
        to obtain one example, one slice, or a list of examples. It may not
        always be the most efficient way to obtain the result, especially if
        the data are actually stored in a memory array.
        """

        if type(i) is int:
            assert i >= 0 # TBM: see if someone complains and wants negative i
            if i >= len(self):
                raise IndexError
            i_batch = self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=1, n_batches=1, offset=i)
            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()

        #if i is a contiguous slice
        if type(i) is slice and (i.step in (None, 1)):
            offset = 0 if i.start is None else i.start
            upper_bound = len(self) if i.stop is None else i.stop
            upper_bound = min(len(self), upper_bound)
            #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
            #        minibatch_size=upper_bound - offset,
            #        n_batches=1,
            #        offset=offset).next())
            # now returns a LookupList
            return self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=upper_bound - offset,
                    n_batches=1,
                    offset=offset).next()

        # if slice has a step param, convert it to list and handle it with the
        # list code
        if type(i) is slice:
            offset = 0 if i.start is None else i.start
            upper_bound = len(self) if i.stop is None else i.stop
            upper_bound = min(len(self), upper_bound)
            i = list(range(offset, upper_bound, i.step))

        # handle tuples, arrays, lists
        if hasattr(i, '__getitem__'):
            for idx in i:
                #dis-allow nested slices
                if not isinstance(idx, int):
                    raise TypeError(idx)
                if idx >= len(self):
                    raise IndexError
            # call back into self.__getitem__
            examples = [self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=1, n_batches=1, offset=ii).next()
                    for ii in i]
            # re-index the fields in each example by field instead of by example
            field_values = [[] for blah in self.fieldNames()]
            for e in examples:
                for f,v in zip(field_values, e):
                    f.append(v)
            #build them into a LookupList (a.k.a. Example)
            zz = zip(self.fieldNames(),field_values)
            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
            example = Example(self.fieldNames(), vst)
            #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
            # now returns a LookupList
            return example

        # what in the world is i?
        raise TypeError(i, type(i))

    """
    Enables the call dataset.subset[a:b:c] that will return a DataSet
    around the examples returned by __getitem__(slice(a,b,c))

    @SEE DataSet.__getsubset(self)
    """
    subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet")

    def __getsubset(self):
        """
        Enables the call data.subset[a:b:c], returns a DataSet.
        Default implementation is a simple wrap around __getitem__() using MinibatchDataSet.

        @RETURN DataSet
        @SEE DataSet.subset = property(lambda s : s.__getsubset())
        """
        _self = self
        class GetSliceReturnsDataSet(object):
            def __getitem__(self,slice):
                return MinibatchDataSet(_self.__getitem__(slice))
        return GetSliceReturnsDataSet()


    def valuesHStack(self,fieldnames,fieldvalues):
        """
        Return a value that corresponds to concatenating (horizontally) several field values.
        This can be useful to merge some fields. The implementation of this operation is likely
        to involve a copy of the original values. When the values are numpy arrays, the
        result should be numpy.hstack(values). If it makes sense, this operation should
        work as well when each value corresponds to multiple examples in a minibatch,
        e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
        then the result should be a Lx(N1+N2+...) matrix equal to numpy.hstack(values).
        The default is to use numpy.hstack for numpy.ndarray values, and a list
        pointing to the original values for other data types.
        """
        all_numpy=True
        for value in fieldvalues:
            if not type(value) is numpy.ndarray:
                all_numpy=False
        if all_numpy:
            return numpy.hstack(fieldvalues)
        # the default implementation of horizontal stacking is to put values in a list
        return fieldvalues

    def valuesVStack(self,fieldname,values):
        """
        @param fieldname: the name of the field from which the values were taken
        @type fieldname: any type

        @param values: bits near the beginning or end of the dataset
        @type values: list of minibatches (returned by minibatches_nowrap)

        @return: the concatenation (stacking) of the values
        @rtype: something suitable as a minibatch field
        """
        rval = []
        for v in values:
            rval.extend(v)
        return rval

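    # With this default valuesVStack (illustrative sketch only):
    #   valuesVStack('x', [[1,2],[3,4]])  # -> [1,2,3,4]
    # ArrayFieldsDataSet overrides it with numpy.vstack, so that two
    # minibatch matrices are stacked into a taller matrix instead.
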
    def __or__(self,other):
        """
        dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the lists of
        fields of the argument datasets. This only works if they all have the same length.
        """
        return HStackedDataSet([self,other])

    def __and__(self,other):
        """
        dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
        (and whose length is the sum of the lengths of the argument datasets). This only
        works if they all have the same fields.
        """
        return VStackedDataSet([self,other])

def hstack(datasets):
    """
    hstack([dataset1,dataset2,...]) returns dataset1 | dataset2 | ...
    which is a dataset whose fields list is the concatenation of the fields
    of the individual datasets.
    """
    assert len(datasets)>0
    if len(datasets)==1:
        return datasets[0]
    return HStackedDataSet(datasets)

def vstack(datasets):
    """
    vstack([dataset1,dataset2,...]) returns dataset1 & dataset2 & ...
    which is a dataset which iterates first over the examples of dataset1, then
    over those of dataset2, etc.
    """
    assert len(datasets)>0
    if len(datasets)==1:
        return datasets[0]
    return VStackedDataSet(datasets)
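
# Illustrative usage of hstack/vstack (a sketch only, not part of the original
# file; the ds_* names stand for hypothetical compatible datasets):
#   wider  = hstack([ds_inputs, ds_targets])  # same length, more fields
#   longer = vstack([ds_train1, ds_train2])   # same fields, more examples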

class FieldsSubsetDataSet(DataSet):
    """
    A sub-class of L{DataSet} that selects a subset of the fields.
    """
    def __init__(self,src,fieldnames):
        self.src=src
        self.fieldnames=fieldnames
        assert src.hasFields(*fieldnames)
        self.valuesHStack = src.valuesHStack
        self.valuesVStack = src.valuesVStack

    def __len__(self): return len(self.src)

    def fieldNames(self):
        return self.fieldnames

    def __iter__(self):
        class FieldsSubsetIterator(object):
            def __init__(self,ds):
                self.ds=ds
                self.src_iter=ds.src.__iter__()
                self.example=None
            def __iter__(self): return self
            def next(self):
                complete_example = self.src_iter.next()
                if self.example:
                    self.example._values=[complete_example[field]
                                          for field in self.ds.fieldnames]
                else:
                    self.example=Example(self.ds.fieldnames,
                                         [complete_example[field] for field in self.ds.fieldnames])
                return self.example
        return FieldsSubsetIterator(self)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        assert self.hasFields(*fieldnames)
        return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
    def dontuse__getitem__(self,i):
        return FieldsSubsetDataSet(self.src[i],self.fieldnames)

class RenamedFieldsDataSet(DataSet):
    """
    A sub-class of L{DataSet} that selects and renames a subset of the fields.
    """
    def __init__(self,src,src_fieldnames,new_fieldnames):
        self.src=src
        self.src_fieldnames=src_fieldnames
        self.new_fieldnames=new_fieldnames
        assert src.hasFields(*src_fieldnames)
        assert len(src_fieldnames)==len(new_fieldnames)
        self.valuesHStack = src.valuesHStack
        self.valuesVStack = src.valuesVStack
        self.lookup_fields = Example(new_fieldnames,src_fieldnames)

    def __len__(self): return len(self.src)

    def fieldNames(self):
        return self.new_fieldnames

    def __iter__(self):
        class FieldsSubsetIterator(object):
            def __init__(self,ds):
                self.ds=ds
                self.src_iter=ds.src.__iter__()
                self.example=None
            def __iter__(self): return self
            def next(self):
                complete_example = self.src_iter.next()
                if self.example:
                    self.example._values=[complete_example[field]
                                          for field in self.ds.src_fieldnames]
                else:
                    self.example=Example(self.ds.new_fieldnames,
                                         [complete_example[field]
                                          for field in self.ds.src_fieldnames])
                return self.example
        return FieldsSubsetIterator(self)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        assert self.hasFields(*fieldnames)
        cursor = Example(fieldnames,[0]*len(fieldnames))
        for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset):
            cursor._values=batch._values
            yield cursor

    def __getitem__(self,i):
        #return FieldsSubsetDataSet(self.src[i],self.new_fieldnames)
        complete_example = self.src[i]
        return Example(self.new_fieldnames,
                       [complete_example[field]
                        for field in self.src_fieldnames])
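
    # Illustrative usage (a sketch only, not part of the original file;
    # 'ds' stands for a dataset with fields 'x' and 'y'):
    #   renamed = RenamedFieldsDataSet(ds, ['x','y'], ['input','target'])
    #   renamed.fieldNames()   # -> ['input','target']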


class DataSetFields(Example):
    """
    Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
    DataSetFields iterates over fields (like columns of a matrix), and can be understood
    as a transpose of the associated dataset.

    To iterate over fields, one can do
      * for fields in dataset.fields()
      * for fields in dataset(field1,field2,...).fields() to select a subset of fields
      * for fields in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
      * for field_examples in dataset.fields():
            for example_value in field_examples:
                ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a DataSetFields object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method::

        dataset2 = dataset1.fields().examples()

    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    DataSetFields can be concatenated vertically or horizontally. To be consistent with
    the syntax used for DataSets, the | concatenates the examples and the & concatenates
    the fields.
    """
    def __init__(self,dataset,fieldnames):
        original_dataset=dataset
        if not fieldnames:
            fieldnames=dataset.fieldNames()
        elif not list(fieldnames)==list(dataset.fieldNames()):
            #we must cast to list, otherwise ('x','y')!=['x','y']
            dataset = FieldsSubsetDataSet(dataset,fieldnames)
        assert dataset.hasFields(*fieldnames)
        self.dataset=dataset

        if isinstance(dataset,MinibatchDataSet):
            Example.__init__(self,fieldnames,list(dataset._fields))
        elif isinstance(original_dataset,MinibatchDataSet):
            Example.__init__(self,fieldnames,
                             [original_dataset._fields[field]
                              for field in fieldnames])
        else:
            minibatch_iterator = dataset.minibatches(fieldnames,
                                                     minibatch_size=len(dataset),
                                                     n_batches=1)
            minibatch=minibatch_iterator.next()
            Example.__init__(self,fieldnames,minibatch)

    def examples(self):
        return self.dataset

    def __or__(self,other):
        """
        fields1 | fields2 is a DataSetFields whose list of examples is the concatenation
        of the lists of examples of DataSetFields fields1 and fields2.
        """
        # concatenating examples corresponds to '&' (vstack) on the underlying datasets
        return (self.examples() & other.examples()).fields()

    def __and__(self,other):
        """
        fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
        of the fields of DataSetFields fields1 and fields2.
        """
        return (self.examples() | other.examples()).fields()


class MinibatchDataSet(DataSet):
    """
    Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset.
    Each element of the lookup-list should be an iterable and sliceable, all of the same length.
    """
    def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
                 values_hstack=DataSet().valuesHStack):
        """
        The user can (and generally should) also provide a values_vstack(fieldname,fieldvalues)
        and a values_hstack(fieldnames,fieldvalues) function behaving with the same
        semantics as the DataSet methods of the same name (but without the self argument).
        """

        self._fields=fields_lookuplist
        assert len(fields_lookuplist)>0
        self.length=len(fields_lookuplist[0])
        for field in fields_lookuplist[1:]:
            if self.length != len(field):
                print 'self.length =',self.length
                print 'len(field) =',len(field)
                print 'self._fields.keys() =',self._fields.keys()
                print 'field =',field
                print 'fields_lookuplist =',fields_lookuplist
            assert self.length==len(field)
        self.valuesVStack=values_vstack
        self.valuesHStack=values_hstack

    def __len__(self):
        return self.length

    def dontuse__getitem__(self,i):
        if type(i) in (slice,list):
            return DataSetFields(MinibatchDataSet(
                Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
        if type(i) is int:
            return Example(self._fields.keys(),[field[i] for field in self._fields])
        if self.hasFields(i):
            return self._fields[i]
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def fieldNames(self):
        return self._fields.keys()

    def hasFields(self,*fieldnames):
        for fieldname in fieldnames:
            if fieldname not in self._fields.keys():
                return False
        return True

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        #@TODO bug somewhere here, fieldnames doesn't seem to be well handled
        class Iterator(object):
            def __init__(self,ds,fieldnames):
                # tbm: added the next two lines to handle fieldnames
                if fieldnames is None: fieldnames = ds._fields.keys()
                self.fieldnames = fieldnames

                self.ds=ds
                self.next_example=offset
                assert minibatch_size >= 0
                if offset+minibatch_size > ds.length:
                    raise NotImplementedError()
            def __iter__(self):
                return self
            def next(self):
                upper = self.next_example+minibatch_size
                if upper > len(self.ds):
                    raise StopIteration()
                assert upper<=len(self.ds) # instead of self.ds.length
                #minibatch = Example(self.ds._fields.keys(),
                #                    [field[self.next_example:upper]
                #                     for field in self.ds._fields])
                # tbm: modif to use fieldnames
                values = []
                for f in self.fieldnames:
                    #print 'we have field',f,'in fieldnames'
                    values.append( self.ds._fields[f][self.next_example:upper] )
                minibatch = Example(self.fieldnames,values)
                #print minibatch
                self.next_example+=minibatch_size
                return minibatch

        # tbm: added fieldnames to handle subset of fieldnames
        return Iterator(self,fieldnames)
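
    # Illustrative usage (a sketch only, not part of the original file):
    #   mbds = MinibatchDataSet(Example(['x','y'], [[1,2,3],[4,5,6]]))
    #   len(mbds)           # -> 3
    #   mbds.fieldNames()   # -> ['x','y']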

class HStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
    i.e. whose list of fields is the concatenation of their lists of fields.

    If a field name is found in more than one of the datasets, then either an error is
    raised or the fields are renamed (either by prefixing the __name__ attribute
    of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).

    @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
    """
    def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
        DataSet.__init__(self,description,field_types)
        self.datasets=datasets
        self.accept_nonunique_names=accept_nonunique_names
        self.fieldname2dataset={}

        def rename_field(fieldname,dataset,i):
            if hasattr(dataset,"__name__"):
                return dataset.__name__ + "." + fieldname
            return fieldname+"."+str(i)

        # make sure all datasets have the same length and unique field names
        self.length=None
        names_to_change=[]
        for i in xrange(len(datasets)):
            dataset = datasets[i]
            length=len(dataset)
            if self.length:
                assert self.length==length
            else:
                self.length=length
            for fieldname in dataset.fieldNames():
                if fieldname in self.fieldname2dataset: # name conflict!
                    if accept_nonunique_names:
                        fieldname=rename_field(fieldname,dataset,i)
                        names_to_change.append((fieldname,i))
                    else:
                        raise ValueError("Incompatible datasets: non-unique field name = "+fieldname)
                self.fieldname2dataset[fieldname]=i
        for fieldname,i in names_to_change:
            del self.fieldname2dataset[fieldname]
            self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i

    def __len__(self):
        return len(self.datasets[0])

    def hasFields(self,*fieldnames):
        for fieldname in fieldnames:
            if not fieldname in self.fieldname2dataset:
                return False
        return True

    def fieldNames(self):
        return self.fieldname2dataset.keys()

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):

        class HStackedIterator(object):
            def __init__(self,hsds,iterators):
                self.hsds=hsds
                self.iterators=iterators
            def __iter__(self):
                return self
            def next(self):
                # concatenate all the fields of the minibatches
                l=Example()
                for iter in self.iterators:
                    l.append_lookuplist(iter.next())
                return l

        assert self.hasFields(*fieldnames)
        # find out which underlying datasets are necessary to service the required fields
        # and construct corresponding minibatch iterators
        if fieldnames and fieldnames!=self.fieldNames():
            datasets=set([])
            fields_in_dataset=dict([(dataset,[]) for dataset in self.datasets])
            for fieldname in fieldnames:
                dataset=self.datasets[self.fieldname2dataset[fieldname]]
                datasets.add(dataset)
                fields_in_dataset[dataset].append(fieldname)
            datasets=list(datasets)
            iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset)
                       for dataset in datasets]
        else:
            datasets=self.datasets
            iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets]
        return HStackedIterator(self,iterators)


    def untested_valuesVStack(self,fieldname,fieldvalues):
        return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)

    def untested_valuesHStack(self,fieldnames,fieldvalues):
        """
        We will use the sub-dataset associated with the first fieldname in the fieldnames list
        to do the work, hoping that it can cope with the other values (i.e. won't care
        about the incompatible fieldnames). Hence this heuristic will always work if
        all the fieldnames are of the same sub-dataset.
        """
        return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)

class VStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        self.datasets=datasets
        self.length=0
        self.index2dataset={}
        assert len(datasets)>0
        fieldnames = datasets[-1].fieldNames()
        self.datasets_start_row=[]
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        for k,dataset in enumerate(datasets[0:-1]):
            # all VStacked datasets (except possibly the last) must be bounded (have a length)
            assert not dataset.is_unbounded()
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
            assert dataset.fieldNames()==fieldnames
        self.datasets_start_row.append(self.length)
        self.length+=len(datasets[-1])
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print "A more efficient mechanism for index2dataset should be implemented"

    def __len__(self):
        return self.length

    def fieldNames(self):
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """Return (dataset_index, row_within_dataset) for global row number"""
        dataset_index = self.index2dataset[row]
        row_within_dataset = row - self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):

        class VStackedIterator(object):
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset
                self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset)
                self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                  self.next_iterator(vsds.datasets[self.next_dataset_index],self.next_dataset_row,n_batches)

            def next_iterator(self,dataset,starting_offset,batches_left):
                L=len(dataset)
                ds_nbatches = (L-starting_offset)/minibatch_size
                if batches_left is not None:
                    ds_nbatches = min(batches_left,ds_nbatches)
                if minibatch_size>L:
                    ds_minibatch_size=L
                    n_left_in_mb=minibatch_size-L
                    ds_nbatches=1
                else:
                    ds_minibatch_size=minibatch_size
                    n_left_in_mb=0
                return dataset.minibatches(fieldnames,ds_minibatch_size,ds_nbatches,starting_offset), \
                       L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb

            def move_to_next_dataset(self):
                if self.n_left_at_the_end_of_ds>0:
                    self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                      self.next_iterator(self.vsds.datasets[self.next_dataset_index],
                                         self.n_left_at_the_end_of_ds,1)
                else:
                    self.next_dataset_index +=1
                    if self.next_dataset_index==len(self.vsds.datasets):
                        self.next_dataset_index = 0
                    self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \
                      self.next_iterator(self.vsds.datasets[self.next_dataset_index],0,n_batches)

            def __iter__(self):
                return self

            def next(self):
                dataset=self.vsds.datasets[self.next_dataset_index]
                mb = self.current_iterator.next()
                if self.n_left_in_mb:
                    extra_mb = []
                    while self.n_left_in_mb>0:
                        self.move_to_next_dataset()
                        extra_mb.append(self.current_iterator.next())
                    mb = Example(fieldnames,
                                 [dataset.valuesVStack(name,
                                                       [mb[name]]+[b[name] for b in extra_mb])
                                  for name in fieldnames])

                self.next_row+=minibatch_size
                self.next_dataset_row+=minibatch_size
                if self.next_dataset_row+minibatch_size>len(dataset):
                    self.move_to_next_dataset()
                return mb
        return VStackedIterator(self)

class ArrayFieldsDataSet(DataSet):
    """
    Virtual super-class of datasets whose field values are numpy arrays,
    thus defining valuesHStack and valuesVStack for sub-classes.
    """
    def __init__(self,description=None,field_types=None):
        DataSet.__init__(self,description,field_types)
    def untested_valuesHStack(self,fieldnames,fieldvalues):
        """Concatenate field values horizontally, e.g. two vectors
        become a longer vector, two matrices become a wider matrix, etc."""
        return numpy.hstack(fieldvalues)
    def untested_valuesVStack(self,fieldname,values):
        """Concatenate field values vertically, e.g. two vectors
        become a two-row matrix, two matrices become a taller matrix, etc."""
        return numpy.vstack(values)


class NArraysDataSet(ArrayFieldsDataSet):
    """
    An NArraysDataSet stores fields that are numpy tensors, whose first axis
    iterates over examples. It's a generalization of ArrayDataSet.
    """
    #@TODO not completely implemented yet
    def __init__(self, data_arrays, fieldnames, **kwargs):
        """
        Construct an NArraysDataSet from a list of numpy tensors (data_arrays) and a list
        of fieldnames. The number of arrays must be the same as the number of
        fieldnames. Each tensor must have the same first dimension (first
        axis), corresponding to the number of examples.

        Every tensor is treated as a numpy array (using numpy.asarray).
        """
        ArrayFieldsDataSet.__init__(self,**kwargs)
        assert len(data_arrays) == len(fieldnames)
        assert len(fieldnames) > 0
        ndarrays = [numpy.asarray(a) for a in data_arrays]
        lens = [a.shape[0] for a in ndarrays]
        num_examples = lens[0] #they must all be equal anyway
        self._fieldnames = fieldnames
        for k in ndarrays:
            assert k.shape[0] == num_examples
        self._datas = ndarrays
        # create dict mapping fieldname -> position in self._datas
        self.map_field_idx = dict()
        for k in range(len(fieldnames)):
            self.map_field_idx[fieldnames[k]] = k


    def __len__(self):
        """
        Length of the dataset is based on the first array = data_arrays[0], using its shape
        """
        return self._datas[0].shape[0]

    def fieldNames(self):
        """
        Returns the fieldnames as set in self.__init__
        """
        return self._fieldnames

    def field_pos(self,fieldname):
        """
        Returns the index of a given fieldname. The fieldname must exist! see fieldNames().
        """
        return self.map_field_idx[fieldname]

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
        cursor = Example(fieldnames,[0]*len(fieldnames))
        if n_batches == None:
            n_batches = (len(self) - offset) / minibatch_size
        for n in xrange(n_batches):
            if offset == len(self):
                break
            for f in range(len(cursor._names)):
                idx = self.field_pos(cursor._names[f])
                sub_data = self._datas[idx][offset : offset+minibatch_size]
                cursor._values[f] = sub_data
            offset += len(sub_data) #can be less than minibatch_size at end
            yield cursor

        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
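
    # Illustrative usage (a sketch only, not part of the original file):
    #   x = numpy.random.rand(100,5)
    #   y = numpy.random.rand(100)
    #   ds = NArraysDataSet([x,y], ['input','target'])
    #   len(ds)   # -> 100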


class ArrayDataSet(ArrayFieldsDataSet):
    """
    An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
    whose first axis iterates over examples, second axis determines fields.
    If the underlying array is N-dimensional (has N axes), then the field
    values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
    """

    def __init__(self, data_array, fields_columns, **kwargs):
        """
        Construct an ArrayDataSet from the underlying numpy array (data) and
        a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
        using the standard arguments for indexing/slicing: integer for a column index,
        slice for an interval of columns (with possible stride), or iterable of column indices.
        """
        ArrayFieldsDataSet.__init__(self, **kwargs)
        self.data=data_array
        self.fields_columns=fields_columns

        # check consistency and complete slices definitions
        for fieldname, fieldcolumns in self.fields_columns.items():
            if type(fieldcolumns) is int:
                assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
                if 1:
                    #I changed this because it didn't make sense to me,
                    # and it made it more difficult to write my learner.
                    # If it breaks stuff, let's talk about it.
                    # - James 22/05/2008
                    self.fields_columns[fieldname]=[fieldcolumns]
                else:
                    self.fields_columns[fieldname]=fieldcolumns
            elif type(fieldcolumns) is slice:
                start,step=fieldcolumns.start,fieldcolumns.step
                if not start:
                    start=0
                if not step:
                    step=1
                self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step)
            elif hasattr(fieldcolumns,"__iter__"): # something like a list
                for i in fieldcolumns:
                    assert i>=0 and i<data_array.shape[1]

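    # Illustrative usage (a sketch only, not part of the original file):
    #   data = numpy.random.rand(100,4)
    #   ds = ArrayDataSet(data, {'input':slice(0,3), 'target':3})
    #   ds[0]['input']   # columns 0..2 of the first example
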
1240 def fieldNames(self): | |
1241 return self.fields_columns.keys() | |
1242 | |
1243 def __len__(self): | |
1244 return len(self.data) | |
1245 | |
    def __getitem__(self, key):
        """More efficient implementation than the default __getitem__"""
        fieldnames = self.fields_columns.keys()
        values = self.fields_columns.values()
        if type(key) is int or type(key) is slice:
            return Example(fieldnames,
                           [self.data[key, col] for col in values])
        if type(key) is list:
            # translate fieldnames in the key into their column specification,
            # without mutating the caller's list
            key = [self.fields_columns[k] if self.hasFields(k) else k
                   for k in key]
            return Example(fieldnames,
                           # list columns must be indexed in two steps:
                           # numpy's self.data[[i1,...],[i2,...]] selects
                           # elements pointwise rather than the cross-product
                           # of rows and columns
                           [self.data[key,:][:,col]
                            if isinstance(col, list) else
                            self.data[key, col] for col in values])

        # else check for a fieldname
        if self.hasFields(key):
            return self.data[:, self.fields_columns[key]]
        # else we are trying to access a property of the dataset
        assert key in self.__dict__  # else the property does not exist
        return self.__dict__[key]

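    # A sketch of the key types accepted above (not part of the original
    # file), reusing the hypothetical 'ds' built in the constructor example:
    #
    #   ds[0]          # int key: Example holding the fields of row 0
    #   ds[0:2]        # slice key: Example whose field values are 2-D arrays
    #   ds[[0, 1]]     # list key: rows 0 and 1, gathered field by field
    #   ds['target']   # fieldname key: the raw column(s) of that field
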
    def dontuse__iter__(self):
        class ArrayDataSetIteratorIter(object):
            def __init__(self, dataset, fieldnames):
                if fieldnames is None: fieldnames = dataset.fieldNames()
                # store the resulting minibatch in a lookup-list of values
                self.minibatch = Example(fieldnames, [0]*len(fieldnames))
                self.dataset = dataset
                self.current = 0
                self.columns = [self.dataset.fields_columns[f]
                                for f in self.minibatch._names]
                self.l = self.dataset.data.shape[0]
            def __iter__(self):
                return self
            def next(self):
                #@todo: we assume we only need to stop when minibatch_size == 1;
                # otherwise, MinibatchWrapAroundIterator does it.
                if self.current >= self.l:
                    raise StopIteration
                sub_data = self.dataset.data[self.current]
                self.minibatch._values = [sub_data[c] for c in self.columns]

                self.current += 1
                return self.minibatch

        return ArrayDataSetIteratorIter(self, self.fieldNames())

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
        cursor = Example(fieldnames, [0]*len(fieldnames))
        if n_batches is None:
            n_batches = (len(self) - offset) / minibatch_size
        for n in xrange(n_batches):
            if offset == len(self):
                break
            sub_data = self.data[offset : offset+minibatch_size]
            offset += len(sub_data)  # can be less than minibatch_size at the end
            cursor._values = [sub_data[:, self.fields_columns[f]] for f in cursor._names]
            yield cursor

        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)

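# A usage sketch (not part of the original file): iterating over minibatches
# of an ArrayDataSet; each yielded value is an Example (a LookupList) holding
# one sub-array per requested field. Reuses the hypothetical 'ds' from above.
#
#   for batch in ds.minibatches(['input', 'target'], minibatch_size=2):
#       x, y = batch['input'], batch['target']
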
class CachedDataSet(DataSet):
    """
    Wrap a L{DataSet} whose values are computationally expensive to obtain
    (e.g. because they involve some computation, or disk access),
    so that repeated accesses to the same example are cheap:
    every example value that has been accessed at least once is cached.

    Optionally, for finite-length datasets, all the values can be computed
    (and cached) upon construction of the CachedDataSet, rather than at
    first access.

    @todo: when cache_all_upon_construction, create mini-batches that are as
    large as possible but not so large as to fill up memory.

    @todo: add disk-buffering capability, so that when the cache becomes too
    big for memory, we cache things on disk, trying to keep in memory only
    the records most likely to be accessed next.
    """
    def __init__(self, source_dataset, cache_all_upon_construction=False):
        self.source_dataset = source_dataset
        self.cache_all_upon_construction = cache_all_upon_construction
        self.cached_examples = []
        if cache_all_upon_construction:
            # this potentially brings all the source examples
            # into memory at once, which may be too much;
            # the work could possibly be done by minibatches
            # that are as large as possible but no more than what memory allows.
            #
            # fields_values is supposed to be a DataSetFields, which inherits from LookupList
            #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
            fields_values = DataSetFields(source_dataset, None)
            assert all([len(self) == len(field_values) for field_values in fields_values])
            for example in fields_values.examples():
                self.cached_examples.append(copy.copy(example))

        self.fieldNames = source_dataset.fieldNames
        self.hasFields = source_dataset.hasFields
        self.valuesHStack = source_dataset.valuesHStack
        self.valuesVStack = source_dataset.valuesVStack

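    # A usage sketch (not part of the original file); 'expensive_ds' stands
    # for any hypothetical DataSet whose example values are costly to compute:
    #
    #   cached = CachedDataSet(expensive_ds)
    #   for example in cached: pass   # values computed and cached on this pass
    #   for example in cached: pass   # ...and read back from the cache here
    #
    #   # or pay the whole cost up front:
    #   eager = CachedDataSet(expensive_ds, cache_all_upon_construction=True)
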
    def __len__(self):
        return len(self.source_dataset)

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        class CacheIterator(object):
            def __init__(self, dataset):
                self.dataset = dataset
                self.current = offset
                self.all_fields = self.dataset.fieldNames() == fieldnames
                self.n_batches = n_batches
                self.batch_counter = 0
            def __iter__(self): return self
            def next(self):
                self.batch_counter += 1
                if self.n_batches and self.batch_counter > self.n_batches:
                    raise StopIteration()
                upper = self.current + minibatch_size
                if upper > len(self.dataset.source_dataset):
                    raise StopIteration()
                cache_len = len(self.dataset.cached_examples)
                if upper > cache_len:  # the whole minibatch is not already in cache
                    # cache everything from the current cache length up to upper
                    #for example in self.dataset.source_dataset[cache_len:upper]:
                    for example in self.dataset.source_dataset.subset[cache_len:upper]:
                        self.dataset.cached_examples.append(example)
                all_fields_minibatch = Example(self.dataset.fieldNames(),
                        zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))

                self.current += minibatch_size
                if self.all_fields:
                    return all_fields_minibatch
                return Example(fieldnames, [all_fields_minibatch[name] for name in fieldnames])
        return CacheIterator(self)

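    # A sketch (not part of the original file): taking minibatches fills the
    # cache incrementally, so only the examples visited so far get cached:
    #
    #   names = cached.fieldNames()
    #   for batch in cached.minibatches(names, minibatch_size=10, n_batches=3):
    #       pass   # afterwards, at most 30 examples sit in cached.cached_examples
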
    def dontuse__getitem__(self, i):
        if type(i) == int and len(self.cached_examples) > i:
            return self.cached_examples[i]
        else:
            return self.source_dataset[i]

    def __iter__(self):
        class CacheIteratorIter(object):
            def __init__(self, dataset):
                self.dataset = dataset
                self.l = len(dataset)
                self.current = 0
                self.fieldnames = self.dataset.fieldNames()
                self.example = Example(self.fieldnames, [0]*len(self.fieldnames))
            def __iter__(self): return self
            def next(self):
                if self.current >= self.l:
                    raise StopIteration
                cache_len = len(self.dataset.cached_examples)
                if self.current >= cache_len:  # this example is not already in cache
                    # cache the next example
                    self.dataset.cached_examples.append(
                        self.dataset.source_dataset[self.current])
                self.example._values = self.dataset.cached_examples[self.current]
                self.current += 1
                return self.example

        return CacheIteratorIter(self)

class ApplyFunctionDataSet(DataSet):
    """
    A L{DataSet} that contains as fields the results of applying a
    given function example-wise or minibatch-wise to all the fields of
    an input dataset. The output of the function should be an iterable
    (e.g. a list or a LookupList) over the resulting values.

    The function takes as input the fields of the dataset, not the examples.

    In minibatch mode, the function is expected to work on minibatches:
    it takes a minibatch as input and returns a minibatch as output. More
    precisely, each element of the input or output list
    should be iterable and indexable over the individual example values
    (typically these elements will be numpy arrays). All of the elements
    in the input and output lists should have the same length, which is
    the length of the minibatch.

    The function is applied each time an example or a minibatch is accessed.
    To avoid re-doing the computation, wrap this dataset inside a CachedDataSet.

    If the values_{h,v}stack functions are not provided, then
    the input_dataset.values{H,V}Stack functions are used by default.
    """

    def __init__(self, input_dataset, function, output_names, minibatch_mode=True,
                 values_hstack=None, values_vstack=None,
                 description=None, fieldtypes=None):
        """
        The constructor takes an input dataset that has as many fields as the
        function expects as inputs. The resulting dataset has as many fields
        as the function produces as outputs, which should correspond to the
        number of output names (provided in a list).

        Note that the expected semantics of the function differ in minibatch
        mode (it takes minibatches of inputs and produces minibatches of
        outputs, as documented in the class comment).

        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
        (for the new dataset created)?
        """
        self.input_dataset = input_dataset
        self.function = function
        self.output_names = output_names
        self.minibatch_mode = minibatch_mode
        DataSet.__init__(self, description, fieldtypes)
        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack

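    # A usage sketch (not part of the original file): 'ds1' is a hypothetical
    # dataset with a single field 'x'. In minibatch mode the function receives
    # one array per input field and must return one iterable per output field:
    #
    #   doubled = ApplyFunctionDataSet(ds1, lambda x: (2 * x,), ['twice_x'])
    #   for batch in doubled.minibatches(['twice_x'], minibatch_size=2):
    #       print batch['twice_x']
    #
    #   # with minibatch_mode=False the same function would instead be called
    #   # once per example, on individual field values
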
    def __len__(self):
        return len(self.input_dataset)

    def fieldNames(self):
        return self.output_names

    def minibatches_nowrap(self, fieldnames, *args, **kwargs):
        all_input_fieldNames = self.input_dataset.fieldNames()
        mbnw = self.input_dataset.minibatches_nowrap

        for input_fields in mbnw(all_input_fieldNames, *args, **kwargs):
            if self.minibatch_mode:
                all_output_fields = self.function(*input_fields)
            else:
                input_examples = zip(*input_fields)  # so that [i] means example i
                output_examples = [self.function(*input_example)
                                   for input_example in input_examples]
                all_output_fields = zip(*output_examples)

            all_outputs = Example(self.output_names, all_output_fields)
            if fieldnames == self.output_names:
                rval = all_outputs
            else:
                rval = Example(fieldnames, [all_outputs[name] for name in fieldnames])
            yield rval

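    # A sketch (not part of the original file) of the example-wise path above:
    # zip(*...) flips between one-list-per-field and one-tuple-per-example.
    #
    #   input_fields = [[x0, x1], [y0, y1]]           # one list per field
    #   zip(*input_fields)  ->  [(x0, y0), (x1, y1)]  # one tuple per example
    #   # the function maps each (x_i, y_i) to an output tuple, and a second
    #   # zip(*output_examples) turns those back into one list per output field
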
    def untested__iter__(self):  # only implemented for increased efficiency
        class ApplyFunctionSingleExampleIterator(object):
            def __init__(self, output_dataset):
                self.current = 0
                self.output_dataset = output_dataset
                self.input_iterator = output_dataset.input_dataset.__iter__()
            def __iter__(self): return self
            def next(self):
                if self.output_dataset.minibatch_mode:
                    # wrap each input value in a length-1 minibatch
                    function_inputs = [[input] for input in self.input_iterator.next()]
                    outputs = self.output_dataset.function(*function_inputs)
                    assert all([hasattr(output, '__iter__') for output in outputs])
                    function_outputs = [output[0] for output in outputs]
                else:
                    function_inputs = self.input_iterator.next()
                    function_outputs = self.output_dataset.function(*function_inputs)
                return Example(self.output_dataset.output_names, function_outputs)
        return ApplyFunctionSingleExampleIterator(self)

def supervised_learning_dataset(src_dataset, input_fields, target_fields, weight_field=None):
    """
    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
    by forcing the user to define a set of fields as the 'input' field
    and a set of fields as the 'target' field. Optionally, a single
    weight_field can also be defined.
    """
    args = ((input_fields, 'input'), (target_fields, 'target'))
    if weight_field: args += (([weight_field], 'weight'),)
    return src_dataset.merge_fields(*args)
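
# A usage sketch (not part of the original file): given a hypothetical
# dataset 'ds2' with fields 'x', 'y' and 'w', build a supervised-learning
# view with 'x' as input, 'y' as target and 'w' as the example weight:
#
#   sup = supervised_learning_dataset(ds2, ['x'], ['y'], weight_field='w')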