Mercurial > pylearn
diff dataset.py @ 71:5b699b31770a
merge
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Fri, 02 May 2008 18:19:35 -0400 |
parents | dde1fb1b63ba |
children | 2b6656b2ef52 40476a7746e8 |
line wrap: on
line diff
--- a/dataset.py Fri May 02 18:19:15 2008 -0400 +++ b/dataset.py Fri May 02 18:19:35 2008 -0400 @@ -1,23 +1,39 @@ from lookup_list import LookupList Example = LookupList -import copy +from misc import unique_elements_list_intersection +from string import join +from sys import maxint +import numpy class AbstractFunction (Exception): """Derived class must override this function""" - +class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" + class DataSet(object): """A virtual base class for datasets. + A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction + with learning algorithms (for training and testing them): rows/records are called examples, and + columns/attributes are called fields. The field value for a particular example can be an arbitrary + python object, which depends on the particular dataset. + + We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method + should return sys.maxint). + A DataSet is a generator of iterators; these iterators can run through the - examples in a variety of ways. A DataSet need not necessarily have a finite + examples or the fields in a variety of ways. A DataSet need not necessarily have a finite or known length, so this class can be used to interface to a 'stream' which - feeds on-line learning. + feeds on-line learning (however, as noted below, some operations are not + feasible or not recommended on streams).
To iterate over examples, there are several possibilities: - - for example in dataset.zip([field1, field2,field3, ...]) - - for val1,val2,val3 in dataset.zip([field1, field2,field3]) - - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N) - - for example in dataset + * for example in dataset([field1, field2,field3, ...]): + * for val1,val2,val3 in dataset([field1, field2,field3]): + * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): + * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): + * for example in dataset: + print example['x'] + * for x,y,z in dataset: Each of these is documented below. All of these iterators are expected to provide, in addition to the usual 'next()' method, a 'next_index()' method which returns a non-negative integer pointing to the position of the next @@ -26,40 +42,122 @@ can wrap around the dataset in order to do multiple passes through it, in possibly irregular ways if the minibatch size is not a divisor of the dataset length. - + + To iterate over fields, one can do + * for field in dataset.fields(): + for field_value in field: # iterate over the values associated to that field for all the dataset examples + * for field in dataset(field1,field2,...).fields() to select a subset of fields + * for field in dataset.fields(field1,field2,...) to select a subset of fields + and each of these fields is iterable over the examples: + * for field_examples in dataset.fields(): + for example_value in field_examples: + ... + but when the dataset is a stream (unbounded length), it is not recommended to do + such things because the underlying dataset may refuse to access the different fields in + an unsynchronized way. Hence the fields() method is illegal for streams, by default. + The result of fields() is a DataSetFields object, which iterates over fields, + and whose elements are iterable over examples.
A DataSetFields object can + be turned back into a DataSet with its examples() method: + dataset2 = dataset1.fields().examples() + and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). + Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. - Note: The content of a field can be of any type. + Note: The content of a field can be of any type. Field values can also be 'missing' + (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array) + fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value. + What about non-numeric values? None. + + Dataset elements can be indexed and sub-datasets (with a subset + of examples) can be extracted. These operations are not supported + by default in the case of streams. + + * dataset[:n] returns a dataset with the n first examples. + + * dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s. + + * dataset[i] returns an Example. - Note: A dataset can recognize a potentially infinite number of field names (i.e. the field - values can be computed on-demand, when particular field names are used in one of the - iterators). + * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. + + * dataset[fieldname] an iterable over the values of the field fieldname across + the dataset (the iterable is obtained by default by calling valuesVStack + over the values for individual examples). + + * dataset.<property> returns the value of a property associated with + the name <property>. 
The following properties should be supported: + - 'description': a textual description or name for the dataset + - 'fieldtypes': a list of types (one per field) + + Datasets can be concatenated either vertically (increasing the length) or + horizontally (augmenting the set of fields), if they are compatible, using + the following operations (with the same basic semantics as numpy.hstack + and numpy.vstack): - Datasets of finite length should be sub-classes of FiniteLengthDataSet. + * dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3]) + + creates a new dataset whose list of fields is the concatenation of the list of + fields of the argument datasets. This only works if they all have the same length. + + * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3]) + + creates a new dataset that concatenates the examples from the argument datasets + (and whose length is the sum of the length of the argument datasets). This only + works if they all have the same fields. + + According to the same logic, and viewing a DataSetFields object associated to + a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of + a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their + examples. - Datasets whose elements can be indexed and whose sub-datasets (with a subset - of examples) can be extracted should be sub-classes of - SliceableDataSet. - - Datasets with a finite number of fields should be sub-classes of - FiniteWidthDataSet. + A dataset can hold arbitrary key-value pairs that may be used to access meta-data + or other properties of the dataset or associated with the dataset or the result + of a computation stored in a dataset. These can be accessed through the [key] syntax + when key is a string (or more specifically, neither an integer, a slice, nor a list). 
+ + A DataSet sub-class should always redefine the following methods: + * __len__ if it is not a stream + * fieldNames + * minibatches_nowrap (called by DataSet.minibatches()) + * valuesHStack + * valuesVStack + For efficiency of implementation, a sub-class might also want to redefine + * hasFields + * __getitem__ may not be feasible with some streams + * __iter__ """ - def __init__(self): - pass + def __init__(self,description=None,fieldtypes=None): + if description is None: + # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)" + description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" + self.description=description + self.fieldtypes=fieldtypes - class Iterator(LookupList): - def __init__(self, ll): - LookupList.__init__(self, ll.keys(), ll.values()) - self.ll = ll + class MinibatchToSingleExampleIterator(object): + """ + Converts the result of minibatch iterator with minibatch_size==1 into + single-example values in the result. Therefore the result of + iterating on the dataset itself gives a sequence of single examples + (whereas the result of iterating over minibatches gives in each + Example field an iterable object over the individual examples in + the minibatch). + """ + def __init__(self, minibatch_iterator): + self.minibatch_iterator = minibatch_iterator + self.minibatch = None def __iter__(self): #makes for loop work return self def next(self): - self.ll.next() - self._values = [v[0] for v in self.ll._values] - return self + size1_minibatch = self.minibatch_iterator.next() + if not self.minibatch: + self.minibatch = Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()]) + else: + self.minibatch._values = [value[0] for value in size1_minibatch.values()] + return self.minibatch + def next_index(self): - return self.ll.next_index() + return self.minibatch_iterator.next_index() def __iter__(self): """Supports the syntax "for i in dataset: ..." 
@@ -70,38 +168,78 @@ i["fieldname"] or i[3] (in the order defined by the elements of the Example returned by this iterator), but the derived class is free to accept any type of identifier, and add extra functionality to the iterator. - """ - return DataSet.Iterator(self.minibatches(None, minibatch_size = 1)) - def zip(self, *fieldnames): + The default implementation calls the minibatches iterator and extracts the first example of each field. """ - Supports two forms of syntax: + return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) - for i in dataset.zip([f1, f2, f3]): ... - - for i1, i2, i3 in dataset.zip([f1, f2, f3]): ... - Using the first syntax, "i" will be an indexable object, such as a list, - tuple, or Example instance, such that on every iteration, i[0] is the f1 - field of the current example, i[1] is the f2 field, and so on. + class MinibatchWrapAroundIterator(object): + """ + An iterator for minibatches that handles the case where we need to wrap around the + dataset because n_batches*minibatch_size > len(dataset). It is constructed from + a dataset that provides a minibatch iterator that does not need to handle that problem. + This class is a utility for dataset subclass writers, so that they do not have to handle + this issue multiple times, nor check that fieldnames are valid, nor handle the + empty fieldnames (meaning 'use all the fields').
+ """ + def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): + self.dataset=dataset + self.fieldnames=fieldnames + self.minibatch_size=minibatch_size + self.n_batches=n_batches + self.n_batches_done=0 + self.next_row=offset + self.L=len(dataset) + assert offset+minibatch_size<=self.L + ds_nbatches = (self.L-offset)/minibatch_size + if n_batches is not None: + ds_nbatches = max(n_batches,ds_nbatches) + if fieldnames: + assert dataset.hasFields(*fieldnames) + else: + fieldnames=dataset.fieldNames() + self.iterator = dataset.minibatches_nowrap(fieldnames,minibatch_size,ds_nbatches,offset) - Using the second syntax, i1, i2, i3 will contain the the contents of the - f1, f2, and f3 fields of a single example on each loop iteration. + def __iter__(self): + return self + + def next_index(self): + return self.next_row - The derived class may accept fieldname arguments of any type. + def next(self): + if self.n_batches and self.n_batches_done==self.n_batches: + raise StopIteration + upper = self.next_row+self.minibatch_size + if upper <=self.L: + minibatch = self.iterator.next() + else: + if not self.n_batches: + raise StopIteration + # we must concatenate (vstack) the bottom and top parts of our minibatch + # first get the beginning of our minibatch (top of dataset) + first_part = self.dataset.minibatches_nowrap(fieldnames,self.L-self.next_row,1,self.next_row).next() + second_part = self.dataset.minibatches_nowrap(fieldnames,upper-self.L,1,0).next() + minibatch = Example(self.fieldnames, + [self.dataset.valuesVStack(name,[first_part[name],second_part[name]]) + for name in self.fieldnames]) + self.next_row=upper + self.n_batches_done+=1 + if upper >= self.L and self.n_batches: + self.next_row -= self.L + return minibatch - """ - return DataSet.Iterator(self.minibatches(fieldnames, minibatch_size = 1)) minibatches_fieldnames = None minibatches_minibatch_size = 1 minibatches_n_batches = None def minibatches(self, - fieldnames = minibatches_fieldnames, - 
minibatch_size = minibatches_minibatch_size, - n_batches = minibatches_n_batches): + fieldnames = minibatches_fieldnames, + minibatch_size = minibatches_minibatch_size, + n_batches = minibatches_n_batches, + offset = 0): """ - Supports three forms of syntax: + Return an iterator that supports three forms of syntax: for i in dataset.minibatches(None,**kwargs): ... @@ -116,12 +254,16 @@ a list-like container of the f2 field, etc. Using the first syntax, all the fields will be returned in "i". - Beware that some datasets may not support this syntax, if the number - of fields is infinite (i.e. field values may be computed "on demand"). - Using the third syntax, i1, i2, i3 will be list-like containers of the f1, f2, and f3 fields of a batch of examples on each loop iteration. + The minibatches iterator is expected to return upon each call to next() + a DataSetFields object, which is a LookupList (indexed by the field names) whose + elements are iterable over the minibatch examples, and which keeps a pointer to + a sub-dataset that can be used to iterate over the individual examples + in the minibatch. Hence a minibatch can be converted back to a regular + dataset or its fields can be looked at individually (and possibly iterated over). + PARAMETERS - fieldnames (list of any type, default None): The loop variables i1, i2, i3 (in the example above) should contain the @@ -137,140 +279,74 @@ the derived class can choose a default. If (-1), then the returned iterator should support looping indefinitely. + - offset (integer, default 0) + The iterator will start at example 'offset' in the dataset, rather than the default. + Note: A list-like container is something like a tuple, list, numpy.ndarray or any other object that supports integer indexing and slicing. 
""" + return DataSet.MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset) + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + """ + This is the minibatches iterator generator that sub-classes must define. + It does not need to worry about wrapping around multiple times across the dataset, + as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called. + The next() method of the returned iterator does not even need to worry about + the termination condition (as StopIteration will be raised by DataSet.minibatches + before an improper call to minibatches_nowrap's next() is made). + That next() method can assert that its next row will always be within [0,len(dataset)). + The iterator returned by minibatches_nowrap does not need to implement + a next_index() method either, as this will be provided by MinibatchWrapAroundIterator. + """ raise AbstractFunction() + def __len__(self): + """ + len(dataset) returns the number of examples in the dataset. + By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). + Sub-classes which implement finite-length datasets should redefine this method. + Some methods only make sense for finite-length datasets. + """ + return sys.maxint + + def is_unbounded(self): + """ + Tests whether a dataset is unbounded (e.g. a stream). + """ + return len(self)==sys.maxint + def hasFields(self,*fieldnames): """ Return true if the given field name (or field names, if multiple arguments are given) is recognized by the DataSet (i.e. can be used as a field name in one of the iterators). - """ - raise AbstractFunction() - - def merge_fields(self,*specifications): + The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames() + method. Many datasets may store their field names in a dictionary, which would allow more efficiency. 
""" - Return a new dataset that maps old fields (of self) to new fields (of the returned - dataset). The minimal syntax that should be supported is the following: - new_field_specifications = [new_field_spec1, new_field_spec2, ...] - new_field_spec = ([old_field1, old_field2, ...], new_field) - In general both old_field and new_field should be strings, but some datasets may also - support additional indexing schemes within each field (e.g. column slice - of a matrix-like field). + return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0 + + def fieldNames(self): + """ + Return the list of field names that are supported by the iterators, + and for which hasFields(fieldname) would return True. """ raise AbstractFunction() - def merge_field_values(self,*field_value_pairs): - """ - Return the value that corresponds to merging the values of several fields, - given as arguments (field_name, field_value) pairs with self.hasField(field_name). - This may be used by implementations of merge_fields. - Raise a ValueError if the operation is not possible. - """ - fieldnames,fieldvalues = zip(*field_value_pairs) - raise ValueError("Unable to merge values of these fields:"+repr(fieldnames)) - - def examples2minibatch(self,examples): - """ - Combine a list of Examples into a minibatch. A minibatch is an Example whose fields - are iterable over the examples of the minibatch. - """ - raise AbstractFunction() - - def rename(self,rename_dict): - """ - Changes a dataset into one that renames fields, using a dictionnary that maps old field - names to new field names. The only fields visible by the returned dataset are those - whose names are keys of the rename_dict. 
+ def __call__(self,*fieldnames): """ - self_class = self.__class__ - class SelfRenamingDataSet(RenamingDataSet,self_class): - pass - self.__class__ = SelfRenamingDataSet - # set the rename_dict and src fields - SelfRenamingDataSet.__init__(self,self,rename_dict) - return self - - def apply_function(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True): + Return a dataset that sees only the fields whose name are specified. """ - Changes a dataset into one that contains as fields the results of applying - the given function (example-wise) to the specified input_fields. The - function should return a sequence whose elements will be stored in - fields whose names are given in the output_fields list. If copy_inputs - is True then the resulting dataset will also contain the fields of self. - If accept_minibatches, then the function may be called - with minibatches as arguments (what is returned by the minibatches - iterator). In any case, the computations may be delayed until the examples - of the resulting dataset are requested. If cache is True, then - once the output fields for some examples have been computed, then - are cached (to avoid recomputation if the same examples are again - requested). - """ - self_class = self.__class__ - class SelfApplyFunctionDataSet(ApplyFunctionDataSet,self_class): - pass - self.__class__ = SelfApplyFunctionDataSet - # set the required additional fields - ApplyFunctionDataSet.__init__(self,self,function, input_fields, output_fields, copy_inputs, accept_minibatches, cache) - return self + assert self.hasFields(*fieldnames) + return self.fields(*fieldnames).examples() - -class FiniteLengthDataSet(DataSet): - """ - Virtual interface for datasets that have a finite length (number of examples), - and thus recognize a len(dataset) call. 
- """ - def __init__(self): - DataSet.__init__(self) - - def __len__(self): - """len(dataset) returns the number of examples in the dataset.""" - raise AbstractFunction() - - def __call__(self,fieldname_or_fieldnames): - """ - Extract one or more fields. This may be an expensive operation when the - dataset is large. It is not the recommanded way to access individual values - (use the iterators instead). If the argument is a string fieldname, then the result - is a sequence (iterable object) of values for that field, for the whole dataset. If the - argument is a list of field names, then the result is a 'batch', i.e., an Example with keys - corresponding to the given field names and values being iterable objects over the - individual example values. + def fields(self,*fieldnames): """ - if type(fieldname_or_fieldnames) is string: - minibatch = self.minibatches([fieldname_or_fieldnames],len(self)).next() - return minibatch[fieldname_or_fieldnames] - return self.minibatches(fieldname_or_fieldnames,len(self)).next() - -class SliceableDataSet(DataSet): - """ - Virtual interface, a subclass of DataSet for datasets which are sliceable - and whose individual elements can be accessed, generally respecting the - python semantics for [spec], where spec is either a non-negative integer - (for selecting one example), a python slice(start,stop,step) for selecting a regular - sub-dataset comprising examples start,start+step,start+2*step,...,n (with n<stop), or a - sequence (e.g. a list) of integers [i1,i2,...,in] for selecting - an arbitrary subset of examples. This is useful for obtaining - sub-datasets, e.g. for splitting a dataset into training and test sets. - """ - def __init__(self): - DataSet.__init__(self) - - def minibatches(self, - fieldnames = DataSet.minibatches_fieldnames, - minibatch_size = DataSet.minibatches_minibatch_size, - n_batches = DataSet.minibatches_n_batches): + Return a DataSetFields object associated with this dataset. 
""" - If the n_batches is empty, we want to see all the examples possible - for the given minibatch_size (possibly missing a few at the end of the dataset). - """ - # substitute the defaults: - if n_batches is None: n_batches = len(self) / minibatch_size - return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) + return DataSetFields(self,*fieldnames) def __getitem__(self,i): """ @@ -278,405 +354,635 @@ dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. - """ - raise AbstractFunction() - - def __getslice__(self,*slice_args): - """ - dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. - dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. - """ - raise AbstractFunction() - - -class FiniteWidthDataSet(DataSet): - """ - Virtual interface for datasets that have a finite width (number of fields), - and thus return a list of fieldNames. - """ - def __init__(self): - DataSet.__init__(self) - - def hasFields(self,*fields): - has_fields=True - fieldnames = self.fieldNames() - for name in fields: - if name not in fieldnames: - has_fields=False - return has_fields - - def fieldNames(self): - """Return the list of field names that are supported by the iterators, - and for which hasFields(fieldname) would return True.""" - raise AbstractFunction() - - -class RenamingDataSet(FiniteWidthDataSet): - """A DataSet that wraps another one, and makes it look like the field names - are different + dataset['key'] returns a property associated with the given 'key' string. + If 'key' is a fieldname, then the VStacked field values (iterable over + field values) for that field is returned. Other keys may be supported + by different dataset subclasses. 
The following key names are encouraged: + - 'description': a textual description or name for the dataset + - '<fieldname>.type': a type name or value for a given <fieldname> - Renaming is done by a dictionary that maps new names to the old ones used in - self.src. - """ - def __init__(self, src, rename_dct): - DataSet.__init__(self) - self.src = src - self.rename_dct = copy.copy(rename_dct) - - def fieldNames(self): - return self.rename_dct.keys() - - def minibatches(self, - fieldnames = DataSet.minibatches_fieldnames, - minibatch_size = DataSet.minibatches_minibatch_size, - n_batches = DataSet.minibatches_n_batches): - dct = self.rename_dct - new_fieldnames = [dct.get(f, f) for f in fieldnames] - return self.src.minibatches(new_fieldnames, minibatches_size, n_batches) - - -# we may want ArrayDataSet defined in another python file - -import numpy - -def as_array_dataset(dataset): - # Generally datasets can be efficient by making data fields overlap, but - # this function doesn't know which fields overlap. So, it should check if - # dataset supports an as_array_dataset member function, and return that if - # possible. - if hasattr(dataset, 'as_array_dataset'): - return dataset.as_array_dataset() - - raise NotImplementedError - - # Make ONE big minibatch with all the examples, to separate the fields. - n_examples = len(dataset) - batch = dataset.minibatches( minibatch_size = len(dataset)).next() + Note that some stream datasets may be unable to implement random access, i.e. + arbitrary slicing/indexing + because they can only iterate through examples one or a minibatch at a time + and do not actually store or keep past (or future) examples. 
- # Each field of the underlying dataset must be convertible to a numpy array of the same type - # currently just double, but should use the smallest compatible dtype - n_fields = len(batch) - fieldnames = batch.fields.keys() - total_width = 0 - type = None - fields = LookupList() - for i in xrange(n_fields): - field = array(batch[i]) - assert field.shape[0]==n_examples - width = field.shape[1] - start=total_width - total_width += width - fields[fieldnames[i]]=slice(start,total_width,1) - # many complicated things remain to be done: - # - find common dtype - # - decide what to do with extra dimensions if not the same in all fields - # - try to see if we can avoid the copy? - -class ArrayDataSet(FiniteLengthDataSet,FiniteWidthDataSet,SliceableDataSet): - """ - An ArrayDataSet behaves like a numpy array but adds the notion of named fields - from DataSet (and the ability to view the values of multiple fields as an 'Example'). - It is a fixed-length and fixed-width dataset - in which each element is a fixed dimension numpy array or a number, hence the whole - dataset corresponds to a numpy array. Fields - must correspond to a slice of array columns or to a list of column numbers. - If the dataset has fields, - each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. - Any dataset can also be converted to a numpy array (losing the notion of fields - by the numpy.array(dataset) call. 
- """ - - class Iterator(LookupList): - """An iterator over a finite dataset that implements wrap-around""" - def __init__(self, dataset, fieldnames, minibatch_size, next_max): - if fieldnames is None: fieldnames = dataset.fieldNames() - LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) - self.dataset=dataset - self.minibatch_size=minibatch_size - self.next_count = 0 - self.next_max = next_max - self.current = -self.minibatch_size - assert minibatch_size > 0 - if minibatch_size >= len(dataset): - raise NotImplementedError() + The default implementation of getitem uses the minibatches iterator + to obtain one example, one slice, or a list of examples. It may not + always be the most efficient way to obtain the result, especially if + the data are actually stored in a memory array. + """ + # check for an index + if type(i) is int: + return DataSet.MinibatchToSingleExampleIterator( + self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next() + rows=None + # or a slice + if type(i) is slice: + if not i.start: i.start=0 + if not i.step: i.step=1 + if i.step is 1: + return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples() + rows = range(i.start,i.stop,i.step) + # or a list of indices + elif type(i) is list: + rows = i + if rows is not None: + examples = [self[row] for row in rows] + fields_values = zip(*examples) + return MinibatchDataSet( + Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) + for fieldname,field_values + in zip(self.fieldNames(),fields_values)])) + # else check for a fieldname + if self.hasFields(i): + return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0] + # else we are trying to access a property of the dataset + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] - def __iter__(self): #makes for loop work - return self - - @staticmethod - def matcat(a, b): - a0, a1 = 
a.shape - b0, b1 = b.shape - assert a1 == b1 - assert a.dtype is b.dtype - rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype) - rval[:a0,:] = a - rval[a0:,:] = b - return rval - - def next_index(self): - n_rows = self.dataset.data.shape[0] - next_i = self.current+self.minibatch_size - if next_i >= n_rows: - next_i -= n_rows - return next_i - - def next(self): - - #check for end-of-loop - self.next_count += 1 - if self.next_count == self.next_max: - raise StopIteration - - #determine the first and last elements of the minibatch slice we'll return - n_rows = self.dataset.data.shape[0] - self.current = self.next_index() - upper = self.current + self.minibatch_size - - data = self.dataset.data - - if upper <= n_rows: - #this is the easy case, we only need once slice - dataview = data[self.current:upper] - else: - # the minibatch wraps around the end of the dataset - dataview = data[self.current:] - upper -= n_rows - assert upper > 0 - dataview = self.matcat(dataview, data[:upper]) - - self._values = [dataview[:, self.dataset.fields[f]]\ - for f in self._names] - return self + def valuesHStack(self,fieldnames,fieldvalues): + """ + Return a value that corresponds to concatenating (horizontally) several field values. + This can be useful to merge some fields. The implementation of this operation is likely + to involve a copy of the original values. When the values are numpy arrays, the + result should be numpy.hstack(values). If it makes sense, this operation should + work as well when each value corresponds to multiple examples in a minibatch + e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix, + then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values). + The default is to use numpy.hstack for numpy.ndarray values, and a list + pointing to the original values for other data types. 
+ """ + all_numpy=True + for value in fieldvalues: + if not type(value) is numpy.ndarray: + all_numpy=False + if all_numpy: + return numpy.hstack(fieldvalues) + # the default implementation of horizontal stacking is to put values in a list + return fieldvalues - def __init__(self, data, fields=None): + def valuesVStack(self,fieldname,values): + """ + Return a value that corresponds to concatenating (vertically) several values of the + same field. This can be important to build a minibatch out of individual examples. This + is likely to involve a copy of the original values. When the values are numpy arrays, the + result should be numpy.vstack(values). + The default is to use numpy.vstack for numpy.ndarray values, and a list + pointing to the original values for other data types. + """ + all_numpy=True + for value in values: + if not type(value) is numpy.ndarray: + all_numpy=False + if all_numpy: + return numpy.vstack(values) + # the default implementation of vertical stacking is to put values in a list + return values + + def __or__(self,other): + """ + dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of + fields of the argument datasets. This only works if they all have the same length. + """ + return HStackedDataSet(self,other) + + def __and__(self,other): + """ + dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets + (and whose length is the sum of the length of the argument datasets). This only + works if they all have the same fields. """ - There are two ways to construct an ArrayDataSet: (1) from an - existing dataset (which may result in a copy of the data in a numpy array), - or (2) from a numpy.array (the data argument), along with an optional description - of the fields (a LookupList of column slices (or column lists) indexed by field names). 
- """ - self.data=data - self.fields=fields - rows, cols = data.shape + return VStackedDataSet(self,other) + +def hstack(datasets): + """ + hstack(dataset1,dataset2,...) returns dataset1 | datataset2 | ... + which is a dataset whose fields list is the concatenation of the fields + of the individual datasets. + """ + assert len(datasets)>0 + if len(datasets)==1: + return datasets[0] + return HStackedDataSet(datasets) + +def vstack(datasets): + """ + vstack(dataset1,dataset2,...) returns dataset1 & datataset2 & ... + which is a dataset which iterates first over the examples of dataset1, then + over those of dataset2, etc. + """ + assert len(datasets)>0 + if len(datasets)==1: + return datasets[0] + return VStackedDataSet(datasets) + +class FieldsSubsetDataSet(DataSet): + """ + A sub-class of DataSet that selects a subset of the fields. + """ + def __init__(self,src,fieldnames): + self.src=src + self.fieldnames=fieldnames + assert src.hasFields(*fieldnames) + self.valuesHStack = src.valuesHStack + self.valuesVStack = src.valuesVStack + + def __len__(self): return len(self.src) + + def fieldNames(self): + return self.fieldnames + + def __iter__(self): + class FieldsSubsetIterator(object): + def __init__(self,ds): + self.ds=ds + self.src_iter=ds.src.__iter__() + self.example=None + def __iter__(self): return self + def next(self): + complete_example = self.src_iter.next() + if self.example: + self.example._values=[complete_example[field] + for field in self.ds.fieldnames] + else: + self.example=Example(self.ds.fieldnames, + [complete_example[field] for field in self.ds.fieldnames]) + return self.example + return FieldsSubsetIterator(self) - if fields: - for fieldname,fieldslice in fields.items(): - assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__") - if hasattr(fieldslice,"__iter__"): # is a sequence - for i in fieldslice: - assert type(i) is int - elif isinstance(fieldslice,slice): - # make sure fieldslice.start and 
fieldslice.step are defined - start=fieldslice.start - step=fieldslice.step - if not start: - start=0 - if not step: - step=1 - if not fieldslice.start or not fieldslice.step: - fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) - # and coherent with the data array - assert fieldslice.start >= 0 and fieldslice.stop <= cols + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + assert self.hasFields(*fieldnames) + return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) + def __getitem__(self,i): + return FieldsSubsetDataSet(self.src[i],self.fieldnames) + + +class DataSetFields(LookupList): + """ + Although a DataSet iterates over examples (like rows of a matrix), an associated + DataSetFields iterates over fields (like columns of a matrix), and can be understood + as a transpose of the associated dataset. + + To iterate over fields, one can do + * for fields in dataset.fields() + * for fields in dataset(field1,field2,...).fields() to select a subset of fields + * for fields in dataset.fields(field1,field2,...) to select a subset of fields + and each of these fields is iterable over the examples: + * for field_examples in dataset.fields(): + for example_value in field_examples: + ... + but when the dataset is a stream (unbounded length), it is not recommanded to do + such things because the underlying dataset may refuse to access the different fields in + an unsynchronized ways. Hence the fields() method is illegal for streams, by default. + The result of fields() is a DataSetFields object, which iterates over fields, + and whose elements are iterable over examples. A DataSetFields object can + be turned back into a DataSet with its examples() method: + dataset2 = dataset1.fields().examples() + and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). + + DataSetFields can be concatenated vertically or horizontally. 
To be consistent with + the syntax used for DataSets, the | concatenates the fields and the & concatenates + the examples. + """ + def __init__(self,dataset,*fieldnames): + original_dataset=dataset + if not fieldnames: + fieldnames=dataset.fieldNames() + elif not fieldnames==dataset.fieldNames(): + dataset = FieldsSubsetDataSet(dataset,fieldnames) + assert dataset.hasFields(*fieldnames) + self.dataset=dataset - def minibatches(self, - fieldnames = DataSet.minibatches_fieldnames, - minibatch_size = DataSet.minibatches_minibatch_size, - n_batches = DataSet.minibatches_n_batches): + if isinstance(dataset,MinibatchDataSet): + LookupList.__init__(self,fieldnames,list(dataset._fields)) + elif isinstance(original_dataset,MinibatchDataSet): + LookupList.__init__(self,fieldnames, + [original_dataset._fields[field] + for field in fieldnames]) + else: + minibatch_iterator = dataset.minibatches(fieldnames, + minibatch_size=len(dataset), + n_batches=1) + minibatch=minibatch_iterator.next() + LookupList.__init__(self,fieldnames,minibatch) + + def examples(self): + return self.dataset + + def __or__(self,other): """ - If the fieldnames list is None, it means that we want to see ALL the fields. + fields1 | fields2 is a DataSetFields that whose list of examples is the concatenation + of the list of examples of DataSetFields fields1 and fields2. + """ + return (self.examples() + other.examples()).fields() - If the n_batches is None, we want to see all the examples possible - for the given minibatch_size (possibly missing some near the end). + def __and__(self,other): + """ + fields1 + fields2 is a DataSetFields that whose list of fields is the concatenation + of the fields of DataSetFields fields1 and fields2. 
""" - # substitute the defaults: - if n_batches is None: n_batches = len(self) / minibatch_size - return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) + return (self.examples() | other.examples()).fields() - def fieldNames(self): - """Return the list of field names that are supported by getattr and hasField.""" - return self.fields.keys() + +class MinibatchDataSet(DataSet): + """ + Turn a LookupList of same-length fields into an example-iterable dataset. + Each element of the lookup-list should be an iterable and sliceable, all of the same length. + """ + def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, + values_hstack=DataSet().valuesHStack): + """ + The user can (and generally should) also provide values_vstack(fieldname,fieldvalues) + and a values_hstack(fieldnames,fieldvalues) functions behaving with the same + semantics as the DataSet methods of the same name (but without the self argument). + """ + self._fields=fields_lookuplist + assert len(fields_lookuplist)>0 + self.length=len(fields_lookuplist[0]) + for field in fields_lookuplist[1:]: + assert self.length==len(field) + self.values_vstack=values_vstack + self.values_hstack=values_hstack def __len__(self): - """len(dataset) returns the number of examples in the dataset.""" - return len(self.data) - + return self.length + def __getitem__(self,i): - """ - dataset[i] returns the (i+1)-th Example of the dataset. - If there are no fields the result is just a numpy array (for the i-th row of the dataset data matrix). - dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. - dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. - dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. 
- """ - if self.fields: - fieldnames,fieldslices=zip(*self.fields.items()) - return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) - else: - return self.data[i] + if type(i) in (int,slice,list): + return DataSetFields(MinibatchDataSet( + Example(self._fields.keys(),[field[i] for field in self._fields])),self._fields) + if self.hasFields(i): + return self._fields[i] + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] + + def fieldNames(self): + return self._fields.keys() + + def hasFields(self,*fieldnames): + for fieldname in fieldnames: + if fieldname not in self._fields.keys(): + return False + return True - def __getslice__(self,*args): - """ - dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. - dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. - """ - return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class Iterator(object): + def __init__(self,ds): + self.ds=ds + self.next_example=offset + assert minibatch_size > 0 + if offset+minibatch_size > ds.length: + raise NotImplementedError() + def __iter__(self): + return self + def next(self): + upper = self.next_example+minibatch_size + assert upper<=self.ds.length + minibatch = Example(self.ds._fields.keys(), + [field[self.next_example:upper] + for field in self.ds._fields]) + self.next_example+=minibatch_size + return DataSetFields(MinibatchDataSet(minibatch),*fieldnames) - def indices_of_unique_columns_used(self): - """ - Return the unique indices of the columns actually used by the fields, and a boolean - that signals (if True) that used columns overlap. If they do then the - indices are not repeated in the result. 
- """ - columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) - overlapping_columns = False - for field_slice in self.fields.values(): - if sum(columns_used[field_slice])>0: overlapping_columns=True - columns_used[field_slice]=True - return [i for i,used in enumerate(columns_used) if used],overlapping_columns + return Iterator(self) - def slice_of_unique_columns_used(self): - """ - Return None if the indices_of_unique_columns_used do not form a slice. If they do, - return that slice. It means that the columns used can be extracted - from the data array without making a copy. If the fields overlap - but their unique columns used form a slice, still return that slice. - """ - columns_used,overlapping_columns = self.indices_of_columns_used() - mappable_to_one_slice = True - if not overlapping_fields: - start=0 - while start<len(columns_used) and not columns_used[start]: - start+=1 - stop=len(columns_used) - while stop>0 and not columns_used[stop-1]: - stop-=1 - step=0 - i=start - while i<stop: - j=i+1 - while j<stop and not columns_used[j]: - j+=1 - if step: - if step!=j-i: - mappable_to_one_slice = False - break - else: - step = j-i - i=j - return slice(start,stop,step) + def valuesVStack(self,fieldname,fieldvalues): + return self.values_vstack(fieldname,fieldvalues) + + def valuesHStack(self,fieldnames,fieldvalues): + return self.values_hstack(fieldnames,fieldvalues) -class ApplyFunctionDataSet(FiniteWidthDataSet): +class HStackedDataSet(DataSet): + """ + A DataSet that wraps several datasets and shows a view that includes all their fields, + i.e. whose list of fields is the concatenation of their lists of fields. + + If a field name is found in more than one of the datasets, then either an error is + raised or the fields are renamed (either by prefixing the __name__ attribute + of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). + + TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... 
""" - A dataset that contains as fields the results of applying - a given function (example-wise) to specified input_fields of a source - dataset. The function should return a sequence whose elements will be stored in - fields whose names are given in the output_fields list. If copy_inputs - is True then the resulting dataset will also contain the fields of the source. - dataset. If accept_minibatches, then the function expects - minibatches as arguments (what is returned by the minibatches - iterator). In any case, the computations may be delayed until the examples - of self are requested. If cache is True, then - once the output fields for some examples have been computed, then - are cached (to avoid recomputation if the same examples are again requested). - """ - def __init__(src,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True, compute_now=False): - DataSet.__init__(self) - self.src=src - self.function=function - assert src.hasFields(input_fields) - self.input_fields=input_fields - self.output_fields=output_fields - assert not (copy_inputs and compute_now and not hasattr(src,'fieldNames')) - self.copy_inputs=copy_inputs - self.accept_minibatches=accept_minibatches - self.cache=cache - self.compute_now=compute_now - if compute_now: - assert hasattr(src,'__len__') and len(src)>=0 - fieldnames = output_fields - if copy_inputs: fieldnames = src.fieldNames() + output_fields - if accept_minibatches: - # make a single minibatch with all the inputs - inputs = src.minibatches(input_fields,len(src)).next() - # and apply the function to it, and transpose into a list of examples (field values, actually) - self.cached_examples = zip(*Example(output_fields,function(*inputs))) + def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): + DataSet.__init__(self,description,field_types) + self.datasets=datasets + self.accept_nonunique_names=accept_nonunique_names + self.fieldname2dataset={} + + def 
rename_field(fieldname,dataset,i): + if hasattr(dataset,"__name__"): + return dataset.__name__ + "." + fieldname + return fieldname+"."+str(i) + + # make sure all datasets have the same length and unique field names + self.length=None + names_to_change=[] + for i in xrange(len(datasets)): + dataset = datasets[i] + length=len(dataset) + if self.length: + assert self.length==length else: - # compute a list with one tuple per example, with the function outputs - self.cached_examples = [ function(input) for input in src.zip(input_fields) ] - elif cache: - # maybe a fixed-size array kind of structure would be more efficient than a list - # in the case where src is FiniteDataSet. -YB - self.cached_examples = [] + self.length=length + for fieldname in dataset.fieldNames(): + if fieldname in self.fieldname2dataset: # name conflict! + if accept_nonunique_names: + fieldname=rename_field(fieldname,dataset,i) + names2change.append((fieldname,i)) + else: + raise ValueError("Incompatible datasets: non-unique field name = "+fieldname) + self.fieldname2dataset[fieldname]=i + for fieldname,i in names_to_change: + del self.fieldname2dataset[fieldname] + self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i + + def hasFields(self,*fieldnames): + for fieldname in fieldnames: + if not fieldname in self.fieldname2dataset: + return False + return True def fieldNames(self): - if self.copy_inputs: - return self.output_fields + self.src.fieldNames() - return self.output_fields + return self.fieldname2dataset.keys() + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + + class HStackedIterator(object): + def __init__(self,hsds,iterators): + self.hsds=hsds + self.iterators=iterators + def __iter__(self): + return self + def next(self): + # concatenate all the fields of the minibatches + minibatch = reduce(LookupList.__add__,[iterator.next() for iterator in self.iterators]) + # and return a DataSetFields whose dataset is the transpose (=examples()) of 
this minibatch + return DataSetFields(MinibatchDataSet(minibatch,self.hsds.valuesVStack, + self.hsds.valuesHStack), + fieldnames if fieldnames else hsds.fieldNames()) + + assert self.hasfields(fieldnames) + # find out which underlying datasets are necessary to service the required fields + # and construct corresponding minibatch iterators + if fieldnames: + datasets=set([]) + fields_in_dataset=dict([(dataset,[]) for dataset in datasets]) + for fieldname in fieldnames: + dataset=self.datasets[self.fieldnames2dataset[fieldname]] + datasets.add(dataset) + fields_in_dataset[dataset].append(fieldname) + datasets=list(datasets) + iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset) + for dataset in datasets] + else: + datasets=self.datasets + iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets] + return HStackedIterator(self,iterators) + + + def valuesVStack(self,fieldname,fieldvalues): + return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) - def minibatches(self, - fieldnames = DataSet.minibatches_fieldnames, - minibatch_size = DataSet.minibatches_minibatch_size, - n_batches = DataSet.minibatches_n_batches): - - class Iterator(LookupList): + def valuesHStack(self,fieldnames,fieldvalues): + """ + We will use the sub-dataset associated with the first fieldname in the fieldnames list + to do the work, hoping that it can cope with the other values (i.e. won't care + about the incompatible fieldnames). Hence this heuristic will always work if + all the fieldnames are of the same sub-dataset. + """ + return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) + +class VStackedDataSet(DataSet): + """ + A DataSet that wraps several datasets and shows a view that includes all their examples, + in the order provided. 
This clearly assumes that they all have the same field names + and all (except possibly the last one) are of finite length. - def __init__(self,dataset): - if fieldnames is None: - assert hasattr(dataset,"fieldNames") - fieldnames = dataset.fieldNames() - self.example_index=0 - LookupList.__init__(self, fieldnames, [0]*len(fieldnames)) - self.dataset=dataset - self.src_iterator=self.src.minibatches(list(set.union(set(fieldnames),set(dataset.input_fields))), - minibatch_size,n_batches) - self.fieldnames_not_in_input = [] - if self.copy_inputs: - self.fieldnames_not_in_input = filter(lambda x: not x in dataset.input_fields, fieldnames) - + TODO: automatically detect a chain of stacked datasets due to A + B + C + D ... + """ + def __init__(self,datasets): + self.datasets=datasets + self.length=0 + self.index2dataset={} + assert len(datasets)>0 + fieldnames = datasets[-1].fieldNames() + self.datasets_start_row=[] + # We use this map from row index to dataset index for constant-time random access of examples, + # to avoid having to search for the appropriate dataset each time and slice is asked for. + for dataset,k in enumerate(datasets[0:-1]): + assert dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length). 
+ L=len(dataset) + for i in xrange(L): + self.index2dataset[self.length+i]=k + self.datasets_start_row.append(self.length) + self.length+=L + assert dataset.fieldNames()==fieldnames + self.datasets_start_row.append(self.length) + self.length+=len(datasets[-1]) + # If length is very large, we should use a more memory-efficient mechanism + # that does not store all indices + if self.length>1000000: + # 1 million entries would require about 60 meg for the index2dataset map + # TODO + print "A more efficient mechanism for index2dataset should be implemented" + + def __len__(self): + return self.length + + def fieldNames(self): + return self.datasets[0].fieldNames() + + def hasFields(self,*fieldnames): + return self.datasets[0].hasFields(*fieldnames) + + def locate_row(self,row): + """Return (dataset_index, row_within_dataset) for global row number""" + dataset_index = self.index2dataset[row] + row_within_dataset = self.datasets_start_row[dataset_index] + return dataset_index, row_within_dataset + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + + class VStackedIterator(object): + def __init__(self,vsds): + self.vsds=vsds + self.next_row=offset + self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset) + self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[0],offset,n_batches) + + def next_iterator(self,dataset,starting_offset,batches_left): + L=len(dataset) + ds_nbatches = (L-starting_offset)/minibatch_size + if batches_left is not None: + ds_nbatches = max(batches_left,ds_nbatches) + if minibatch_size>L: + ds_minibatch_size=L + n_left_in_mb=minibatch_size-L + ds_nbatches=1 + else: + n_left_in_mb=0 + return dataset.minibatches(fieldnames,minibatch_size,ds_nbatches,starting_offset), \ + L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb + + def move_to_next_dataset(self): + if self.n_left_at_the_end_of_ds>0: + 
self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[self.next_dataset_index], + self.n_left_at_the_end_of_ds,1) + else: + self.next_dataset_index +=1 + if self.next_dataset_index==len(self.vsds.datasets): + self.next_dataset_index = 0 + self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ + self.next_iterator(vsds.datasets[self.next_dataset_index],starting_offset,n_batches) + def __iter__(self): return self - def next_index(self): - return self.src_iterator.next_index() - def next(self): - example_index = self.src_iterator.next_index() - src_examples = self.src_iterator.next() - if self.dataset.copy_inputs: - function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields] - else: - function_inputs = src_examples - if self.dataset.cached_examples: - cache_len=len(self.cached_examples) - if example_index<cache_len+minibatch_size: - outputs_list = self.cached_examples[example_index:example_index+minibatch_size] - # convert the minibatch list of examples - # into a list of fields each of which iterate over the minibatch - outputs = zip(*outputs_list) - else: - outputs = self.dataset.function(*function_inputs) - if self.dataset.cache: - # convert the list of fields, each of which can iterate over the minibatch - # into a list of examples in the minibatch (each of which is a list of field values) - outputs_list = zip(*outputs) - # copy the outputs_list into the cache - for i in xrange(cache_len,example_index): - self.cached_examples.append(None) - self.cached_examples += outputs_list - else: - outputs = self.dataset.function(*function_inputs) - - return Example(self.fieldnames_not_in_input+self.dataset.output_fields, - [src_examples[field_name] for field_name in self.fieldnames_not_in_input]+outputs) - + dataset=self.vsds.datasets[self.next_dataset_index] + mb = self.next_iterator.next() + if self.n_left_in_mb: + extra_mb = [] + while self.n_left_in_mb>0: + 
self.move_to_next_dataset() + extra_mb.append(self.next_iterator.next()) + examples = Example(names, + [dataset.valuesVStack(name, + [mb[name]]+[b[name] for b in extra_mb]) + for name in fieldnames]) + mb = DataSetFields(MinibatchDataSet(examples),fieldnames) + + self.next_row+=minibatch_size + self.next_dataset_row+=minibatch_size + if self.next_row+minibatch_size>len(dataset): + self.move_to_next_dataset() + return examples + return VStackedIterator(self) + +class ArrayFieldsDataSet(DataSet): + """ + Virtual super-class of datasets whose field values are numpy array, + thus defining valuesHStack and valuesVStack for sub-classes. + """ + def __init__(self,description=None,field_types=None): + DataSet.__init__(self,description,field_types) + def valuesHStack(self,fieldnames,fieldvalues): + """Concatenate field values horizontally, e.g. two vectors + become a longer vector, two matrices become a wider matrix, etc.""" + return numpy.hstack(fieldvalues) + def valuesVStack(self,fieldname,values): + """Concatenate field values vertically, e.g. two vectors + become a two-row matrix, two matrices become a longer matrix, etc.""" + return numpy.vstack(values) + +class ArrayDataSet(ArrayFieldsDataSet): + """ + An ArrayDataSet stores the fields as groups of columns in a numpy tensor, + whose first axis iterates over examples, second axis determines fields. + If the underlying array is N-dimensional (has N axes), then the field + values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). + """ + + def __init__(self, data_array, fields_columns): + """ + Construct an ArrayDataSet from the underlying numpy array (data) and + a map (fields_columns) from fieldnames to field columns. The columns of a field are specified + using the standard arguments for indexing/slicing: integer for a column index, + slice for an interval of columns (with possible stride), or iterable of column indices. 
+ """ + self.data=data_array + self.fields_columns=fields_columns + + # check consistency and complete slices definitions + for fieldname, fieldcolumns in self.fields_columns.items(): + if type(fieldcolumns) is int: + assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1] + elif type(fieldcolumns) is slice: + start,step=None,None + if not fieldcolumns.start: + start=0 + if not fieldcolumns.step: + step=1 + if start or step: + self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step) + elif hasattr(fieldcolumns,"__iter__"): # something like a list + for i in fieldcolumns: + assert i>=0 and i<data_array.shape[1] - for fieldname in fieldnames: - assert fieldname in self.output_fields or self.src.hasFields(fieldname) - return Iterator(self) + def fieldNames(self): + return self.fields_columns.keys() + + def __len__(self): + return len(self.data) - + def __getitem__(self,i): + """More efficient implementation than the default __getitem__""" + fieldnames=self.fields_columns.keys() + if type(i) is int: + return Example(fieldnames, + [self.data[i,self.fields_columns[f]] for f in fieldnames]) + if type(i) in (slice,list): + return MinibatchDataSet(Example(fieldnames, + [self.data[i,self.fields_columns[f]] for f in fieldnames])) + # else check for a fieldname + if self.hasFields(i): + return Example([i],[self.data[self.fields_columns[i],:]]) + # else we are trying to access a property of the dataset + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] + + + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): + class ArrayDataSetIterator(object): + def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): + if fieldnames is None: fieldnames = dataset.fieldNames() + # store the resulting minibatch in a lookup-list of values + self.minibatch = LookupList(fieldnames,[0]*len(fieldnames)) + self.dataset=dataset + self.minibatch_size=minibatch_size + assert offset>=0 and 
offset<len(dataset.data) + assert offset+minibatch_size<=len(dataset.data) + self.current=offset + def __iter__(self): + return self + def next(self): + sub_data = self.dataset.data[self.current:self.current+self.minibatch_size] + self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names] + self.current+=self.minibatch_size + return self.minibatch + + return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) + + +class CachedDataSet(DataSet): + """ + Wrap a dataset whose values are computationally expensive to obtain + (e.g. because they involve some computation, or disk access), + so that repeated accesses to the same example are done cheaply, + by caching every example value that has been accessed at least once. + + Optionally, for finite-length dataset, all the values can be computed + (and cached) upon construction of the CachedDataSet, rather at the + first access. + """ + +class ApplyFunctionDataSet(DataSet): + """ + A dataset that contains as fields the results of applying a given function + example-wise or minibatch-wise to all the fields of an input dataset. + The output of the function should be an iterable (e.g. a list or a LookupList) + over the resulting values. In minibatch mode, the function is expected + to work on minibatches (takes a minibatch in input and returns a minibatch + in output). + + The function is applied each time an example or a minibatch is accessed. + To avoid re-doing computation, wrap this dataset inside a CachedDataSet. + """ + + def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): """ Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the @@ -685,7 +991,7 @@ """ args = ((input_fields,'input'),(output_fields,'target')) if weight_field: args+=(([weight_field],'weight')) - return src_dataset.rename(*args) + return src_dataset.merge_fields(*args)