# HG changeset patch # User Yoshua Bengio # Date 1209505516 14400 # Node ID 1aabd2e2bb5f18e3786e10daf8e4505f93c194f0 # Parent 1729ad44f1758766a73aae841b9bad2be5a59d5b Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet diff -r 1729ad44f175 -r 1aabd2e2bb5f dataset.py --- a/dataset.py Tue Apr 29 16:09:17 2008 -0400 +++ b/dataset.py Tue Apr 29 17:45:16 2008 -0400 @@ -80,12 +80,14 @@ * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. - * dataset['key'] returns a property associated with the given 'key' string. - If 'key' is a fieldname, then the VStacked field values (iterable over - field values) for that field is returned. Other keys may be supported - by different dataset subclasses. The following key names are should be supported: + * dataset[fieldname] returns an iterable over the values of the field fieldname across + the dataset (the iterable is obtained by default by calling valuesVStack + over the values for individual examples). + + * dataset.<property> returns the value of a property associated with + the name <property>. 
The following properties should be supported: - 'description': a textual description or name for the dataset - - '.type': a type name or value for a given + - 'fieldtypes': a list of types (one per field) Datasets can be concatenated either vertically (increasing the length) or horizontally (augmenting the set of fields), if they are compatible, using @@ -125,12 +127,12 @@ * __iter__ """ - def __init__(self,description=None,field_types=None): + def __init__(self,description=None,fieldtypes=None): if description is None: # by default return "(,,...)" description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" self.description=description - self.field_types=field_types + self.fieldtypes=fieldtypes class MinibatchToSingleExampleIterator(object): """ @@ -944,7 +946,34 @@ return self.minibatch return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - + + +class CachedDataSet(DataSet): + """ + Wrap a dataset whose values are computationally expensive to obtain + (e.g. because they involve some computation, or disk access), + so that repeated accesses to the same example are done cheaply, + by caching every example value that has been accessed at least once. + + Optionally, for finite-length datasets, all the values can be computed + (and cached) upon construction of the CachedDataSet, rather than at the + first access. + """ + +class ApplyFunctionDataSet(DataSet): + """ + A dataset that contains as fields the results of applying a given function + example-wise or minibatch-wise to all the fields of an input dataset. + The output of the function should be an iterable (e.g. a list or a LookupList) + over the resulting values. In minibatch mode, the function is expected + to work on minibatches (takes a minibatch as input and returns a minibatch + as output). + + The function is applied each time an example or a minibatch is accessed. + To avoid re-doing computation, wrap this dataset inside a CachedDataSet. 
+ """ + + def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): """ Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the