Mercurial > pylearn
diff dataset.py @ 57:1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
author | Yoshua Bengio <bengioy@iro.umontreal.ca> |
---|---|
date | Tue, 29 Apr 2008 17:45:16 -0400 |
parents | 1729ad44f175 |
children | 9165d86855ab |
line wrap: on
line diff
--- a/dataset.py Tue Apr 29 16:09:17 2008 -0400 +++ b/dataset.py Tue Apr 29 17:45:16 2008 -0400 @@ -80,12 +80,14 @@ * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. - * dataset['key'] returns a property associated with the given 'key' string. - If 'key' is a fieldname, then the VStacked field values (iterable over - field values) for that field is returned. Other keys may be supported - by different dataset subclasses. The following key names are should be supported: + * dataset[fieldname] an iterable over the values of the field fieldname across + the dataset (the iterable is obtained by default by calling valuesVStack + over the values for individual examples). + + * dataset.<property> returns the value of a property associated with + the name <property>. The following properties should be supported: - 'description': a textual description or name for the dataset - - '<fieldname>.type': a type name or value for a given <fieldname> + - 'fieldtypes': a list of types (one per field) Datasets can be concatenated either vertically (increasing the length) or horizontally (augmenting the set of fields), if they are compatible, using @@ -125,12 +127,12 @@ * __iter__ """ - def __init__(self,description=None,field_types=None): + def __init__(self,description=None,fieldtypes=None): if description is None: # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)" description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" self.description=description - self.field_types=field_types + self.fieldtypes=field_types class MinibatchToSingleExampleIterator(object): """ @@ -944,7 +946,34 @@ return self.minibatch return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - + + +class CachedDataSet(DataSet): + """ + Wrap a dataset whose values are computationally expensive to obtain + (e.g. 
because they involve some computation, or disk access), + so that repeated accesses to the same example are done cheaply, + by caching every example value that has been accessed at least once. + + Optionally, for a finite-length dataset, all the values can be computed + (and cached) upon construction of the CachedDataSet, rather than at the + first access. + """ + +class ApplyFunctionDataSet(DataSet): + """ + A dataset that contains as fields the results of applying a given function + example-wise or minibatch-wise to all the fields of an input dataset. + The output of the function should be an iterable (e.g. a list or a LookupList) + over the resulting values. In minibatch mode, the function is expected + to work on minibatches (takes a minibatch as input and returns a minibatch + as output). + + The function is applied each time an example or a minibatch is accessed. + To avoid re-doing computation, wrap this dataset inside a CachedDataSet. + """ + + def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): """ Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the