pylearn: dataset.py comparison

comparison dataset.py @ 167:4803cb76e26b

Updated documentation

author	Joseph Turian <turian@gmail.com>
date	Mon, 12 May 2008 18:51:42 -0400
parents	ee11ed427ba8
children	895b4b60f5e8

comparison

equal deleted inserted replaced

-:ee11ed427ba8
+:4803cb76e26b
 return datasets[0]
 return VStackedDataSet(datasets)
 class FieldsSubsetDataSet(DataSet):
 """
-A sub-class of DataSet that selects a subset of the fields.
+A sub-class of L{DataSet} that selects a subset of the fields.
 """
 def __init__(self,src,fieldnames):
 self.src=src
 self.fieldnames=fieldnames
 assert src.hasFields(*fieldnames)
 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
 class DataSetFields(LookupList):
 """
-Although a DataSet iterates over examples (like rows of a matrix), an associated
+Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
 DataSetFields iterates over fields (like columns of a matrix), and can be understood
 as a transpose of the associated dataset.
 To iterate over fields, one can do
 * for fields in dataset.fields()
 return (self.examples() | other.examples()).fields()
 class MinibatchDataSet(DataSet):
 """
-Turn a LookupList of same-length (iterable) fields into an example-iterable dataset.
+Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
 Each element of the lookup-list should be iterable and sliceable, and all elements should have the same length.
 """
 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
 values_hstack=DataSet().valuesHStack):
 """
 def valuesHStack(self,fieldnames,fieldvalues):
 return self.values_hstack(fieldnames,fieldvalues)
 class HStackedDataSet(DataSet):
 """
-A DataSet that wraps several datasets and shows a view that includes all their fields,
+A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
 i.e. whose list of fields is the concatenation of their lists of fields.
 If a field name is found in more than one of the datasets, then either an error is
 raised or the fields are renamed (either by prefixing the __name__ attribute
 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
-TODO: automatically detect a chain of stacked datasets due to A | B | C | D ...
+@todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
 """
 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
 DataSet.__init__(self,description,field_types)
 self.datasets=datasets
 self.accept_nonunique_names=accept_nonunique_names
 """
 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)
 class VStackedDataSet(DataSet):
 """
-A DataSet that wraps several datasets and shows a view that includes all their examples,
+A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
 in the order provided. This clearly assumes that they all have the same field names
 and all (except possibly the last one) are of finite length.
-TODO: automatically detect a chain of stacked datasets due to A + B + C + D ...
+@todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
 """
 def __init__(self,datasets):
 self.datasets=datasets
 self.length=0
 self.index2dataset={}
 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
 class CachedDataSet(DataSet):
 """
-Wrap a dataset whose values are computationally expensive to obtain
+Wrap a L{DataSet} whose values are computationally expensive to obtain
 (e.g. because they involve some computation, or disk access),
 so that repeated accesses to the same example are done cheaply,
 by caching every example value that has been accessed at least once.
 Optionally, for a finite-length dataset, all the values can be computed
 (and cached) upon construction of the CachedDataSet, rather than at the
 first access.
-@todo when cache_all_upon_construction create mini-batches that are as
+@todo: when cache_all_upon_construction create mini-batches that are as
 large as possible but not so large as to fill up memory.
-@todo add disk-buffering capability, so that when the cache becomes too
+@todo: add disk-buffering capability, so that when the cache becomes too
 big for memory, we cache things on disk, trying to keep in memory only
 the record most likely to be accessed next.
 """
 def __init__(self,source_dataset,cache_all_upon_construction=False):
 self.source_dataset=source_dataset
 else:
 return DataSet.__getitem__(self,i)
 class ApplyFunctionDataSet(DataSet):
 """
-A dataset that contains as fields the results of applying a given function
+A L{DataSet} that contains as fields the results of applying a
-example-wise or minibatch-wise to all the fields of an input dataset.
+given function example-wise or minibatch-wise to all the fields of
-The output of the function should be an iterable (e.g. a list or a LookupList)
+an input dataset.  The output of the function should be an iterable
-over the resulting values.
+(e.g. a list or a LookupList) over the resulting values.
-In minibatch mode, the function is expected to work on minibatches (takes
+In minibatch mode, the function is expected to work on minibatches
-a minibatch in input and returns a minibatch in output). More precisely,
+(takes a minibatch in input and returns a minibatch in output). More
-it means that each element of the input or output list should be iterable
+precisely, it means that each element of the input or output list
-and indexable over the individual example values (typically these
+should be iterable and indexable over the individual example values
-elements will be numpy arrays). All of the elements in the input and
+(typically these elements will be numpy arrays). All of the elements
-output lists should have the same length, which is the length of the
+in the input and output lists should have the same length, which is
-minibatch.
+the length of the minibatch.
 The function is applied each time an example or a minibatch is accessed.
 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 If the values_{h,v}stack functions are not provided, then
 return ApplyFunctionSingleExampleIterator(self)
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
     Wraps an arbitrary L{DataSet} into one for supervised learning tasks
     by forcing the user to define a set of fields as the 'input' field
     and a set of fields as the 'target' field. Optionally, a single
     weight_field can also be defined.

     @param src_dataset: the dataset to wrap; its C{merge_fields} method is
         called with one C{(fields, name)} pair per merged field.
     @param input_fields: the source fields paired with the name 'input'.
     @param target_fields: the source fields paired with the name 'target'.
     @param weight_field: optional single field name; when given (and truthy)
         it is wrapped in a one-element list and paired with the name 'weight'.
     @return: whatever C{src_dataset.merge_fields} returns for those pairs.
     """
     # The previous body read an undefined name `output_fields` here, which
     # raised NameError on every call; the parameter is `target_fields`.
     args = ((input_fields,'input'),(target_fields,'target'))
     if weight_field:
         # The trailing comma makes this a one-element tuple holding a single
         # (fields, name) pair. Without it the parentheses are redundant and
         # the two items are appended to args separately.
         args += (([weight_field],'weight'),)
     return src_dataset.merge_fields(*args)

Mercurial > pylearn

comparison dataset.py @ 167:4803cb76e26b