Mercurial > pylearn
changeset 59:ac9aff8d5743
Automated merge with ssh://p-omega1@lgcm.iro.umontreal.ca/tlearn
author | Frederic Bastien <bastienf@iro.umontreal.ca> |
---|---|
date | Thu, 01 May 2008 16:19:31 -0400 |
parents | 1aabd2e2bb5f (diff) 17729d7104fa (current diff) |
children | 9165d86855ab |
files | |
diffstat | 1 files changed, 60 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/dataset.py Thu May 01 16:17:10 2008 -0400 +++ b/dataset.py Thu May 01 16:19:31 2008 -0400 @@ -80,12 +80,14 @@ * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in. - * dataset['key'] returns a property associated with the given 'key' string. - If 'key' is a fieldname, then the VStacked field values (iterable over - field values) for that field is returned. Other keys may be supported - by different dataset subclasses. The following key names are should be supported: + * dataset[fieldname] an iterable over the values of the field fieldname across + the dataset (the iterable is obtained by default by calling valuesVStack + over the values for individual examples). + + * dataset.<property> returns the value of a property associated with + the name <property>. The following properties should be supported: - 'description': a textual description or name for the dataset - - '<fieldname>.type': a type name or value for a given <fieldname> + - 'fieldtypes': a list of types (one per field) Datasets can be concatenated either vertically (increasing the length) or horizontally (augmenting the set of fields), if they are compatible, using @@ -125,12 +127,12 @@ * __iter__ """ - def __init__(self,description=None,field_types=None): + def __init__(self,description=None,fieldtypes=None): if description is None: # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)" description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" self.description=description - self.field_types=field_types + self.fieldtypes=field_types class MinibatchToSingleExampleIterator(object): """ @@ -603,6 +605,7 @@ Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields) if self.hasFields(i): return self.fields[i] + assert i in self.__dict__ # else it means we are trying to access a non-existing property return self.__dict__[i] def fieldNames(self): @@ -874,13 +877,13 @@ values are (N-2)-dimensional objects (i.e. 
ordinary numbers if N=2). """ - """ - Construct an ArrayDataSet from the underlying numpy array (data) and - a map (fields_columns) from fieldnames to field columns. The columns of a field are specified - using the standard arguments for indexing/slicing: integer for a column index, - slice for an interval of columns (with possible stride), or iterable of column indices. - """ def __init__(self, data_array, fields_columns): + """ + Construct an ArrayDataSet from the underlying numpy array (data) and + a map (fields_columns) from fieldnames to field columns. The columns of a field are specified + using the standard arguments for indexing/slicing: integer for a column index, + slice for an interval of columns (with possible stride), or iterable of column indices. + """ self.data=data_array self.fields_columns=fields_columns @@ -906,8 +909,22 @@ def __len__(self): return len(self.data) - #def __getitem__(self,i): - # """More efficient implementation than the default""" + def __getitem__(self,i): + """More efficient implementation than the default __getitem__""" + fieldnames=self.fields_columns.keys() + if type(i) is int: + return Example(fieldnames, + [self.data[i,self.fields_columns[f]] for f in fieldnames]) + if type(i) in (slice,list): + return MinibatchDataSet(Example(fieldnames, + [self.data[i,self.fields_columns[f]] for f in fieldnames])) + # else check for a fieldname + if self.hasFields(i): + return Example([i],[self.data[self.fields_columns[i],:]]) + # else we are trying to access a property of the dataset + assert i in self.__dict__ # else it means we are trying to access a non-existing property + return self.__dict__[i] + def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): class ArrayDataSetIterator(object): @@ -929,7 +946,34 @@ return self.minibatch return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - + + +class CachedDataSet(DataSet): + """ + Wrap a dataset whose values are computationally expensive to obtain 
+ (e.g. because they involve some computation, or disk access), + so that repeated accesses to the same example are done cheaply, + by caching every example value that has been accessed at least once. + + Optionally, for finite-length datasets, all the values can be computed + (and cached) upon construction of the CachedDataSet, rather than at the + first access. + """ + +class ApplyFunctionDataSet(DataSet): + """ + A dataset that contains as fields the results of applying a given function + example-wise or minibatch-wise to all the fields of an input dataset. + The output of the function should be an iterable (e.g. a list or a LookupList) + over the resulting values. In minibatch mode, the function is expected + to work on minibatches (takes a minibatch as input and returns a minibatch + as output). + + The function is applied each time an example or a minibatch is accessed. + To avoid re-doing computation, wrap this dataset inside a CachedDataSet. + """ + + def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): """ Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the