diff dataset.py @ 290:9b533cc7874a
trying to get default implementations to work
author    James Bergstra <bergstrj@iro.umontreal.ca>
date      Thu, 05 Jun 2008 18:38:42 -0400
parents   ed70580f2324
children  174374d59405
--- a/dataset.py	Thu Jun 05 14:16:46 2008 -0400
+++ b/dataset.py	Thu Jun 05 18:38:42 2008 -0400
@@ -1,6 +1,5 @@
-from lookup_list import LookupList
-Example = LookupList
+from lookup_list import LookupList as Example
 from misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
@@ -38,7 +37,6 @@
     else:
         return [self.__getattribute__(name) for name in attribute_names]
 
-
 class DataSet(AttributesHolder):
     """A virtual base class for datasets.
@@ -234,9 +232,8 @@
         self.n_batches=n_batches
         self.n_batches_done=0
         self.next_row=offset
-        self.offset=offset
         self.L=len(dataset)
-        assert offset+minibatch_size<=self.L
+        self.offset=offset % self.L
         ds_nbatches = (self.L-self.next_row)/self.minibatch_size
         if n_batches is not None:
             ds_nbatches = min(n_batches,ds_nbatches)
@@ -244,8 +241,7 @@
             assert dataset.hasFields(*fieldnames)
         else:
             self.fieldnames=dataset.fieldNames()
-        self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
-                                                        ds_nbatches,self.next_row)
+        self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
 
     def __iter__(self):
         return self
@@ -318,7 +314,7 @@
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         The minibatches iterator is expected to return upon each call to next()
-        a DataSetFields object, which is a LookupList (indexed by the field names) whose
+        a DataSetFields object, which is an Example (indexed by the field names) whose
         elements are iterable and indexable over the minibatch examples, and which keeps
         a pointer to a sub-dataset that can be used to iterate over the individual examples
         in the minibatch. Hence a minibatch can be converted back to a regular
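The iterator above now reduces self.offset modulo the dataset length, so a sweep may begin at any offset and wrap past the end rather than failing the removed assertion. A standalone sketch of the arithmetic (illustrative, not part of the patch):

    def wrapped_offset(offset, length):
        # an offset beyond the end starts the sweep at offset % length
        assert length > 0
        return offset % length

    print wrapped_offset(7, 5)   # -> 2 (wraps past the end)
    print wrapped_offset(3, 5)   # -> 3 (in-range offsets are unchanged)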
@@ -424,52 +420,70 @@
 
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset.
-        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
-        dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
-        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
-        dataset['key'] returns a property associated with the given 'key' string.
-        If 'key' is a fieldname, then the VStacked field values (iterable over
-        field values) for that field is returned. Other keys may be supported
-        by different dataset subclasses. The following key names are encouraged:
-          - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+        @rtype: Example
+        @returns: single or multiple examples
+
+        @type i: integer or slice or <iterable> of integers
+        @param i:
+            dataset[i] returns the (i+1)-th example of the dataset.
+            dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+            dataset[i:j:s] returns the subdataset with examples i,i+s,i+2s,...
+            dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
 
-        Note that some stream datasets may be unable to implement random access, i.e.
-        arbitrary slicing/indexing
-        because they can only iterate through examples one or a minibatch at a time
-        and do not actually store or keep past (or future) examples.
+        @note:
+        Some stream datasets may be unable to implement random access, i.e.
+        arbitrary slicing/indexing because they can only iterate through
+        examples one or a minibatch at a time and do not actually store or keep
+        past (or future) examples.
 
         The default implementation of getitem uses the minibatches iterator
         to obtain one example, one slice, or a list of examples. It may not
         always be the most efficient way to obtain the result, especially if
         the data are actually stored in a memory array.
         """
-        # check for an index
+
         if type(i) is int:
-            return DataSet.MinibatchToSingleExampleIterator(
-                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
-        rows=None
-        # or a slice
+            #TODO: consider asserting that i >= 0
+            i_batch = self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=i % len(self))
+            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
+
+        #if i is a contiguous slice
+        if type(i) is slice and (i.step in (None, 1)):
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=upper_bound - offset,
+                    n_batches=1,
+                    offset=offset).next())
+
+        # if slice has a step param, convert it to list and handle it with the
+        # list code
         if type(i) is slice:
-            #print 'i=',i
-            if not i.start: i=slice(0,i.stop,i.step)
-            if not i.stop: i=slice(i.start,len(self),i.step)
-            if not i.step: i=slice(i.start,i.stop,1)
-            if i.step is 1:
-                return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
-            rows = range(i.start,i.stop,i.step)
-        # or a list of indices
-        elif type(i) is list:
-            rows = i
-        if rows is not None:
-            examples = [self[row] for row in rows]
-            fields_values = zip(*examples)
-            return MinibatchDataSet(
-                Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
-                                            for fieldname,field_values
-                                            in zip(self.fieldNames(),fields_values)]),
-                self.valuesVStack,self.valuesHStack)
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            i = list(range(offset, upper_bound, i.step))
+
+        # handle tuples, arrays, lists
+        if hasattr(i, '__getitem__'):
+            for idx in i:
+                #dis-allow nested slices
+                if not isinstance(idx, int):
+                    raise TypeError(idx)
+            # call back into self.__getitem__
+            examples = [self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=ii%len(self)).next()
+                    for ii in i]
+            # re-index the fields in each example by field instead of by example
+            field_values = [[] for blah in self.fieldNames()]
+            for e in examples:
+                for f,v in zip(field_values, e):
+                    f.append(v)
+            #build them into a LookupList (a.k.a. Example)
+            zz = zip(self.fieldNames(),field_values)
+            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
+            example = Example(self.fieldNames(), vst)
+            return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
         raise TypeError(i, type(i))
 
     def valuesHStack(self,fieldnames,fieldvalues):
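With this rewrite, every indexing form is funneled through minibatches_nowrap: an int yields a single Example, a contiguous slice becomes one minibatch wrapped in a MinibatchDataSet, and a strided slice or index list is fetched one example at a time and re-stacked with valuesVStack. A rough usage sketch against this module's classes (the ArrayDataSet constructor arguments are an assumption inferred from the code further down, so treat them as illustrative):

    import numpy
    data = numpy.asarray([[1, 2], [3, 4], [5, 6], [7, 8]])
    # assumed signature: ArrayDataSet(data=..., fields_columns={name: columns})
    ds = ArrayDataSet(data=data, fields_columns={'x': slice(0, 1), 'y': slice(1, 2)})

    ex   = ds[1]        # one Example with fields 'x' and 'y'
    sub  = ds[0:2]      # MinibatchDataSet holding examples 0 and 1
    pick = ds[[0, 3]]   # MinibatchDataSet holding examples 0 and 3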
@@ -493,24 +507,21 @@
         # the default implementation of horizontal stacking is to put values in a list
         return fieldvalues
 
-
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: bits near the beginning or end of the dataset
+        @type values: list of minibatches (returned by minibatches_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for v in values:
+            rval.extend(v)
+        return rval
 
     def __or__(self,other):
         """
@@ -586,11 +597,11 @@
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         assert self.hasFields(*fieldnames)
         return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         return FieldsSubsetDataSet(self.src[i],self.fieldnames)
 
-class DataSetFields(LookupList):
+class DataSetFields(Example):
     """
     Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
@@ -628,9 +639,9 @@
         self.dataset=dataset
 
         if isinstance(dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,list(dataset._fields))
+            Example.__init__(self,fieldnames,list(dataset._fields))
         elif isinstance(original_dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,
+            Example.__init__(self,fieldnames,
                                 [original_dataset._fields[field]
                                  for field in fieldnames])
         else:
@@ -638,7 +649,7 @@
                                                     minibatch_size=len(dataset),
                                                     n_batches=1)
             minibatch=minibatch_iterator.next()
-            LookupList.__init__(self,fieldnames,minibatch)
+            Example.__init__(self,fieldnames,minibatch)
 
     def examples(self):
         return self.dataset
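The base-class valuesVStack is now type-agnostic: it simply chains the per-minibatch (or per-example) values into one flat list, leaving numpy-aware stacking to the array-based subclasses below. A standalone sketch of the new default behaviour:

    # each element of `values` is one minibatch worth of field values
    values = [[1, 2], [3], [4, 5]]
    rval = []
    for v in values:
        rval.extend(v)
    print rval   # -> [1, 2, 3, 4, 5]; numpy subclasses return numpy.vstack(values) instead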
""" def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, @@ -680,14 +691,15 @@ print 'len(field) = ', len(field) print 'self._fields.keys() = ', self._fields.keys() print 'field=',field + print 'fields_lookuplist=', fields_lookuplist assert self.length==len(field) - self.values_vstack=values_vstack - self.values_hstack=values_hstack + self.valuesVStack=values_vstack + self.valuesHStack=values_hstack def __len__(self): return self.length - def __getitem__(self,i): + def dontuse__getitem__(self,i): if type(i) in (slice,list): return DataSetFields(MinibatchDataSet( Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) @@ -717,7 +729,7 @@ self.ds=ds self.next_example=offset - assert minibatch_size > 0 + assert minibatch_size >= 0 if offset+minibatch_size > ds.length: raise NotImplementedError() def __iter__(self): @@ -741,12 +753,6 @@ # tbm: added fieldnames to handle subset of fieldnames return Iterator(self,fieldnames) - def valuesVStack(self,fieldname,fieldvalues): - return self.values_vstack(fieldname,fieldvalues) - - def valuesHStack(self,fieldnames,fieldvalues): - return self.values_hstack(fieldnames,fieldvalues) - class HStackedDataSet(DataSet): """ A L{DataSet} that wraps several datasets and shows a view that includes all their fields, @@ -810,7 +816,7 @@ return self def next(self): # concatenate all the fields of the minibatches - l=LookupList() + l=Example() for iter in self.iterators: l.append_lookuplist(iter.next()) return l @@ -834,10 +840,10 @@ return HStackedIterator(self,iterators) - def valuesVStack(self,fieldname,fieldvalues): + def untested_valuesVStack(self,fieldname,fieldvalues): return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) - def valuesHStack(self,fieldnames,fieldvalues): + def untested_valuesHStack(self,fieldnames,fieldvalues): """ We will use the sub-dataset associated with the first fieldname in the fieldnames list to do the work, hoping that it can cope with the other values (i.e. won't care @@ -961,11 +967,11 @@ """ def __init__(self,description=None,field_types=None): DataSet.__init__(self,description,field_types) - def valuesHStack(self,fieldnames,fieldvalues): + def untested_valuesHStack(self,fieldnames,fieldvalues): """Concatenate field values horizontally, e.g. two vectors become a longer vector, two matrices become a wider matrix, etc.""" return numpy.hstack(fieldvalues) - def valuesVStack(self,fieldname,values): + def untested_valuesVStack(self,fieldname,values): """Concatenate field values vertically, e.g. 
@@ -834,10 +840,10 @@
         return HStackedIterator(self,iterators)
 
-    def valuesVStack(self,fieldname,fieldvalues):
+    def untested_valuesVStack(self,fieldname,fieldvalues):
         return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)
 
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """
         We will use the sub-dataset associated with the first fieldname in the fieldnames list
         to do the work, hoping that it can cope with the other values (i.e. won't care
@@ -961,11 +967,11 @@
     """
     def __init__(self,description=None,field_types=None):
         DataSet.__init__(self,description,field_types)
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self,fieldname,values):
+    def untested_valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
         return numpy.vstack(values)
@@ -1019,7 +1025,7 @@
     def __len__(self):
         return len(self.data)
 
-    def __getitem__(self,key):
+    def dontuse__getitem__(self,key):
         """More efficient implementation than the default __getitem__"""
         fieldnames=self.fields_columns.keys()
         values=self.fields_columns.values()
@@ -1051,12 +1057,12 @@
             assert key in self.__dict__ # else it means we are trying to access a non-existing property
             return self.__dict__[key]
 
-    def __iter__(self):
+    def dontuse__iter__(self):
         class ArrayDataSetIteratorIter(object):
             def __init__(self,dataset,fieldnames):
                 if fieldnames is None: fieldnames = dataset.fieldNames()
                 # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
+                self.minibatch = Example(fieldnames,[0]*len(fieldnames))
                 self.dataset=dataset
                 self.current=0
                 self.columns = [self.dataset.fields_columns[f]
@@ -1078,26 +1084,17 @@
         return ArrayDataSetIteratorIter(self,self.fieldNames())
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ArrayDataSetIterator(object):
-            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
-                if fieldnames is None: fieldnames = dataset.fieldNames()
-                # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
-                self.dataset=dataset
-                self.minibatch_size=minibatch_size
-                assert offset>=0 and offset<len(dataset.data)
-                assert offset+minibatch_size<=len(dataset.data)
-                self.current=offset
-            def __iter__(self):
-                return self
-            def next(self):
-                #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
-                sub_data = self.dataset.data[self.current:self.current+self.minibatch_size]
-                self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
-                self.current+=self.minibatch_size
-                return self.minibatch
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
+        for n in xrange(n_batches):
+            if offset == len(self):
+                break
+            sub_data = self.data[offset : offset+minibatch_size]
+            offset += len(sub_data) #can be less than minibatch_size at end
+            cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
+            yield cursor
 
-        return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
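The class-based iterator is replaced by a generator that reuses one cursor Example and simply stops early, allowing the final batch to be shorter than minibatch_size instead of asserting. Because the cursor is reused, callers that want to keep a batch across iterations must copy it. A standalone analogue of the control flow:

    def batches(data, minibatch_size, n_batches, offset):
        # yields up to n_batches slices; the last one may be short
        for n in xrange(n_batches):
            if offset == len(data):
                break
            sub = data[offset : offset + minibatch_size]
            offset += len(sub)   # may advance by less at the end
            yield sub

    print list(batches(range(5), 2, 3, 0))   # -> [[0, 1], [2, 3], [4]]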
 class CachedDataSet(DataSet):
@@ -1162,7 +1159,7 @@
             return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
         return CacheIterator(self)
 
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         if type(i)==int and len(self.cached_examples)>i:
             return self.cached_examples[i]
         else:
@@ -1175,7 +1172,7 @@
                 self.l = len(dataset)
                 self.current = 0
                 self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+                self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
             def __iter__(self): return self
             def next(self):
                 if self.current>=self.l:
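CachedDataSet composes naturally with the ApplyFunctionDataSet below: the function's outputs are computed on first access and replayed from the cache afterwards. A hedged composition sketch (constructor details are assumptions inferred from this file, so treat argument names as illustrative):

    # assumed: ArrayDataSet as sketched earlier; functions map minibatch
    # fields to tuples of output fields
    raw     = ArrayDataSet(data=data, fields_columns={'x': slice(0, 2)})
    doubled = ApplyFunctionDataSet(raw, lambda x: (2 * x,), ['2x'])
    cached  = CachedDataSet(doubled)   # first sweep computes, later sweeps reuse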
- """ - self.input_dataset=input_dataset - self.function=function - self.output_names=output_names - self.minibatch_mode=minibatch_mode - DataSet.__init__(self,description,fieldtypes) - self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack - self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - - def __len__(self): - return len(self.input_dataset) + TBM: are filedtypes the old field types (from input_dataset) or the new ones + (for the new dataset created)? + """ + self.input_dataset=input_dataset + self.function=function + self.output_names=output_names + self.minibatch_mode=minibatch_mode + DataSet.__init__(self,description,fieldtypes) + self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack + self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - def fieldNames(self): - return self.output_names + def __len__(self): + return len(self.input_dataset) - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class ApplyFunctionIterator(object): - def __init__(self,output_dataset): - self.input_dataset=output_dataset.input_dataset - self.output_dataset=output_dataset - self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size, - n_batches=n_batches,offset=offset).__iter__() + def fieldNames(self): + return self.output_names - def __iter__(self): return self + def minibatches_nowrap(self, fieldnames, *args, **kwargs): + for fields in self.input_dataset.minibatches_nowrap(fieldnames, *args, **kwargs): - def next(self): - function_inputs = self.input_iterator.next() - all_output_names = self.output_dataset.output_names - if self.output_dataset.minibatch_mode: - function_outputs = self.output_dataset.function(*function_inputs) - else: - input_examples = zip(*function_inputs) - output_examples = [self.output_dataset.function(*input_example) - for input_example in input_examples] - function_outputs = [self.output_dataset.valuesVStack(name,values) - for name,values in zip(all_output_names, - zip(*output_examples))] - all_outputs = Example(all_output_names,function_outputs) - if fieldnames==all_output_names: - return all_outputs - return Example(fieldnames,[all_outputs[name] for name in fieldnames]) - - - return ApplyFunctionIterator(self) + #function_inputs = self.input_iterator.next() + if self.minibatch_mode: + function_outputs = self.function(*fields) + else: + input_examples = zip(*fields) + output_examples = [self.function(*input_example) + for input_example in input_examples] + function_outputs = [self.valuesVStack(name,values) + for name,values in zip(self.output_names, + zip(*output_examples))] + all_outputs = Example(self.output_names, function_outputs) + print fields + print all_outputs + print '--------' + if fieldnames==self.output_names: + yield all_outputs + else: + yield Example(fieldnames,[all_outputs[name] for name in fieldnames]) - def __iter__(self): # only implemented for increased efficiency - class ApplyFunctionSingleExampleIterator(object): - def __init__(self,output_dataset): - self.current=0 - self.output_dataset=output_dataset - self.input_iterator=output_dataset.input_dataset.__iter__() - def __iter__(self): return self - def next(self): - if self.output_dataset.minibatch_mode: - function_inputs = [[input] for input in self.input_iterator.next()] - outputs = self.output_dataset.function(*function_inputs) - assert all([hasattr(output,'__iter__') for output in outputs]) - function_outputs = [output[0] for output in 
 
-        def __iter__(self): # only implemented for increased efficiency
-            class ApplyFunctionSingleExampleIterator(object):
-                def __init__(self,output_dataset):
-                    self.current=0
-                    self.output_dataset=output_dataset
-                    self.input_iterator=output_dataset.input_dataset.__iter__()
-                def __iter__(self): return self
-                def next(self):
-                    if self.output_dataset.minibatch_mode:
-                        function_inputs = [[input] for input in self.input_iterator.next()]
-                        outputs = self.output_dataset.function(*function_inputs)
-                        assert all([hasattr(output,'__iter__') for output in outputs])
-                        function_outputs = [output[0] for output in outputs]
-                    else:
-                        function_inputs = self.input_iterator.next()
-                        function_outputs = self.output_dataset.function(*function_inputs)
-                    return Example(self.output_dataset.output_names,function_outputs)
-            return ApplyFunctionSingleExampleIterator(self)
-
+    def untested__iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
        return ApplyFunctionSingleExampleIterator(self)
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """