changeset 294:f7924e13e426
Automated merge with ssh://projects@lgcm.iro.umontreal.ca/hg/pylearn
author   | Frederic Bastien <bastienf@iro.umontreal.ca>
date     | Fri, 06 Jun 2008 16:15:47 -0400
parents  | 4bfdda107a17 (diff) 3af204aa71e5 (current diff)
children | 7380376816e5 f5d33f9c0b9c
files    | _test_dataset.py dataset.py
diffstat | 2 files changed, 243 insertions(+), 237 deletions(-)
--- a/_test_dataset.py	Fri Jun 06 16:15:43 2008 -0400
+++ b/_test_dataset.py	Fri Jun 06 16:15:47 2008 -0400
@@ -47,6 +47,11 @@
     #not in doc!!!
     i=0
     for example in range(len(ds)):
+        wanted = array[example][:3]
+        returned = ds[example]['x']
+        if (wanted != returned).all():
+            print 'returned:', returned
+            print 'wanted:', wanted
         assert (ds[example]['x']==array[example][:3]).all()
         assert ds[example]['y']==array[example][3]
         assert (ds[example]['z']==array[example][[0,2]]).all()
@@ -226,8 +231,7 @@
     assert i==m.n_batches*m.minibatch_size
     del x,y,i,id
 
-    #@todo: we can't do minibatch bigger then the size of the dataset???
-    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
+    assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
     assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)
 
 def test_ds_iterator(array,iterator1,iterator2,iterator3):
@@ -388,7 +392,7 @@
         #don't test proterties
         a2 = numpy.random.rand(10,4)
         ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested
-        ds = ArrayDataSet(a2,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
         #assert ds==a? should this work?
 
         test_all(a2,ds)
@@ -397,7 +401,7 @@
 
     def test_CachedDataSet(self):
         a = numpy.random.rand(10,4)
-        ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
         ds2 = CachedDataSet(ds1)
         ds3 = CachedDataSet(ds1,cache_all_upon_construction=True)
 
@@ -413,7 +417,7 @@
     def test_ApplyFunctionDataSet(self):
         a = numpy.random.rand(10,4)
         a2 = a+1
-        ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+        ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
         ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False)
         ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1),
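Note on the test change above: the old assertion expected minibatches() to raise when minibatch_size exceeded the dataset length; the new dataset.py (below) wraps the offset modulo len(dataset) instead of asserting offset+minibatch_size<=len, so the oversized request must now succeed. A minimal sketch of what the updated assertion exercises, using the project's own ArrayDataSet and Example (the import path and the for-loop unpacking are assumptions based on the test file):

    import numpy
    from dataset import ArrayDataSet, Example  # assumed module layout

    a = numpy.random.rand(10, 4)
    ds = ArrayDataSet(a, Example(['x', 'y', 'z'], [slice(3), 3, [0, 2]]))
    # one minibatch larger than the dataset: must no longer raise
    for x, y in ds.minibatches(['x', 'y'], minibatch_size=len(a) + 1,
                               n_batches=1, offset=0):
        print len(x)  # batch may be clipped or wrapped by the iterator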
--- a/dataset.py	Fri Jun 06 16:15:43 2008 -0400
+++ b/dataset.py	Fri Jun 06 16:15:47 2008 -0400
@@ -1,6 +1,5 @@
-from lookup_list import LookupList
-Example = LookupList
+from lookup_list import LookupList as Example
 from misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
@@ -38,7 +37,6 @@
         else:
             return [self.__getattribute__(name) for name in attribute_names]
 
-
 class DataSet(AttributesHolder):
     """A virtual base class for datasets.
 
@@ -163,17 +161,55 @@
     numpy_vstack = lambda fieldname,values: numpy.vstack(values)
     numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
 
-    def __init__(self,description=None,fieldtypes=None):
-        if description is None:
-            # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
-            description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
-        self.description=description
-        self.fieldtypes=fieldtypes
+    def __init__(self, description=None, fieldnames=None, fieldtypes=None):
+        """
+        @type fieldnames: list of strings
+        @type fieldtypes: list of python types, same length as fieldnames
+        @type description: string
+        @param description: description/name for this dataset
+        """
+        def default_desc():
+            return type(self).__name__ \
+                    + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
+
+        #self.fieldnames = fieldnames
+
+        self.fieldtypes = fieldtypes if fieldtypes is not None \
+                else [None]*1 #len(fieldnames)
+
+        self.description = default_desc() if description is None \
+                else description
         self._attribute_names = ["description"]
-        if fieldtypes:
-            self._attribute_names.append("fieldtypes")
+
+    attributeNames = property(lambda self: copy.copy(self._attribute_names))
+
+    def __contains__(self, fieldname):
+        return (fieldname in self.fieldNames()) \
+                or (fieldname in self.attributeNames())
+
+    def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
 
-    def attributeNames(self): return self._attribute_names
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self. Every field of "i" will give access to
+        a field of a single example. Fields should be accessible via
+        i["fielname"] or i[3] (in the order defined by the elements of the
+        Example returned by this iterator), but the derived class is free
+        to accept any type of identifier, and add extra functionality to the iterator.
+
+        The default implementation calls the minibatches iterator and extracts the first example of each field.
+        """
+        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
+
+    def __len__(self):
+        """
+        len(dataset) returns the number of examples in the dataset.
+        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
+        Sub-classes which implement finite-length datasets should redefine this method.
+        Some methods only make sense for finite-length datasets.
+        """
+        return None
 
     class MinibatchToSingleExampleIterator(object):
         """
@@ -200,24 +236,6 @@
         def next_index(self):
             return self.minibatch_iterator.next_index()
 
-    def __iter__(self):
-        """Supports the syntax "for i in dataset: ..."
-
-        Using this syntax, "i" will be an Example instance (or equivalent) with
-        all the fields of DataSet self. Every field of "i" will give access to
-        a field of a single example. Fields should be accessible via
-        i["fielname"] or i[3] (in the order defined by the elements of the
-        Example returned by this iterator), but the derived class is free
-        to accept any type of identifier, and add extra functionality to the iterator.
-
-        The default implementation calls the minibatches iterator and extracts the first example of each field.
-        """
-        return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
-
-    def __contains__(self, fieldname):
-        return (fieldname in self.fieldNames()) \
-                or (fieldname in self.attributeNames())
-
     class MinibatchWrapAroundIterator(object):
         """
         An iterator for minibatches that handles the case where we need to wrap around the
@@ -234,9 +252,8 @@
             self.n_batches=n_batches
             self.n_batches_done=0
             self.next_row=offset
-            self.offset=offset
             self.L=len(dataset)
-            assert offset+minibatch_size<=self.L
+            self.offset=offset % self.L
             ds_nbatches = (self.L-self.next_row)/self.minibatch_size
             if n_batches is not None:
                 ds_nbatches = min(n_batches,ds_nbatches)
@@ -244,8 +261,7 @@
                 assert dataset.hasFields(*fieldnames)
             else:
                 self.fieldnames=dataset.fieldNames()
-            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
-                                                            ds_nbatches,self.next_row)
+            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
 
         def __iter__(self):
             return self
@@ -318,7 +334,7 @@
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         The minibatches iterator is expected to return upon each call to next()
-        a DataSetFields object, which is a LookupList (indexed by the field names) whose
+        a DataSetFields object, which is a Example (indexed by the field names) whose
         elements are iterable and indexable over the minibatch examples, and which
         keeps a pointer to a sub-dataset that can be used to iterate over the individual
         examples in the minibatch. Hence a minibatch can be converted back to a regular
@@ -362,15 +378,6 @@
         """
         raise AbstractFunction()
 
-    def __len__(self):
-        """
-        len(dataset) returns the number of examples in the dataset.
-        By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
-        Sub-classes which implement finite-length datasets should redefine this method.
-        Some methods only make sense for finite-length datasets.
-        """
-        return maxint
-
    def is_unbounded(self):
        """
        Tests whether a dataset is unbounded (e.g. a stream).
@@ -424,52 +431,70 @@
 
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset.
-        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
-        dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
-        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
-        dataset['key'] returns a property associated with the given 'key' string.
-        If 'key' is a fieldname, then the VStacked field values (iterable over
-        field values) for that field is returned. Other keys may be supported
-        by different dataset subclasses. The following key names are encouraged:
-          - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+        @rtype: Example
+        @returns: single or multiple examples
 
-        Note that some stream datasets may be unable to implement random access, i.e.
-        arbitrary slicing/indexing
-        because they can only iterate through examples one or a minibatch at a time
-        and do not actually store or keep past (or future) examples.
+        @type i: integer or slice or <iterable> of integers
+        @param i:
+            dataset[i] returns the (i+1)-th example of the dataset.
+            dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+            dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
+            dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
+
+        @note:
+        Some stream datasets may be unable to implement random access, i.e.
+        arbitrary slicing/indexing because they can only iterate through
+        examples one or a minibatch at a time and do not actually store or keep
+        past (or future) examples.
 
         The default implementation of getitem uses the minibatches iterator to obtain one
         example, one slice, or a list of examples. It may not always be the most efficient
         way to obtain the result, especially if the data are actually stored in a memory array.
         """
-        # check for an index
+
         if type(i) is int:
-            return DataSet.MinibatchToSingleExampleIterator(
-                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
-        rows=None
-        # or a slice
+            #TODO: consider asserting that i >= 0
+            i_batch = self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=i)
+            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
+
+        #if i is a contiguous slice
+        if type(i) is slice and (i.step in (None, 1)):
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=upper_bound - offset,
+                    n_batches=1,
+                    offset=offset).next())
+
+        # if slice has a step param, convert it to list and handle it with the
+        # list code
         if type(i) is slice:
-            #print 'i=',i
-            if not i.start: i=slice(0,i.stop,i.step)
-            if not i.stop: i=slice(i.start,len(self),i.step)
-            if not i.step: i=slice(i.start,i.stop,1)
-            if i.step is 1:
-                return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
-            rows = range(i.start,i.stop,i.step)
-        # or a list of indices
-        elif type(i) is list:
-            rows = i
-        if rows is not None:
-            examples = [self[row] for row in rows]
-            fields_values = zip(*examples)
-            return MinibatchDataSet(
-                Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
-                                            for fieldname,field_values
-                                            in zip(self.fieldNames(),fields_values)]),
-                self.valuesVStack,self.valuesHStack)
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            i = list(range(offset, upper_bound, i.step))
+
+        # handle tuples, arrays, lists
+        if hasattr(i, '__getitem__'):
+            for idx in i:
+                #dis-allow nested slices
+                if not isinstance(idx, int):
+                    raise TypeError(idx)
+            # call back into self.__getitem__
+            examples = [self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=ii).next()
+                    for ii in i]
+            # re-index the fields in each example by field instead of by example
+            field_values = [[] for blah in self.fieldNames()]
+            for e in examples:
+                for f,v in zip(field_values, e):
+                    f.append(v)
+            #build them into a LookupList (a.ka. Example)
+            zz = zip(self.fieldNames(),field_values)
+            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
+            example = Example(self.fieldNames(), vst)
+            return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
        raise TypeError(i, type(i))
 
     def valuesHStack(self,fieldnames,fieldvalues):
@@ -493,24 +518,21 @@
         # the default implementation of horizontal stacking is to put values in a list
         return fieldvalues
 
-
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: bits near the beginning or end of the dataset
+        @type values: list of minibatches (returned by minibatch_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for v in values:
+            rval.extend(v)
+        return rval
 
     def __or__(self,other):
         """
@@ -586,11 +608,11 @@
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         assert self.hasFields(*fieldnames)
         return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         return FieldsSubsetDataSet(self.src[i],self.fieldnames)
 
 
-class DataSetFields(LookupList):
+class DataSetFields(Example):
     """
     Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
@@ -628,9 +650,9 @@
 
         self.dataset=dataset
         if isinstance(dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,list(dataset._fields))
+            Example.__init__(self,fieldnames,list(dataset._fields))
         elif isinstance(original_dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,
+            Example.__init__(self,fieldnames,
                                 [original_dataset._fields[field]
                                  for field in fieldnames])
         else:
@@ -638,7 +660,7 @@
                                                   minibatch_size=len(dataset),
                                                   n_batches=1)
             minibatch=minibatch_iterator.next()
-            LookupList.__init__(self,fieldnames,minibatch)
+            Example.__init__(self,fieldnames,minibatch)
 
     def examples(self):
         return self.dataset
@@ -660,7 +682,7 @@
 
 class MinibatchDataSet(DataSet):
     """
-    Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
+    Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset.
     Each element of the lookup-list should be an iterable and sliceable, all of the same length.
     """
     def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
@@ -680,14 +702,15 @@
                 print 'len(field) = ', len(field)
                 print 'self._fields.keys() = ', self._fields.keys()
                 print 'field=',field
+                print 'fields_lookuplist=', fields_lookuplist
             assert self.length==len(field)
-        self.values_vstack=values_vstack
-        self.values_hstack=values_hstack
+        self.valuesVStack=values_vstack
+        self.valuesHStack=values_hstack
 
     def __len__(self):
         return self.length
 
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         if type(i) in (slice,list):
             return DataSetFields(MinibatchDataSet(
                 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
@@ -717,7 +740,7 @@
                 self.ds=ds
                 self.next_example=offset
-                assert minibatch_size > 0
+                assert minibatch_size >= 0
                 if offset+minibatch_size > ds.length:
                     raise NotImplementedError()
             def __iter__(self):
@@ -741,12 +764,6 @@
         # tbm: added fieldnames to handle subset of fieldnames
         return Iterator(self,fieldnames)
 
-    def valuesVStack(self,fieldname,fieldvalues):
-        return self.values_vstack(fieldname,fieldvalues)
-
-    def valuesHStack(self,fieldnames,fieldvalues):
-        return self.values_hstack(fieldnames,fieldvalues)
-
 class HStackedDataSet(DataSet):
     """
     A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
@@ -810,7 +827,7 @@
                 return self
             def next(self):
                 # concatenate all the fields of the minibatches
-                l=LookupList()
+                l=Example()
                 for iter in self.iterators:
                     l.append_lookuplist(iter.next())
                 return l
@@ -834,10 +851,10 @@
 
         return HStackedIterator(self,iterators)
 
-    def valuesVStack(self,fieldname,fieldvalues):
+    def untested_valuesVStack(self,fieldname,fieldvalues):
         return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)
 
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """
         We will use the sub-dataset associated with the first fieldname in the fieldnames
         list to do the work, hoping that it can cope with the other values (i.e. won't care
@@ -961,11 +978,11 @@
     """
     def __init__(self,description=None,field_types=None):
         DataSet.__init__(self,description,field_types)
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self,fieldname,values):
+    def untested_valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
         return numpy.vstack(values)
@@ -1019,7 +1036,7 @@
     def __len__(self):
         return len(self.data)
 
-    def __getitem__(self,key):
+    def dontuse__getitem__(self,key):
         """More efficient implementation than the default __getitem__"""
         fieldnames=self.fields_columns.keys()
         values=self.fields_columns.values()
@@ -1051,12 +1068,12 @@
             assert key in self.__dict__ # else it means we are trying to access a non-existing property
             return self.__dict__[key]
 
-    def __iter__(self):
+    def dontuse__iter__(self):
         class ArrayDataSetIteratorIter(object):
             def __init__(self,dataset,fieldnames):
                 if fieldnames is None: fieldnames = dataset.fieldNames()
                 # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
+                self.minibatch = Example(fieldnames,[0]*len(fieldnames))
                 self.dataset=dataset
                 self.current=0
                 self.columns = [self.dataset.fields_columns[f]
@@ -1078,26 +1095,17 @@
         return ArrayDataSetIteratorIter(self,self.fieldNames())
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ArrayDataSetIterator(object):
-            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
-                if fieldnames is None: fieldnames = dataset.fieldNames()
-                # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
-                self.dataset=dataset
-                self.minibatch_size=minibatch_size
-                assert offset>=0 and offset<len(dataset.data)
-                assert offset+minibatch_size<=len(dataset.data)
-                self.current=offset
-            def __iter__(self):
-                return self
-            def next(self):
-                #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
-                sub_data = self.dataset.data[self.current:self.current+self.minibatch_size]
-                self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
-                self.current+=self.minibatch_size
-                return self.minibatch
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
+        for n in xrange(n_batches):
+            if offset == len(self):
+                break
+            sub_data = self.data[offset : offset+minibatch_size]
+            offset += len(sub_data) #can be less than minibatch_size at end
+            cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
+            yield cursor
 
-        return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
 
 
 class CachedDataSet(DataSet):
@@ -1162,7 +1170,7 @@
                 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
         return CacheIterator(self)
 
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         if type(i)==int and len(self.cached_examples)>i:
             return self.cached_examples[i]
         else:
@@ -1175,7 +1183,7 @@
                 self.l = len(dataset)
                 self.current = 0
                 self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+                self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
             def __iter__(self): return self
             def next(self):
                 if self.current>=self.l:
@@ -1192,107 +1200,101 @@
         return CacheIteratorIter(self)
 
 class ApplyFunctionDataSet(DataSet):
-    """
-    A L{DataSet} that contains as fields the results of applying a
-    given function example-wise or minibatch-wise to all the fields of
-    an input dataset. The output of the function should be an iterable
-    (e.g. a list or a LookupList) over the resulting values.
-
-    The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset. The output of the function should be an iterable
+    (e.g. a list or a Example) over the resulting values.
+
+    The function take as input the fields of the dataset, not the examples.
 
-    In minibatch mode, the function is expected to work on minibatches
-    (takes a minibatch in input and returns a minibatch in output). More
-    precisely, it means that each element of the input or output list
-    should be iterable and indexable over the individual example values
-    (typically these elements will be numpy arrays). All of the elements
-    in the input and output lists should have the same length, which is
-    the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
-    The function is applied each time an example or a minibatch is accessed.
-    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
-    If the values_{h,v}stack functions are not provided, then
-    the input_dataset.values{H,V}Stack functions are used by default.
-    """
-    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-                 values_hstack=None,values_vstack=None,
-                 description=None,fieldtypes=None):
-        """
-        Constructor takes an input dataset that has as many fields as the function
-        expects as inputs. The resulting dataset has as many fields as the function
-        produces as outputs, and that should correspond to the number of output names
-        (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).
 
-        Note that the expected semantics of the function differs in minibatch mode
-        (it takes minibatches of inputs and produces minibatches of outputs, as
-        documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
 
-        TBM: are filedtypes the old field types (from input_dataset) or the new ones
-        (for the new dataset created)?
-        """
-        self.input_dataset=input_dataset
-        self.function=function
-        self.output_names=output_names
-        self.minibatch_mode=minibatch_mode
-        DataSet.__init__(self,description,fieldtypes)
-        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
-
-    def __len__(self):
-        return len(self.input_dataset)
+        TBM: are filedtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-    def fieldNames(self):
-        return self.output_names
+    def __len__(self):
+        return len(self.input_dataset)
 
-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ApplyFunctionIterator(object):
-            def __init__(self,output_dataset):
-                self.input_dataset=output_dataset.input_dataset
-                self.output_dataset=output_dataset
-                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                   n_batches=n_batches,offset=offset).__iter__()
+    def fieldNames(self):
+        return self.output_names
 
-            def __iter__(self): return self
+    def minibatches_nowrap(self, fieldnames, *args, **kwargs):
+        for input_fields in self.input_dataset.minibatches_nowrap(fieldnames, *args, **kwargs):
 
-            def next(self):
-                function_inputs = self.input_iterator.next()
-                all_output_names = self.output_dataset.output_names
-                if self.output_dataset.minibatch_mode:
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                else:
-                    input_examples = zip(*function_inputs)
-                    output_examples = [self.output_dataset.function(*input_example)
-                                       for input_example in input_examples]
-                    function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                        for name,values in zip(all_output_names,
-                                                               zip(*output_examples))]
-                all_outputs = Example(all_output_names,function_outputs)
-                if fieldnames==all_output_names:
-                    return all_outputs
-                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
-
-
-        return ApplyFunctionIterator(self)
+            #function_inputs = self.input_iterator.next()
+            if self.minibatch_mode:
+                function_outputs = self.function(*input_fields)
+            else:
+                input_examples = zip(*input_fields)
+                output_examples = [self.function(*input_example)
+                                   for input_example in input_examples]
+                function_outputs = [self.valuesVStack(name,values)
+                                    for name,values in zip(self.output_names,
+                                                           zip(*output_examples))]
+            all_outputs = Example(self.output_names, function_outputs)
+            print 'input_fields', input_fields
+            print 'all_outputs', all_outputs
+            if fieldnames==self.output_names:
+                rval = all_outputs
+            else:
+                rval = Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            print 'rval', rval
+            print '--------'
+            yield rval
 
-    def __iter__(self): # only implemented for increased efficiency
-        class ApplyFunctionSingleExampleIterator(object):
-            def __init__(self,output_dataset):
-                self.current=0
-                self.output_dataset=output_dataset
-                self.input_iterator=output_dataset.input_dataset.__iter__()
-            def __iter__(self): return self
-            def next(self):
-                if self.output_dataset.minibatch_mode:
-                    function_inputs = [[input] for input in self.input_iterator.next()]
-                    outputs = self.output_dataset.function(*function_inputs)
-                    assert all([hasattr(output,'__iter__') for output in outputs])
-                    function_outputs = [output[0] for output in outputs]
-                else:
-                    function_inputs = self.input_iterator.next()
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                return Example(self.output_dataset.output_names,function_outputs)
-        return ApplyFunctionSingleExampleIterator(self)
-
+    def untested__iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
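Note on the ArrayDataSet.minibatches_nowrap rewrite above: the inner iterator class is replaced by a generator that updates one mutable 'cursor' Example in place and re-yields it, so the last batch can be shorter than minibatch_size. A self-contained sketch of the same pattern (a plain dict stands in for Example; the function and variable names here are illustrative, not part of pylearn):

    import numpy

    def minibatches_nowrap(data, fields_columns, fieldnames,
                           minibatch_size, n_batches, offset):
        # one reusable batch object, updated in place on every iteration;
        # callers must copy it if they want to keep a batch around
        cursor = dict((f, None) for f in fieldnames)
        for n in xrange(n_batches):
            if offset == len(data):
                break
            sub_data = data[offset : offset + minibatch_size]
            offset += len(sub_data)  # the final batch may be shorter
            for f in fieldnames:
                cursor[f] = sub_data[:, fields_columns[f]]
            yield cursor

    data = numpy.random.rand(10, 4)
    cols = {'x': slice(3), 'y': 3, 'z': [0, 2]}
    for batch in minibatches_nowrap(data, cols, ['x', 'y'],
                                    minibatch_size=4, n_batches=3, offset=0):
        print batch['x'].shape, batch['y'].shape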