Mercurial > pylearn
changeset 266:6e69fb91f3c0
initial commit of amat
author   | James Bergstra <bergstrj@iro.umontreal.ca>
date     | Wed, 04 Jun 2008 17:49:09 -0400
parents  | 5614b186c5f4
children | 4dad41215967
files    | amat.py dataset.py test_dataset.py
diffstat | 3 files changed, 347 insertions(+), 205 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amat.py	Wed Jun 04 17:49:09 2008 -0400
@@ -0,0 +1,123 @@
+"""load PLearn AMat files"""
+
+import sys, numpy, array
+
+path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
+
+
+class AMat:
+    """DataSource to access a plearn amat file as a periodic unrandomized stream.
+
+    Attributes:
+
+    input -- minibatch of input
+    target -- minibatch of target
+    weight -- minibatch of weight
+    extra -- minitbatch of extra
+
+    all -- the entire data contents of the amat file
+    n_examples -- the number of training examples in the file
+
+    AMat stands for Ascii Matri[x,ces]
+
+    """
+
+    marker_size = '#size:'
+    marker_sizes = '#sizes:'
+    marker_col_names = '#:'
+
+    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
+
+        """Load the amat at <path> into memory.
+
+        path - str: location of amat file
+        head - int: stop reading after this many data rows
+        update_interval - int: print '.' to ofile every <this many> lines
+        ofile - file: print status, msgs, etc. to this file
+
+        """
+        self.all = None
+        self.input = None
+        self.target = None
+        self.weight = None
+        self.extra = None
+
+        self.header = False
+        self.header_size = None
+        self.header_rows = None
+        self.header_cols = None
+        self.header_sizes = None
+        self.header_col_names = []
+
+        data_started = False
+        data = array.array('d')
+
+        f = open(path)
+        n_data_lines = 0
+        len_float_line = None
+
+        for i,line in enumerate(f):
+            if n_data_lines == head:
+                #we've read enough data,
+                # break even if there's more in the file
+                break
+            if len(line) == 0 or line == '\n':
+                continue
+            if line[0] == '#':
+                if not data_started:
+                    #the condition means that the file has a header, and we're on
+                    # some header line
+                    self.header = True
+                    if line.startswith(AMat.marker_size):
+                        info = line[len(AMat.marker_size):]
+                        self.header_size = [int(s) for s in info.split()]
+                        self.header_rows, self.header_cols = self.header_size
+                    if line.startswith(AMat.marker_col_names):
+                        info = line[len(AMat.marker_col_names):]
+                        self.header_col_names = info.split()
+                    elif line.startswith(AMat.marker_sizes):
+                        info = line[len(AMat.marker_sizes):]
+                        self.header_sizes = [int(s) for s in info.split()]
+            else:
+                #the first non-commented line tells us that the header is done
+                data_started = True
+                float_line = [float(s) for s in line.split()]
+                if len_float_line is None:
+                    len_float_line = len(float_line)
+                    if (self.header_cols is not None) \
+                            and self.header_cols != len_float_line:
+                        print >> sys.stderr, \
+                                'WARNING: header declared %i cols but first line has %i, using %i',\
+                                self.header_cols, len_float_line, len_float_line
+                else:
+                    if len_float_line != len(float_line):
+                        raise IOError('wrong line length', i, line)
+                data.extend(float_line)
+                n_data_lines += 1
+
+                if update_interval > 0 and (ofile is not None) \
+                        and n_data_lines % update_interval == 0:
+                    ofile.write('.')
+                    ofile.flush()
+
+        if update_interval > 0:
+            ofile.write('\n')
+        f.close()
+
+        # convert from array.array to numpy.ndarray
+        nshape = (len(data) / len_float_line, len_float_line)
+        self.all = numpy.frombuffer(data).reshape(nshape)
+        self.n_examples = self.all.shape[0]
+
+        # assign
+        if self.header_sizes is not None:
+            if len(self.header_sizes) > 4:
+                print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path
+            leftmost = 0
+            #here we make use of the fact that if header_sizes has len < 4
+            # the loop will exit before 4 iterations
+            attrlist = ['input', 'target', 'weight', 'extra']
+            for attr, ncols in zip(attrlist, self.header_sizes):
+                setattr(self, attr, self.all[:, leftmost:leftmost+ncols])
+                leftmost += ncols
+
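To make the new loader concrete, here is a small usage sketch (not part of the changeset). The temporary path, the file contents, and the `import amat` line are illustrative assumptions; only the `#size:`/`#sizes:` markers and the `all`/`input`/`target` attributes come from amat.py above. Like amat.py itself, the sketch targets Python 2 (the module uses `print >>` and integer division).

```python
# Hypothetical example: write a tiny ascii matrix and load it with AMat.
# 2 data rows, 4 columns, split by '#sizes:' into 3 input columns + 1 target column.
import amat

f = open('/tmp/tiny.amat', 'w')
f.write("#size: 2 4\n"
        "#sizes: 3 1\n"
        "0.0 1.0 2.0 0\n"
        "3.0 4.0 5.0 1\n")
f.close()

data = amat.AMat('/tmp/tiny.amat')
print(data.n_examples)    # 2
print(data.all.shape)     # (2, 4): every column of every data row
print(data.input.shape)   # (2, 3): first block declared by '#sizes:'
print(data.target.shape)  # (2, 1): second block
```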
--- a/dataset.py	Tue Jun 03 21:34:40 2008 -0400
+++ b/dataset.py	Wed Jun 04 17:49:09 2008 -0400
@@ -109,10 +109,6 @@
 
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-     - dataset[fieldname] an iterable over the values of the field fieldname across
-     the dataset (the iterable is obtained by default by calling valuesVStack
-     over the values for individual examples).
-
      - dataset.<property> returns the value of a property associated with the name <property>.
        The following properties should be supported:
           - 'description': a textual description or name for the dataset
@@ -162,10 +158,14 @@
     By convention, attributes not in attributeNames() should have a name starting
     with an underscore.
     @todo enforce/test that convention!
+
     """
 
-    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    if 0:
+        # removed by James June 4... these aren't used anywhere according to
+        # grep
+        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
 
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,9 +277,11 @@
             # first get the beginning of our minibatch (top of dataset)
             first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
             second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-            minibatch = Example(self.fieldnames,
-                                [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
-                                 for name in self.fieldnames])
+
+            blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                    for name in self.fieldnames]
+            print type(self.dataset), blah
+            minibatch = Example(self.fieldnames,blah)
         self.next_row=upper
         self.n_batches_done+=1
         if upper >= self.L and self.n_batches:
@@ -460,12 +462,16 @@
                                              for fieldname,field_values in zip(self.fieldNames(),fields_values)]),
                                     self.valuesVStack,self.valuesHStack)
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+
+        raise TypeError(i)
+        if 0:
+            # else check for a fieldname
+            #after talk with Yoshua June 4, this is disabled.
+            if self.hasFields(i):
+                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+            # else we are trying to access a property of the dataset
+            assert i in self.__dict__ # else it means we are trying to access a non-existing property
+            return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -491,21 +497,20 @@
 
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: bits near the beginning or end of the dataset
+        @type values: list of minibatches (returned by minibatch_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
+
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for sub_batch in values:
+            rval.extend(sub_batch)
+        return rval
 
     def __or__(self,other):
         """
@@ -962,11 +967,15 @@
     def valuesVStack(self, fieldname, values):
        """Concatenate field values vertically, e.g. two vectors become a two-row matrix,
        two matrices become a longer matrix, etc."""
-        return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
+        #print len(values)
+        for v in values:
+            if not isinstance(v, numpy.ndarray):
+                raise TypeError(v, type(v))
+
         s0 = sum([v.shape[0] for v in values])
         #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
+        dtype = values[0].dtype
+        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
         cur_row = 0
         for v in values:
             rval[cur_row:cur_row+v.shape[0]] = v
@@ -1028,7 +1037,7 @@
         values=self.fields_columns.values()
         if type(key) is int:
             return Example(fieldnames,
-                           [self.data[key,col] for col in values])
+                           [numpy.asarray(self.data[key,col]) for col in values])
         if type(key) is slice:
            return MinibatchDataSet(Example(fieldnames,
                                            [self.data[key,col] for col in values]))
@@ -1106,198 +1115,207 @@
 class CachedDataSet(DataSet):
-    """
-    Wrap a L{DataSet} whose values are computationally expensive to obtain
-    (e.g. because they involve some computation, or disk access),
-    so that repeated accesses to the same example are done cheaply,
-    by caching every example value that has been accessed at least once.
+    """
+    Wrap a L{DataSet} whose values are computationally expensive to obtain
+    (e.g. because they involve some computation, or disk access),
+    so that repeated accesses to the same example are done cheaply,
+    by caching every example value that has been accessed at least once.
 
-    Optionally, for finite-length dataset, all the values can be computed
-    (and cached) upon construction of the CachedDataSet, rather at the
-    first access.
+    Optionally, for finite-length dataset, all the values can be computed
+    (and cached) upon construction of the CachedDataSet, rather at the
+    first access.
 
-    @todo: when cache_all_upon_construction create mini-batches that are as
-    large as possible but not so large as to fill up memory.
-
-    @todo: add disk-buffering capability, so that when the cache becomes too
-    big for memory, we cache things on disk, trying to keep in memory only
-    the record most likely to be accessed next.
-    """
-    def __init__(self,source_dataset,cache_all_upon_construction=False):
-        self.source_dataset=source_dataset
-        self.cache_all_upon_construction=cache_all_upon_construction
-        self.cached_examples = []
-        if cache_all_upon_construction:
-            # this potentially brings all the source examples
-            # into memory at once, which may be too much
-            # the work could possibly be done by minibatches
-            # that are as large as possible but no more than what memory allows.
-            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-            assert all([len(self)==len(field_values) for field_values in fields_values])
-            for example in fields_values.examples():
-                self.cached_examples.append(copy.copy(example))
+    @todo: when cache_all_upon_construction create mini-batches that are as
+    large as possible but not so large as to fill up memory.
+
+    @todo: add disk-buffering capability, so that when the cache becomes too
+    big for memory, we cache things on disk, trying to keep in memory only
+    the record most likely to be accessed next.
+    """
+    def __init__(self,source_dataset,cache_all_upon_construction=False):
+        self.source_dataset=source_dataset
+        self.cache_all_upon_construction=cache_all_upon_construction
+        self.cached_examples = [] #a list of LookupList (copies)
+        if cache_all_upon_construction:
+            # this potentially brings all the source examples
+            # into memory at once, which may be too much
+            # the work could possibly be done by minibatches
+            # that are as large as possible but no more than what memory allows.
+            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+            assert all([len(self)==len(fval) for fval in fields_values])
+            for example in fields_values.examples():
+                dup = copy.copy(example)
+                self.cached_examples.append(dup)
 
-        self.fieldNames = source_dataset.fieldNames
-        self.hasFields = source_dataset.hasFields
-        self.valuesHStack = source_dataset.valuesHStack
-        self.valuesVStack = source_dataset.valuesVStack
+        self.fieldNames = source_dataset.fieldNames
+        self.hasFields = source_dataset.hasFields
+        self.valuesHStack = source_dataset.valuesHStack
+        self.valuesVStack = source_dataset.valuesVStack
 
-    def __len__(self):
-        return len(self.source_dataset)
+    def __len__(self):
+        return len(self.source_dataset)
 
-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class CacheIterator(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.current=offset
-                self.all_fields = self.dataset.fieldNames()==fieldnames
-            def __iter__(self): return self
-            def next(self):
-                upper = self.current+minibatch_size
-                cache_len = len(self.dataset.cached_examples)
-                if upper>cache_len: # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    for example in self.dataset.source_dataset[cache_len:upper]:
-                        self.dataset.cached_examples.append(example)
-                all_fields_minibatch = Example(self.dataset.fieldNames(),
-                                               zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
-                self.current+=minibatch_size
-                if self.all_fields:
-                    return all_fields_minibatch
-                return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-        return CacheIterator(self)
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class CacheIterator(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.current=offset
+                self.all_fields = self.dataset.fieldNames()==fieldnames
+            def __iter__(self): return self
+            def next(self):
+                upper = self.current+minibatch_size
+                cache_len = len(self.dataset.cached_examples)
+                if upper>cache_len:
+                    # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    for example in self.dataset.source_dataset[cache_len:upper]:
+                        self.dataset.cached_examples.append(example)
+
+                next_range = slice(self.current, self.current+minibatch_size)
+                blah = self.dataset.cached_examples[next_range]
+                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
+                self.current+=minibatch_size
+
+                #little optimization to avoid second Example computation if
+                #possible.
+                if self.all_fields:
+                    return all_fields_minibatch
 
-    def __getitem__(self,i):
-        if type(i)==int and len(self.cached_examples)>i:
-            return self.cached_examples[i]
-        else:
-            return self.source_dataset[i]
-
-    def __iter__(self):
-        class CacheIteratorIter(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.l = len(dataset)
-                self.current = 0
-                self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-            def __iter__(self): return self
-            def next(self):
-                if self.current>=self.l:
-                    raise StopIteration
-                cache_len = len(self.dataset.cached_examples)
-                if self.current>=cache_len: # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    self.dataset.cached_examples.append(
-                        self.dataset.source_dataset[self.current])
-                self.example._values = self.dataset.cached_examples[self.current]
-                self.current+=1
-                return self.example
+                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+                return rval
+        return CacheIterator(self)
 
-        return CacheIteratorIter(self)
+    def __getitem__(self,i):
+        if type(i)==int and len(self.cached_examples)>i:
+            return self.cached_examples[i]
+        else:
+            return self.source_dataset[i]
+
+    def __iter__(self):
+        class CacheIteratorIter(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.l = len(dataset)
+                self.current = 0
+                self.fieldnames = self.dataset.fieldNames()
+                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+            def __iter__(self): return self
+            def next(self):
+                if self.current>=self.l:
+                    raise StopIteration
+                cache_len = len(self.dataset.cached_examples)
+                if self.current>=cache_len: # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    self.dataset.cached_examples.append(
+                        self.dataset.source_dataset[self.current])
+                self.example._values = self.dataset.cached_examples[self.current]
+                self.current+=1
+                return self.example
+
+        return CacheIteratorIter(self)
 
 class ApplyFunctionDataSet(DataSet):
-    """
-    A L{DataSet} that contains as fields the results of applying a
-    given function example-wise or minibatch-wise to all the fields of
-    an input dataset. The output of the function should be an iterable
-    (e.g. a list or a LookupList) over the resulting values.
-
-    The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset. The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
+
+    The function take as input the fields of the dataset, not the examples.
 
-    In minibatch mode, the function is expected to work on minibatches
-    (takes a minibatch in input and returns a minibatch in output). More
-    precisely, it means that each element of the input or output list
-    should be iterable and indexable over the individual example values
-    (typically these elements will be numpy arrays). All of the elements
-    in the input and output lists should have the same length, which is
-    the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
-    The function is applied each time an example or a minibatch is accessed.
-    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
-    If the values_{h,v}stack functions are not provided, then
-    the input_dataset.values{H,V}Stack functions are used by default.
-    """
-    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-                 values_hstack=None,values_vstack=None,
-                 description=None,fieldtypes=None):
-        """
-        Constructor takes an input dataset that has as many fields as the function
-        expects as inputs. The resulting dataset has as many fields as the function
-        produces as outputs, and that should correspond to the number of output names
-        (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).
 
-        Note that the expected semantics of the function differs in minibatch mode
-        (it takes minibatches of inputs and produces minibatches of outputs, as
-        documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
 
-        TBM: are filedtypes the old field types (from input_dataset) or the new ones
-        (for the new dataset created)?
-        """
-        self.input_dataset=input_dataset
-        self.function=function
-        self.output_names=output_names
-        self.minibatch_mode=minibatch_mode
-        DataSet.__init__(self,description,fieldtypes)
-        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+        TBM: are filedtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-    def __len__(self):
-        return len(self.input_dataset)
+    def __len__(self):
+        return len(self.input_dataset)
 
-    def fieldNames(self):
-        return self.output_names
+    def fieldNames(self):
+        return self.output_names
 
-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ApplyFunctionIterator(object):
-            def __init__(self,output_dataset):
-                self.input_dataset=output_dataset.input_dataset
-                self.output_dataset=output_dataset
-                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                   n_batches=n_batches,offset=offset).__iter__()
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class ApplyFunctionIterator(object):
+            def __init__(self,output_dataset):
+                self.input_dataset=output_dataset.input_dataset
+                self.output_dataset=output_dataset
+                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
+                                                                   n_batches=n_batches,offset=offset).__iter__()
 
-            def __iter__(self): return self
+            def __iter__(self): return self
 
-            def next(self):
-                function_inputs = self.input_iterator.next()
-                all_output_names = self.output_dataset.output_names
-                if self.output_dataset.minibatch_mode:
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                else:
-                    input_examples = zip(*function_inputs)
-                    output_examples = [self.output_dataset.function(*input_example)
-                                       for input_example in input_examples]
-                    function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                        for name,values in zip(all_output_names,
-                                                               zip(*output_examples))]
-                all_outputs = Example(all_output_names,function_outputs)
-                if fieldnames==all_output_names:
-                    return all_outputs
-                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            def next(self):
+                function_inputs = self.input_iterator.next()
+                all_output_names = self.output_dataset.output_names
+                if self.output_dataset.minibatch_mode:
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                else:
+                    input_examples = zip(*function_inputs)
+                    output_examples = [self.output_dataset.function(*input_example)
+                                       for input_example in input_examples]
+                    function_outputs = [self.output_dataset.valuesVStack(name,values)
+                                        for name,values in zip(all_output_names,
+                                                               zip(*output_examples))]
+                all_outputs = Example(all_output_names,function_outputs)
+                if fieldnames==all_output_names:
+                    return all_outputs
+                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
-        return ApplyFunctionIterator(self)
+        return ApplyFunctionIterator(self)
 
-    def __iter__(self): # only implemented for increased efficiency
-        class ApplyFunctionSingleExampleIterator(object):
-            def __init__(self,output_dataset):
-                self.current=0
-                self.output_dataset=output_dataset
-                self.input_iterator=output_dataset.input_dataset.__iter__()
-            def __iter__(self): return self
-            def next(self):
-                if self.output_dataset.minibatch_mode:
-                    function_inputs = [[input] for input in self.input_iterator.next()]
-                    outputs = self.output_dataset.function(*function_inputs)
-                    assert all([hasattr(output,'__iter__') for output in outputs])
-                    function_outputs = [output[0] for output in outputs]
-                else:
-                    function_inputs = self.input_iterator.next()
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                return Example(self.output_dataset.output_names,function_outputs)
-        return ApplyFunctionSingleExampleIterator(self)
-
+    def __iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
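The valuesVStack changes are the behavioural core of the dataset.py edits, so here is a standalone sketch (new code, not from the changeset; the helper names `generic_vstack` and `array_vstack` are invented) of the two stacking strategies the file now uses: the base `DataSet.valuesVStack` concatenates by extending a plain list, while `ArrayDataSet.valuesVStack` preallocates a single `numpy.ndarray` and copies each block into it, taking over the job of the removed `valuesAppend`.

```python
# Sketch of the two stacking behaviours introduced above (assumed helper names).
import numpy

def generic_vstack(values):
    # mirrors the new DataSet.valuesVStack: stack by extending a Python list
    rval = []
    for sub_batch in values:
        rval.extend(sub_batch)
    return rval

def array_vstack(values):
    # mirrors the new ArrayDataSet.valuesVStack: preallocate, then fill row blocks
    s0 = sum([v.shape[0] for v in values])
    rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=values[0].dtype)
    cur_row = 0
    for v in values:
        rval[cur_row:cur_row + v.shape[0]] = v
        cur_row += v.shape[0]
    return rval

a = numpy.ones((2, 3))
b = numpy.zeros((1, 3))
print(array_vstack([a, b]).shape)   # (3, 3), the same shape numpy.vstack([a, b]) gives
print(len(generic_vstack([a, b])))  # 3: a list holding the three stacked rows
```

The list-based default is what the wrap-around minibatch code in the hunk at line 277 now relies on, which is why that code switched from valuesAppend to valuesVStack.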
--- a/test_dataset.py	Tue Jun 03 21:34:40 2008 -0400
+++ b/test_dataset.py	Wed Jun 04 17:49:09 2008 -0400
@@ -421,7 +421,7 @@
 
     test_all(a2,ds)
 
-    del a2, ds
+    del a2, ds #removes from list of active objects in debugger
 
 def test_LookupList():
     #test only the example in the doc???
@@ -642,7 +642,8 @@
 
 if __name__=='__main__':
-    test1()
+    if 0:
+        test1()
     test_LookupList()
     test_ArrayDataSet()
     test_CachedDataSet()