# HG changeset patch
# User Frederic Bastien
# Date 1212680864 14400
# Node ID fa8abc813bd21c759817c3ed56e0b2d79de74449
# Parent  fdce496c3b5616a2f77c9302cfb5bf91e8c4687d
# Parent  6226ebafefc399b2520c9c58f6f2dac6a26dd3e6
Automated merge with ssh://projects@lgcm.iro.umontreal.ca/hg/pylearn

diff -r 6226ebafefc3 -r fa8abc813bd2 amat.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amat.py	Thu Jun 05 11:47:44 2008 -0400
@@ -0,0 +1,123 @@
+"""load PLearn AMat files"""
+
+import sys, numpy, array
+
+path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
+
+
+class AMat:
+    """DataSource to access a plearn amat file as a periodic unrandomized stream.
+
+    Attributes:
+
+    input -- minibatch of input
+    target -- minibatch of target
+    weight -- minibatch of weight
+    extra -- minibatch of extra
+
+    all -- the entire data contents of the amat file
+    n_examples -- the number of training examples in the file
+
+    AMat stands for Ascii Matri[x,ces]
+
+    """
+
+    marker_size = '#size:'
+    marker_sizes = '#sizes:'
+    marker_col_names = '#:'
+
+    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
+
+        """Load the amat at <path> into memory.
+
+        path - str: location of amat file
+        head - int: stop reading after this many data rows
+        update_interval - int: print '.' to ofile every <this many> lines
+        ofile - file: print status, msgs, etc. to this file
+
+        """
+        self.all = None
+        self.input = None
+        self.target = None
+        self.weight = None
+        self.extra = None
+
+        self.header = False
+        self.header_size = None
+        self.header_rows = None
+        self.header_cols = None
+        self.header_sizes = None
+        self.header_col_names = []
+
+        data_started = False
+        data = array.array('d')
+
+        f = open(path)
+        n_data_lines = 0
+        len_float_line = None
+
+        for i, line in enumerate(f):
+            if n_data_lines == head:
+                #we've read enough data,
+                # break even if there's more in the file
+                break
+            if len(line) == 0 or line == '\n':
+                continue
+            if line[0] == '#':
+                if not data_started:
+                    #the condition means that the file has a header, and we're on
+                    # some header line
+                    self.header = True
+                    if line.startswith(AMat.marker_size):
+                        info = line[len(AMat.marker_size):]
+                        self.header_size = [int(s) for s in info.split()]
+                        self.header_rows, self.header_cols = self.header_size
+                    if line.startswith(AMat.marker_col_names):
+                        info = line[len(AMat.marker_col_names):]
+                        self.header_col_names = info.split()
+                    elif line.startswith(AMat.marker_sizes):
+                        info = line[len(AMat.marker_sizes):]
+                        self.header_sizes = [int(s) for s in info.split()]
+            else:
+                #the first non-commented line tells us that the header is done
+                data_started = True
+                float_line = [float(s) for s in line.split()]
+                if len_float_line is None:
+                    len_float_line = len(float_line)
+                    if (self.header_cols is not None) \
+                            and self.header_cols != len_float_line:
+                        print >> sys.stderr, \
+                                'WARNING: header declared %i cols but first line has %i, using %i' % \
+                                (self.header_cols, len_float_line, len_float_line)
+                else:
+                    if len_float_line != len(float_line):
+                        raise IOError('wrong line length', i, line)
+                data.extend(float_line)
+                n_data_lines += 1
+
+                if update_interval > 0 and (ofile is not None) \
+                        and n_data_lines % update_interval == 0:
+                    ofile.write('.')
+                    ofile.flush()
+
+        if update_interval > 0:
+            ofile.write('\n')
+        f.close()
+
+        # convert from array.array to numpy.ndarray
+        nshape = (len(data) / len_float_line, len_float_line)
+        self.all = numpy.frombuffer(data).reshape(nshape)
+        self.n_examples = self.all.shape[0]
+
+        # assign
+        if self.header_sizes is not None:
+            if len(self.header_sizes) > 4:
+                print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path
+            leftmost = 0
+            #here we make use of the fact that if header_sizes has len < 4
+            # the loop will exit before 4 iterations
+            attrlist = ['input', 'target', 'weight', 'extra']
+            for attr, ncols in zip(attrlist, self.header_sizes):
+                setattr(self, attr, self.all[:, leftmost:leftmost+ncols])
+                leftmost += ncols
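For orientation, a short usage sketch of the AMat loader added above; this is not part of the patch. It reuses the module's own path_MNIST constant, and input/target/weight/extra are populated only when the file carries a '#sizes:' header.

# Usage sketch (not part of the patch): read the first 1000 rows of an
# amat file and inspect the column groups declared by its '#sizes:' header.
from amat import AMat, path_MNIST

data = AMat(path_MNIST, head=1000)   # stop after 1000 data rows

print data.n_examples                # rows actually read
print data.all.shape                 # full (rows, cols) ndarray

# input/target/weight/extra are column slices of .all; they stay None
# unless the file declares a '#sizes:' header line.
if data.input is not None:
    print data.input.shape, data.target.shape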
diff -r 6226ebafefc3 -r fa8abc813bd2 dataset.py
--- a/dataset.py	Tue Jun 03 16:13:42 2008 -0400
+++ b/dataset.py	Thu Jun 05 11:47:44 2008 -0400
@@ -109,10 +109,6 @@
     - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-    - dataset[fieldname] an iterable over the values of the field fieldname across
-      the dataset (the iterable is obtained by default by calling valuesVStack
-      over the values for individual examples).
-
     - dataset.<property> returns the value of a property associated with
       the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
@@ -151,9 +147,9 @@
     - __len__ if it is not a stream
     - fieldNames
     - minibatches_nowrap (called by DataSet.minibatches())
+   For efficiency of implementation, a sub-class might also want to redefine
     - valuesHStack
     - valuesVStack
-   For efficiency of implementation, a sub-class might also want to redefine
     - hasFields
     - __getitem__ may not be feasible with some streams
     - __iter__
@@ -278,7 +274,7 @@
                 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
                 minibatch = Example(self.fieldnames,
-                                    [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
+                                    [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
                                      for name in self.fieldnames])
                 self.next_row=upper
                 self.n_batches_done+=1
@@ -412,6 +408,20 @@
         """
         return DataSetFields(self,fieldnames)
 
+    def getitem_key(self, fieldname):
+        """A not-so-well thought-out place to put code that used to be in
+        getitem.
+        """
+        #removing as per discussion June 4. --JSB
+
+        i = fieldname
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
+
     def __getitem__(self,i):
         """
         dataset[i] returns the (i+1)-th example of the dataset.
@@ -460,12 +470,7 @@
                                     for fieldname,field_values
                                     in zip(self.fieldNames(),fields_values)]),
                            self.valuesVStack,self.valuesHStack)
 
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+        raise TypeError(i, type(i))
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -953,25 +958,16 @@
     Virtual super-class of datasets whose field values are numpy array,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
        return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -996,7 +992,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns
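The @@ -278 hunk above swaps the hand-rolled valuesAppend for valuesVStack, which for array-backed datasets is simply numpy.vstack. Here is a minimal sketch (not part of the patch, shapes made up) of the row-appending behaviour the wrap-around minibatch path relies on.

# Sketch (not part of the patch): numpy.vstack provides the row-appending
# behaviour that the deleted valuesAppend implemented by hand.
import numpy

first_part = numpy.arange(6.).reshape(3, 2)    # e.g. the last 3 rows of the dataset
second_part = numpy.arange(4.).reshape(2, 2)   # e.g. 2 rows wrapped from the start

minibatch = numpy.vstack([first_part, second_part])
assert minibatch.shape == (5, 2)
assert (minibatch[:3] == first_part).all()
assert (minibatch[3:] == second_part).all()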
""" - def __init__(self, description=None, field_types=None): - DataSet.__init__(self, description, field_types) - def valuesHStack(self, fieldnames, fieldvalues): + def __init__(self,description=None,field_types=None): + DataSet.__init__(self,description,field_types) + def valuesHStack(self,fieldnames,fieldvalues): """Concatenate field values horizontally, e.g. two vectors become a longer vector, two matrices become a wider matrix, etc.""" return numpy.hstack(fieldvalues) - def valuesVStack(self, fieldname, values): + def valuesVStack(self,fieldname,values): """Concatenate field values vertically, e.g. two vectors become a two-row matrix, two matrices become a longer matrix, etc.""" return numpy.vstack(values) - def valuesAppend(self, fieldname, values): - s0 = sum([v.shape[0] for v in values]) - #TODO: there's gotta be a better way to do this! - rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype) - cur_row = 0 - for v in values: - rval[cur_row:cur_row+v.shape[0]] = v - cur_row += v.shape[0] - return rval class ArrayDataSet(ArrayFieldsDataSet): """ @@ -996,7 +992,7 @@ for fieldname, fieldcolumns in self.fields_columns.items(): if type(fieldcolumns) is int: assert fieldcolumns>=0 and fieldcolumns# returns the value of a property associated with - #the name . The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) + #ds.# returns the value of a property associated with + #the name . The following properties should be supported: + # - 'description': a textual description or name for the ds + # - 'fieldtypes': a list of types (one per field) -#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds + #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? + #assert hstack([ds('x','y'),ds('z')])==ds + #hstack([ds('z','y'),ds('x')])==ds assert have_raised2(hstack,[ds('x'),ds('x')]) assert have_raised2(hstack,[ds('y','x'),ds('x')]) assert not have_raised2(hstack,[ds('x'),ds('y')]) - -# i=0 -# for example in hstack([ds('x'),ds('y'),ds('z')]): -# example==ds[i] -# i+=1 -# del i,example -#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? + + # i=0 + # for example in hstack([ds('x'),ds('y'),ds('z')]): + # example==ds[i] + # i+=1 + # del i,example + #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? def test_fields_fct(ds): #@todo, fill correctly @@ -544,8 +547,6 @@ f_array_iter(array) f_ds_index(ds) - f_ds_index(ds) - f_ds_iter(ds) f_ds_iter(ds) f_ds_mb1(ds,10)