view filetensor.py @ 99:a8da709eb6a9
In ArrayDataSet.__init__, if a column is given as a bare index, we change it into a list containing only that index. This removes the special case where the column is an index from all subsequent calls.
The bare index was causing trouble with numpy.vstack() called by MinibatchWrapAroundIterator.next.
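The intent of the change, as a minimal standalone sketch (the normalize_columns helper below is illustrative only, not the actual ArrayDataSet code): wrapping a bare column index in a one-element list keeps every selected minibatch 2-D, so numpy.vstack() can stack minibatches without a special case.

    import numpy

    def normalize_columns(columns):
        # A single index becomes a one-element list, so all later code
        # paths only ever deal with a list of column indices.
        if isinstance(columns, int):
            columns = [columns]
        return columns

    data = numpy.arange(12).reshape(4, 3)
    cols = normalize_columns(1)
    batch_a = data[0:2, cols]   # shape (2, 1); a bare index would give (2,)
    batch_b = data[2:4, cols]
    stacked = numpy.vstack([batch_a, batch_b])
    assert stacked.shape == (4, 1)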
author   | Frederic Bastien <bastienf@iro.umontreal.ca>
---------|---------------------------------------------
date     | Tue, 06 May 2008 13:57:36 -0400
parents  | 2b6656b2ef52
children | 82ba488b2c24
""" Read and write the matrix file format described at U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} The format is for dense tensors: - magic number indicating type and endianness - 4bytes - rank of tensor - int32 - dimensions - int32, int32, int32, ... - <data> The number of dimensions and rank is slightly tricky: - for scalar: rank=0, dimensions = [1, 1, 1] - for vector: rank=1, dimensions = [?, 1, 1] - for matrix: rank=2, dimensions = [?, ?, 1] For rank >= 3, the number of dimensions matches the rank exactly. """ import sys import numpy def prod(lst): p = 1 for l in lst: p *= l return p _magic_dtype = { 0x1E3D4C51 : ('float32', 4), 0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? 0x1E3D4C53 : ('float64', 8), 0x1E3D4C54 : ('int32', 4), 0x1E3D4C55 : ('uint8', 1), 0x1E3D4C56 : ('int16', 2), } _dtype_magic = { 'float32': 0x1E3D4C51, 'packed matrix': 0x1E3D4C52, 'float64': 0x1E3D4C53, 'int32': 0x1E3D4C54, 'uint8': 0x1E3D4C55, 'int16': 0x1E3D4C56 } def _unused(): f.seek(0,2) #seek to end f_len = f.tell() f.seek(f_data_start,0) #seek back to where we were if debug: print 'length:', f_len f_data_bytes = (f_len - f_data_start) if debug: print 'data bytes according to header: ', dim_size * elsize if debug: print 'data bytes according to file : ', f_data_bytes if debug: print 'reading data...' sys.stdout.flush() def _write_int32(f, i): i_array = numpy.asarray(i, dtype='int32') if 0: print 'writing int32', i, i_array i_array.tofile(f) def _read_int32(f): s = f.read(4) s_array = numpy.fromstring(s, dtype='int32') return s_array.item() def read_ndarray(f, dim, dtype): return numpy.fromfile(f, dtype=dtype, count=prod(dim)).reshape(dim) # # TODO: implement item selection: # e.g. load('some mat', subtensor=(:6, 2:5)) # # This function should be memory efficient by: # - allocating an output matrix at the beginning # - seeking through the file, reading subtensors from multiple places def read(f, subtensor=None, debug=False): """Load all or part of file 'f' into a numpy ndarray If f is a string, it will be treated as a filename, and opened in read mode. If subtensor is not None, it should be like the argument to numpy.ndarray.__getitem__. The following two expressions should return equivalent ndarray objects, but the one on the left may be faster and more memory efficient if the underlying file f is big. read(f, subtensor) <===> read(f)[*subtensor] Support for subtensors is currently spotty, so check the code to see if your particular type of subtensor is supported. """ if isinstance(f, str): if debug: print 'f', f f = file(f, 'r') #what is the data type of this matrix? #magic_s = f.read(4) #magic = numpy.fromstring(magic_s, dtype='int32') magic = _read_int32(f) magic_t, elsize = _magic_dtype[magic] if debug: print 'header magic', magic, magic_t, elsize if magic_t == 'packed matrix': raise NotImplementedError('packed matrix not supported') #what is the rank of the tensor? ndim = _read_int32(f) if debug: print 'header ndim', ndim #what are the dimensions of the tensor? 
dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] dim_size = prod(dim) if debug: print 'header dim', dim, dim_size rval = None if subtensor is None: rval = read_ndarray(f, dim, magic_t) elif isinstance(subtensor, slice): if subtensor.step not in (None, 1): raise NotImplementedError('slice with step', subtensor.step) if subtensor.start not in (None, 0): bytes_per_row = prod(dim[1:]) * elsize raise NotImplementedError('slice with start', subtensor.start) dim[0] = min(dim[0], subtensor.stop) rval = read_ndarray(f, dim, magic_t) else: raise NotImplementedError('subtensor access not written yet:', subtensor) return rval def write(f, mat): if isinstance(f, str): f = file(f, 'w') _write_int32(f, _dtype_magic[str(mat.dtype)]) _write_int32(f, len(mat.shape)) shape = mat.shape if len(shape) < 3: shape = list(shape) + [1] * (3 - len(shape)) print 'writing shape =', shape for sh in shape: _write_int32(f, sh) mat.tofile(f) if __name__ == '__main__': #a small test script, starts by reading sys.argv[1] rval = read(sys.argv[1], None, debug=True) #load from filename print 'rval', rval.shape, rval.size if 0: f = file('/tmp/some_mat', 'w'); write(f, rval) print '' f.close() f = file('/tmp/some_mat', 'r'); rval2 = read(f) #load from file handle print 'rval2', rval2.shape, rval2.size assert rval.dtype == rval2.dtype assert rval.shape == rval2.shape assert numpy.all(rval == rval2) print 'ok'
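For reference, a minimal usage sketch of the module above, assuming it is importable as filetensor; the path and array are made up for illustration:

    import numpy
    import filetensor

    # Write a rank-2 float32 tensor: the header stores the magic number,
    # the rank (2), and three int32 dimensions (here 6, 2, 1).
    mat = numpy.arange(12, dtype='float32').reshape(6, 2)
    filetensor.write('/tmp/example.ft', mat)

    # Read it back in full, or read only the leading rows with a slice
    # subtensor; per the docstring, read(f, s) should match read(f)[s]
    # for the supported slices (no step, no nonzero start).
    full = filetensor.read('/tmp/example.ft')
    head = filetensor.read('/tmp/example.ft', slice(0, 3))
    assert numpy.all(full[0:3] == head)

The slice path only trims how many leading rows are read from the file, which is why slices with a step or a nonzero start raise NotImplementedError.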