Mercurial > pylearn
view filetensor.py @ 55:66619ce44497
Efficient implementation of getitem for ArrayDataSet
author | Yoshua Bengio <bengioy@iro.umontreal.ca> |
---|---|
date | Tue, 29 Apr 2008 15:05:12 -0400 |
parents | 2508c373cf29 |
children | 2b6656b2ef52 |
line wrap: on
line source
""" Read and write the matrix file format described at http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html The format is for dense tensors: magic number indicating type and endianness - 4bytes rank of tensor - int32 dimensions - int32, int32, int32, ... <data> The number of dimensions and rank is slightly tricky: for scalar: rank=0, dimensions = [1, 1, 1] for vector: rank=1, dimensions = [?, 1, 1] for matrix: rank=2, dimensions = [?, ?, 1] For rank >= 3, the number of dimensions matches the rank exactly. """ import sys import numpy def prod(lst): p = 1 for l in lst: p *= l return p _magic_dtype = { 0x1E3D4C51 : ('float32', 4), 0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? 0x1E3D4C53 : ('float64', 8), 0x1E3D4C54 : ('int32', 4), 0x1E3D4C55 : ('uint8', 1), 0x1E3D4C56 : ('int16', 2), } _dtype_magic = { 'float32': 0x1E3D4C51, 'packed matrix': 0x1E3D4C52, 'float64': 0x1E3D4C53, 'int32': 0x1E3D4C54, 'uint8': 0x1E3D4C55, 'int16': 0x1E3D4C56 } def _unused(): f.seek(0,2) #seek to end f_len = f.tell() f.seek(f_data_start,0) #seek back to where we were if debug: print 'length:', f_len f_data_bytes = (f_len - f_data_start) if debug: print 'data bytes according to header: ', dim_size * elsize if debug: print 'data bytes according to file : ', f_data_bytes if debug: print 'reading data...' sys.stdout.flush() def _write_int32(f, i): i_array = numpy.asarray(i, dtype='int32') if 0: print 'writing int32', i, i_array i_array.tofile(f) def _read_int32(f): s = f.read(4) s_array = numpy.fromstring(s, dtype='int32') return s_array.item() def read_ndarray(f, dim, dtype): return numpy.fromfile(f, dtype=dtype, count=prod(dim)).reshape(dim) # # TODO: implement item selection: # e.g. load('some mat', subtensor=(:6, 2:5)) # # This function should be memory efficient by: # - allocating an output matrix at the beginning # - seeking through the file, reading subtensors from multiple places def read(f, subtensor=None, debug=False): """Load all or part of file 'f' into a numpy ndarray If f is a string, it will be treated as a filename, and opened in read mode. If subtensor is not None, it should be like the argument to numpy.ndarray.__getitem__. The following two expressions should return equivalent ndarray objects, but the one on the left may be faster and more memory efficient if the underlying file f is big. read(f, subtensor) <===> read(f)[*subtensor] Support for subtensors is currently spotty, so check the code to see if your particular type of subtensor is supported. """ if isinstance(f, str): if debug: print 'f', f f = file(f, 'r') #what is the data type of this matrix? #magic_s = f.read(4) #magic = numpy.fromstring(magic_s, dtype='int32') magic = _read_int32(f) magic_t, elsize = _magic_dtype[magic] if debug: print 'header magic', magic, magic_t, elsize if magic_t == 'packed matrix': raise NotImplementedError('packed matrix not supported') #what is the rank of the tensor? ndim = _read_int32(f) if debug: print 'header ndim', ndim #what are the dimensions of the tensor? dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] dim_size = prod(dim) if debug: print 'header dim', dim, dim_size rval = None if subtensor is None: rval = read_ndarray(f, dim, magic_t) elif isinstance(subtensor, slice): if subtensor.step not in (None, 1): raise NotImplementedError('slice with step', subtensor.step) if subtensor.start not in (None, 0): bytes_per_row = prod(dim[1:]) * elsize raise NotImplementedError('slice with start', subtensor.start) dim[0] = min(dim[0], subtensor.stop) rval = read_ndarray(f, dim, magic_t) else: raise NotImplementedError('subtensor access not written yet:', subtensor) return rval def write(f, mat): if isinstance(f, str): f = file(f, 'w') _write_int32(f, _dtype_magic[str(mat.dtype)]) _write_int32(f, len(mat.shape)) shape = mat.shape if len(shape) < 3: shape = list(shape) + [1] * (3 - len(shape)) print 'writing shape =', shape for sh in shape: _write_int32(f, sh) mat.tofile(f) if __name__ == '__main__': #a small test script, starts by reading sys.argv[1] rval = read(sys.argv[1], None, debug=True) #load from filename print 'rval', rval.shape, rval.size if 0: f = file('/tmp/some_mat', 'w'); write(f, rval) print '' f.close() f = file('/tmp/some_mat', 'r'); rval2 = read(f) #load from file handle print 'rval2', rval2.shape, rval2.size assert rval.dtype == rval2.dtype assert rval.shape == rval2.shape assert numpy.all(rval == rval2) print 'ok'