# HG changeset patch
# User bergstrj@iro.umontreal.ca
# Date 1208450973 14400
# Node ID bb92087cb0f6ec06a5d3e179ac6a80fd0f70f6cb
# Parent 039c0f249859e2e38f050e68fb37fe4ac6f56f51
# added filetensor.py
"""
Read and write the matrix file format described at
http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html

The format is for dense tensors:

    magic number indicating type and endianness - 4bytes
    rank of tensor - int32
    dimensions - int32, int32, int32, ...
    data - the tensor elements, written right after the dimensions

The number of dimensions and rank is slightly tricky:
    for scalar: rank=0, dimensions = [1, 1, 1]
    for vector: rank=1, dimensions = [?, 1, 1]
    for matrix: rank=2, dimensions = [?, ?, 1]

For rank >= 3, the number of dimensions matches the rank exactly.

"""
import sys

import numpy


def prod(lst):
    """Return the product of the items of `lst` (1 for an empty sequence)."""
    result = 1
    for item in lst:
        result *= item
    return result


# Header magic number -> (numpy dtype name, element size in bytes).
# NOTE(review): confirm the signedness of the 1-byte type against the format
# description; this table maps it to int8.
_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    0x1E3D4C52: ('packed matrix', 0),  # what is a packed matrix?
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('int8', 1),
    0x1E3D4C56: ('int16', 2),
}

# dtype name -> header magic number.  Derived from _magic_dtype so the two
# tables cannot drift apart.
_dtype_magic = dict(
    (name, magic) for magic, (name, _elsize) in _magic_dtype.items())


def _unused():
    """Leftover debugging fragment; not called anywhere in this module.

    It refers to names (f, f_data_start, debug, dim_size, elsize) that are
    not defined at module level, so calling it raises NameError.  It compares
    the data size implied by the header with the size of the file.
    """
    f.seek(0, 2)  # seek to end
    f_len = f.tell()
    f.seek(f_data_start, 0)  # seek back to where we were

    if debug:
        print('length: %s' % (f_len,))

    f_data_bytes = (f_len - f_data_start)

    if debug:
        print('data bytes according to header:  %s' % (dim_size * elsize,))
    if debug:
        print('data bytes according to file  :  %s' % (f_data_bytes,))

    if debug:
        print('reading data...')
    sys.stdout.flush()


def _write_int32(f, i):
    """Write the integer `i` to the open file `f` as one native int32."""
    numpy.asarray(i, dtype='int32').tofile(f)


def _read_int32(f):
    """Read one native int32 from the open file `f`; return a Python int.

    Raises ValueError if fewer than 4 bytes could be read (truncated file).
    """
    raw = f.read(4)
    if len(raw) != 4:
        raise ValueError('expected 4 bytes for an int32, got %d' % len(raw))
    # frombuffer replaces numpy.fromstring, which is deprecated for binary
    # input.
    return numpy.frombuffer(raw, dtype='int32').item()


def read_ndarray(f, dim, dtype):
    """Read prod(dim) elements of `dtype` from `f`, shaped as `dim`.

    Raises ValueError if a dimension is negative or if the file holds fewer
    elements than `dim` requires.
    """
    # Plain Python ints: the product of int32 scalars could overflow.
    shape = tuple(int(d) for d in dim)
    if any(d < 0 for d in shape):
        raise ValueError('negative dimension in %s' % (shape,))
    count = prod(shape)
    data = numpy.fromfile(f, dtype=dtype, count=count)
    if data.size != count:
        raise ValueError('expected %d elements but file only holds %d'
                         % (count, data.size))
    return data.reshape(shape)


def _read_header(f, debug=False):
    """Parse the header at the current position of `f`.

    Returns (dtype name, list of `rank` dimensions as Python ints) and leaves
    `f` positioned at the first data element.
    """
    # what is the data type of this matrix?
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print('header magic %s %s %s' % (magic, magic_t, elsize))
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    # what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug:
        print('header ndim %s' % (ndim,))
    if ndim < 0:
        raise ValueError('negative tensor rank in header: %d' % ndim)

    # what are the dimensions of the tensor?
    # At least three dimensions are always stored; only the first `ndim`
    # of them are meaningful.
    stored = [_read_int32(f) for _ in range(max(ndim, 3))]
    dim = stored[:ndim]
    if debug:
        print('header dim %s %s' % (dim, prod(dim)))
    return magic_t, dim


#
# TODO: implement item selection:
#  e.g. load('some mat', subtensor=(:6, 2:5))
#
#  This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it will be treated as a filename, opened in binary read
    mode, and closed again before returning.  Otherwise f must be an open
    file positioned at the start of a tensor header; it is left open.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty: only a single slice over the
    first dimension, with start in (None, 0) and step in (None, 1), is
    implemented.  Anything else raises NotImplementedError.

    If debug is true, header information is printed to stdout.

    Raises KeyError for an unknown magic number and ValueError for a
    truncated or inconsistent file.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        if debug:
            print('f %s' % (f,))
        f = open(f, 'rb')  # binary mode: the format is raw bytes
    try:
        magic_t, dim = _read_header(f, debug)

        if subtensor is None:
            pass
        elif isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step', subtensor.step)
            if subtensor.start not in (None, 0):
                raise NotImplementedError('slice with start', subtensor.start)
            # slice.indices resolves a stop of None, a negative stop, and a
            # stop beyond the end exactly like ndarray.__getitem__ would.
            dim[0] = subtensor.indices(dim[0])[1]
        else:
            raise NotImplementedError('subtensor access not written yet:',
                                      subtensor)

        return read_ndarray(f, dim, magic_t)
    finally:
        if opened_here:
            f.close()


def write(f, mat, debug=False):
    """Write the array `mat` to `f` in the tensor file format.

    If f is a string, it will be treated as a filename, opened in binary
    write mode, and closed again before returning.  Otherwise f must be an
    open, writable file; it is left open.

    `mat` may be anything numpy.asarray accepts.  Its dtype name must be a
    key of _dtype_magic, otherwise KeyError is raised.

    If debug is true, the shape written to the header is printed to stdout.
    """
    mat = numpy.asarray(mat)
    opened_here = isinstance(f, str)
    if opened_here:
        f = open(f, 'wb')  # binary mode: the format is raw bytes
    try:
        _write_int32(f, _dtype_magic[str(mat.dtype)])
        _write_int32(f, len(mat.shape))
        shape = list(mat.shape)
        if len(shape) < 3:
            # the header always stores at least three dimensions
            shape = shape + [1] * (3 - len(shape))
        if debug:
            print('writing shape = %s' % (shape,))
        for extent in shape:
            _write_int32(f, extent)
        mat.tofile(f)
    finally:
        if opened_here:
            f.close()


if __name__ == '__main__':
    # a small test script, starts by reading sys.argv[1]
    rval = read(sys.argv[1], slice(400), debug=True)  # load from filename
    print('rval %s %s' % (rval.shape, rval.size))

    f = open('/tmp/some_mat', 'wb')
    try:
        write(f, rval, debug=True)
        print('')
    finally:
        f.close()

    f = open('/tmp/some_mat', 'rb')
    try:
        rval2 = read(f)  # load from file handle
    finally:
        f.close()
    print('rval2 %s %s' % (rval2.shape, rval2.size))

    assert rval.dtype == rval2.dtype
    assert rval.shape == rval2.shape
    assert numpy.all(rval == rval2)
    print('ok')