view filetensor.py @ 228:6f55e301c687

optimisation of ArrayDataSet
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Fri, 16 May 2008 16:38:07 -0400
parents 2b6656b2ef52
children 82ba488b2c24
line wrap: on
line source

"""
Read and write the matrix file format described at
U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}

The format is for dense tensors:

    - magic number indicating type and endianness - 4bytes
    - rank of tensor - int32
    - dimensions - int32, int32, int32, ...
    - <data>

The number of stored dimensions does not always equal the rank: tensors of
rank below 3 are padded with trailing 1s up to three dimensions:
    - for scalar: rank=0, dimensions = [1, 1, 1]
    - for vector: rank=1, dimensions = [?, 1, 1]
    - for matrix: rank=2, dimensions = [?, ?, 1]

For rank >= 3, the number of dimensions matches the rank exactly.

"""
import sys
import numpy

def prod(lst):
    """Return the product of every item in lst.

    The running product starts at 1, so an empty sequence yields 1.
    """
    result = 1
    for factor in lst:
        result *= factor
    return result

# Header magic number -> (element type name, size of one element in bytes).
_magic_dtype = dict([
    (0x1E3D4C51, ('float32', 4)),
    (0x1E3D4C52, ('packed matrix', 0)),  # what is a packed matrix?
    (0x1E3D4C53, ('float64', 8)),
    (0x1E3D4C54, ('int32', 4)),
    (0x1E3D4C55, ('uint8', 1)),
    (0x1E3D4C56, ('int16', 2)),
])

# Inverse lookup: element type name -> header magic number.
_dtype_magic = dict(
    (_name, _magic) for _magic, (_name, _elsize) in _magic_dtype.items())

def _unused():
    """Leftover debugging fragment; nothing in the visible source calls it.

    It compares the total file length with the data size implied by the
    header.  The names f, f_data_start, debug, dim_size and elsize are neither
    parameters nor locals, and no module-level definition of them is visible
    here, so calling this function as-is would presumably raise a NameError.
    """
    f.seek(0,2) #seek to end
    f_len =  f.tell()
    f.seek(f_data_start,0) #seek back to where we were

    if debug: print 'length:', f_len


    # bytes from f_data_start (presumably the start of the data section) to
    # the end of the file
    f_data_bytes = (f_len - f_data_start)

    if debug: print 'data bytes according to header: ', dim_size * elsize
    if debug: print 'data bytes according to file  : ', f_data_bytes

    if debug: print 'reading data...'
    sys.stdout.flush()

def _write_int32(f, i):
    """Write the integer i to the open file f as a single 4-byte int32.

    The value is converted through numpy's 'int32' dtype (native byte order)
    and written with ndarray.tofile, so f must be a real file object.
    """
    numpy.asarray(i, dtype='int32').tofile(f)
def _read_int32(f):
    """Read one 4-byte int32 (native byte order) from f and return it as an int.

    f only needs a read(n) method returning a byte string.

    Raises:
        ValueError: if fewer than 4 bytes could be read (e.g. end of file).
    """
    s = f.read(4)
    if len(s) != 4:
        raise ValueError('expected 4 bytes for an int32, got %d' % len(s))
    # frombuffer is the supported way to decode raw bytes; the binary mode of
    # numpy.fromstring is deprecated.
    return numpy.frombuffer(s, dtype='int32').item()

def read_ndarray(f, dim, dtype):
    """Read prod(dim) items of type dtype from file f, reshaped to dim."""
    n_items = prod(dim)
    flat = numpy.fromfile(f, dtype=dtype, count=n_items)
    return flat.reshape(dim)

#
# TODO: implement item selection:
#  e.g. read('some mat', subtensor=(slice(6), slice(2, 5)))
#
#  This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it is treated as a filename: the file is opened in
    binary read mode and closed again before returning.  Otherwise f is used
    as an already-open file object and is left open.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]

    Support for subtensors is currently spotty: only a single slice over the
    first dimension, with start None/0 and step None/1, is implemented.

    Parameters:
        f: filename (str) or open binary file object positioned at the header.
        subtensor: None to read everything, or a slice as described above.
        debug: if true, print the decoded header fields to stdout.

    Returns:
        A numpy ndarray holding the requested part of the tensor.

    Raises:
        KeyError: if the magic number is not in _magic_dtype.
        NotImplementedError: for packed matrices and unsupported subtensors.
        ValueError: if the header is truncated or declares a negative rank.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        if debug:
            print('f %s' % (f,))
        # The format is binary; text mode can corrupt the bytes on platforms
        # that translate line endings.
        f = open(f, 'rb')

    try:
        # What is the data type of this matrix?
        magic = _read_int32(f)
        magic_t, elsize = _magic_dtype[magic]
        if debug:
            print('header magic %s %s %s' % (magic, magic_t, elsize))
        if magic_t == 'packed matrix':
            raise NotImplementedError('packed matrix not supported')

        # What is the rank of the tensor?
        ndim = _read_int32(f)
        if debug:
            print('header ndim %s' % (ndim,))
        if ndim < 0:
            raise ValueError('invalid tensor rank in header: %s' % (ndim,))

        # What are the dimensions of the tensor?  At least three are always
        # stored, so consume all of them but keep only the first ndim.
        n_stored = max(ndim, 3)
        header_dim = numpy.fromfile(f, dtype='int32', count=n_stored)
        if header_dim.size < n_stored:
            raise ValueError('truncated header: expected %d dimensions, got %d'
                             % (n_stored, header_dim.size))
        # Plain Python ints, so the element count cannot wrap around in int32.
        dim = [int(d) for d in header_dim[:ndim]]
        dim_size = prod(dim)
        if debug:
            print('header dim %s %s' % (header_dim[:ndim], dim_size))

        if subtensor is None:
            return read_ndarray(f, tuple(dim), magic_t)

        if isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step', subtensor.step)
            if subtensor.start not in (None, 0):
                raise NotImplementedError('slice with start', subtensor.start)
            # slice.indices applies ndarray slicing rules to the stop value:
            # None means "to the end", negatives count from the end, and
            # values past the end are clamped.
            dim[0] = subtensor.indices(dim[0])[1]
            return read_ndarray(f, tuple(dim), magic_t)

        raise NotImplementedError('subtensor access not written yet:', subtensor)
    finally:
        # Only close handles this function created; the caller owns the rest.
        if opened_here:
            f.close()

def write(f, mat):
    """Write the ndarray mat to f in the tensor file format.

    If f is a string, it is treated as a filename: the file is opened in
    binary write mode and closed (hence flushed) before returning.  Otherwise
    f is used as an already-open file object and is left open.

    The header is the magic number for mat's dtype, the rank, and the
    dimensions padded with trailing 1s to at least three entries; the raw
    array data follows.  The header shape is also printed to stdout.

    Raises:
        KeyError: if str(mat.dtype) has no entry in _dtype_magic.
    """
    # Look the dtype up first so that an unsupported dtype fails before the
    # target file is created or truncated.
    magic = _dtype_magic[str(mat.dtype)]

    opened_here = isinstance(f, str)
    if opened_here:
        # The format is binary; text mode can corrupt the bytes on platforms
        # that translate line endings.
        f = open(f, 'wb')

    try:
        _write_int32(f, magic)
        _write_int32(f, len(mat.shape))
        shape = mat.shape
        if len(shape) < 3:
            # Ranks below 3 are stored with three dimensions, padded with 1s.
            shape = list(shape) + [1] * (3 - len(shape))
        print('writing shape = %s' % (shape,))
        for sh in shape:
            _write_int32(f, sh)
        mat.tofile(f)
    finally:
        # Only close handles this function created; the caller owns the rest.
        if opened_here:
            f.close()

if __name__ == '__main__':
    # A small test script: reads the tensor file named by sys.argv[1] and
    # prints its shape and number of elements.
    rval = read(sys.argv[1], None, debug=True)  # load from filename
    print('rval %s %s' % (rval.shape, rval.size))

    # Disabled round-trip check: write the tensor out, read it back through a
    # file handle and compare.  Change the condition to run it.
    if 0:
        # Binary mode on both sides: the format is raw bytes, not text.
        f = open('/tmp/some_mat', 'wb')
        write(f, rval)
        print('')
        f.close()
        f = open('/tmp/some_mat', 'rb')
        rval2 = read(f)  # load from file handle
        f.close()
        print('rval2 %s %s' % (rval2.shape, rval2.size))

        assert rval.dtype == rval2.dtype
        assert rval.shape == rval2.shape
        assert numpy.all(rval == rval2)
        print('ok')