
"""
Read and write the matrix file format described at
U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}

The format is for dense tensors:

    - magic number indicating type and endianness - 4bytes
    - rank of tensor - int32
    - dimensions - int32, int32, int32, ...
    - <data>

The number of stored dimensions is slightly tricky: at least three are always
written, padded with 1s when the rank is less than 3:
    - for scalar: rank=0, dimensions = [1, 1, 1]
    - for vector: rank=1, dimensions = [?, 1, 1]
    - for matrix: rank=2, dimensions = [?, ?, 1]

For rank >= 3, the number of dimensions matches the rank exactly.
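
For example (an illustration derived from the description above), a 2x3
float32 matrix is stored as five int32 words followed by the raw data:

    - 0x1E3D4C51  (magic: float32)
    - 2           (rank)
    - 2, 3, 1     (dimensions, padded to three entries because rank < 3)
    - 24 bytes of data (6 float32 elements)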

"""
import sys
import numpy

def prod(lst):
    p = 1
    for l in lst:
        p *= l
    return p

_magic_dtype = {
        0x1E3D4C51 : ('float32', 4),
        0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix?
        0x1E3D4C53 : ('float64', 8),
        0x1E3D4C54 : ('int32', 4),
        0x1E3D4C55 : ('uint8', 1),
        0x1E3D4C56 : ('int16', 2),
        }
_dtype_magic = {
        'float32': 0x1E3D4C51,
        'packed matrix': 0x1E3D4C52,
        'float64': 0x1E3D4C53,
        'int32': 0x1E3D4C54,
        'uint8': 0x1E3D4C55,
        'int16': 0x1E3D4C56
        }
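
# Illustrative sketch: the magic number also identifies endianness, because a
# file written with the opposite byte order shows up with its four magic
# bytes reversed.  The helper below is hypothetical (nothing else in this
# module calls it); it detects that case by byte-swapping an unrecognized
# magic value and retrying the lookup.
def _guess_byteorder(magic):
    if magic in _magic_dtype:
        return 'native', _magic_dtype[magic][0]
    swapped = int(numpy.asarray(magic, dtype='int32').byteswap())
    if swapped in _magic_dtype:
        return 'swapped', _magic_dtype[swapped][0]
    raise ValueError('unknown magic number', magic)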

def _unused():
    # Leftover debugging code kept for reference: it refers to names
    # (f, debug, f_data_start, dim_size, elsize) that are not defined in this
    # scope, so it cannot be called as-is.
    f.seek(0,2) #seek to end
    f_len =  f.tell()
    f.seek(f_data_start,0) #seek back to where we were

    if debug: print 'length:', f_len


    f_data_bytes = (f_len - f_data_start)

    if debug: print 'data bytes according to header: ', dim_size * elsize
    if debug: print 'data bytes according to file  : ', f_data_bytes

    if debug: print 'reading data...'
    sys.stdout.flush()

def _write_int32(f, i):
    i_array = numpy.asarray(i, dtype='int32')
    if 0: print 'writing int32', i, i_array
    i_array.tofile(f)
def _read_int32(f):
    s = f.read(4)
    s_array = numpy.fromstring(s, dtype='int32')
    return s_array.item()

def read_ndarray(f, dim, dtype):
    return numpy.fromfile(f, dtype=dtype, count=prod(dim)).reshape(dim)

#
# TODO: implement item selection:
#  e.g. read('some mat', subtensor=numpy.s_[:6, 2:5])
#
#  This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it will be treated as a filename, and opened in read mode.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]
    
    Support for subtensors is currently spotty, so check the code to see if your
    particular type of subtensor is supported.

    """

    if isinstance(f, str):
        if debug: print 'f', f
        f = file(f, 'rb')  # binary mode: the file is a raw byte stream

    #what is the data type of this matrix?
    #magic_s = f.read(4)
    #magic = numpy.fromstring(magic_s, dtype='int32')
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug: 
        print 'header magic', magic, magic_t, elsize
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    #what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug: print 'header ndim', ndim

    #what are the dimensions of the tensor?
    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
    dim_size = prod(dim)
    if debug: print 'header dim', dim, dim_size

    rval = None
    if subtensor is None:
        rval = read_ndarray(f, dim, magic_t)
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        if subtensor.start not in (None, 0):
            # reading from a nonzero start would mean seeking past
            # prod(dim[1:]) * elsize bytes per skipped row; see the sketch
            # after this function
            raise NotImplementedError('slice with start', subtensor.start)
        if subtensor.stop is not None:
            dim[0] = min(dim[0], subtensor.stop)
        rval = read_ndarray(f, dim, magic_t)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor) 

    return rval
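
# A hypothetical sketch of the seek-based row slicing described in the TODO
# above read(): skip the first `start` rows without reading them, then read
# only the requested ones.  _read_rows is not part of the original module; it
# assumes the file position is just past the header and that the slice step
# is 1.
def _read_rows(f, dim, dtype, elsize, start, stop):
    # bytes occupied by one row (all trailing dimensions)
    bytes_per_row = prod(dim[1:]) * elsize
    if start is None:
        start = 0
    if stop is None or stop > dim[0]:
        stop = dim[0]
    # skip the unwanted leading rows relative to the current position
    f.seek(int(bytes_per_row * start), 1)
    out_dim = list(dim)
    out_dim[0] = max(stop - start, 0)
    return read_ndarray(f, out_dim, dtype)
#
# Example use of the public entry point with a simple row slice (see the
# docstring of read()):
#
#     first_six_rows = read('some mat', subtensor=slice(0, 6))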

def write(f, mat):
    if isinstance(f, str):
        f = file(f, 'wb')  # binary mode to match read()

    _write_int32(f, _dtype_magic[str(mat.dtype)])
    _write_int32(f, len(mat.shape))
    shape = mat.shape
    if len(shape) < 3:
        shape = list(shape) + [1] * (3 - len(shape))
    print 'writing shape =', shape
    for sh in shape:
        _write_int32(f, sh)
    mat.tofile(f)

if __name__ == '__main__':
    #a small test script, starts by reading sys.argv[1]
    rval = read(sys.argv[1], None, debug=True) #load from filename
    print 'rval', rval.shape, rval.size

    if 0:
        f = file('/tmp/some_mat', 'wb')
        write(f, rval)
        print ''
        f.close()
        f = file('/tmp/some_mat', 'rb')
        rval2 = read(f) #load from file handle
        print 'rval2', rval2.shape, rval2.size

        assert rval.dtype == rval2.dtype
        assert rval.shape == rval2.shape
        assert numpy.all(rval == rval2)
        print 'ok'