Mercurial > pylearn

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filetensor.py	Thu Apr 17 12:49:33 2008 -0400
@@ -0,0 +1,168 @@
+"""
+Read and write the matrix file format described at
+http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html
+
+The format is for dense tensors:
+
+    magic number indicating type and endianness - 4bytes
+    rank of tensor - int32
+    dimensions - int32, int32, int32, ...
+    <data>
+
+The number of dimensions and rank is slightly tricky:
+    for scalar: rank=0, dimensions = [1, 1, 1]
+    for vector: rank=1, dimensions = [?, 1, 1]
+    for matrix: rank=2, dimensions = [?, ?, 1]
+
+For rank >= 3, the number of dimensions matches the rank exactly.
+
+"""
+import sys
+import numpy
+
+def prod(lst):
+    p = 1
+    for l in lst:
+        p *= l
+    return p
+
+_magic_dtype = {
+        0x1E3D4C51 : ('float32', 4),
+        0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix?
+        0x1E3D4C53 : ('float64', 8),
+        0x1E3D4C54 : ('int32', 4),
+        0x1E3D4C55 : ('int8', 1),
+        0x1E3D4C56 : ('int16', 2),
+        }
+_dtype_magic = {
+        'float32': 0x1E3D4C51,
+        'packed matrix': 0x1E3D4C52,
+        'float64': 0x1E3D4C53,
+        'int32': 0x1E3D4C54,
+        'int8': 0x1E3D4C55,
+        'int16': 0x1E3D4C56
+        }
+
+def _unused():
+    f.seek(0,2) #seek to end
+    f_len =  f.tell()
+    f.seek(f_data_start,0) #seek back to where we were
+
+    if debug: print 'length:', f_len
+
+
+    f_data_bytes = (f_len - f_data_start)
+
+    if debug: print 'data bytes according to header: ', dim_size * elsize
+    if debug: print 'data bytes according to file  : ', f_data_bytes
+
+    if debug: print 'reading data...'
+    sys.stdout.flush()
+
+def _write_int32(f, i):
+    i_array = numpy.asarray(i, dtype='int32')
+    if 0: print 'writing int32', i, i_array
+    i_array.tofile(f)
+def _read_int32(f):
+    s = f.read(4)
+    s_array = numpy.fromstring(s, dtype='int32')
+    return s_array.item()
+
+def read_ndarray(f, dim, dtype):
+    return numpy.fromfile(f, dtype=dtype, count=prod(dim)).reshape(dim)
+
+#
+# TODO: implement item selection:
+#  e.g. load('some mat', subtensor=(:6, 2:5))
+#
+#  This function should be memory efficient by:
+#  - allocating an output matrix at the beginning
+#  - seeking through the file, reading subtensors from multiple places
+def read(f, subtensor=None, debug=False):
+    """Load all or part of file 'f' into a numpy ndarray
+
+    If f is a string, it will be treated as a filename, and opened in read mode.
+
+    If subtensor is not None, it should be like the argument to
+    numpy.ndarray.__getitem__.  The following two expressions should return
+    equivalent ndarray objects, but the one on the left may be faster and more
+    memory efficient if the underlying file f is big.
+
+        read(f, subtensor) <===> read(f)[*subtensor]
+
+    Support for subtensors is currently spotty, so check the code to see if your
+    particular type of subtensor is supported.
+
+    """
+
+    if isinstance(f, str):
+        if debug: print 'f', f
+        f = file(f, 'r')
+
+    #what is the data type of this matrix?
+    #magic_s = f.read(4)
+    #magic = numpy.fromstring(magic_s, dtype='int32')
+    magic = _read_int32(f)
+    magic_t, elsize = _magic_dtype[magic]
+    if debug:
+        print 'header magic', magic, magic_t, elsize
+    if magic_t == 'packed matrix':
+        raise NotImplementedError('packed matrix not supported')
+
+    #what is the rank of the tensor?
+    ndim = _read_int32(f)
+    if debug: print 'header ndim', ndim
+
+    #what are the dimensions of the tensor?
+    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
+    dim_size = prod(dim)
+    if debug: print 'header dim', dim, dim_size
+
+    rval = None
+    if subtensor is None:
+        rval = read_ndarray(f, dim, magic_t)
+    elif isinstance(subtensor, slice):
+        if subtensor.step not in (None, 1):
+            raise NotImplementedError('slice with step', subtensor.step)
+        if subtensor.start not in (None, 0):
+            bytes_per_row = prod(dim[1:]) * elsize
+            raise NotImplementedError('slice with start', subtensor.start)
+        dim[0] = min(dim[0], subtensor.stop)
+        rval = read_ndarray(f, dim, magic_t)
+    else:
+        raise NotImplementedError('subtensor access not written yet:', subtensor)
+
+    return rval
+
+def write(f, mat):
+    if isinstance(f, str):
+        f = file(f, 'w')
+
+    _write_int32(f, _dtype_magic[str(mat.dtype)])
+    _write_int32(f, len(mat.shape))
+    shape = mat.shape
+    if len(shape) < 3:
+        shape = list(shape) + [1] * (3 - len(shape))
+    print 'writing shape =', shape
+    for sh in shape:
+        _write_int32(f, sh)
+    mat.tofile(f)
+
+if __name__ == '__main__':
+    #a small test script, starts by reading sys.argv[1]
+    rval = read(sys.argv[1], slice(400), debug=True) #load from filename
+    print 'rval', rval.shape, rval.size
+
+    f = file('/tmp/some_mat', 'w');
+    write(f, rval)
+    print ''
+    f.close()
+    f = file('/tmp/some_mat', 'r');
+    rval2 = read(f) #load from file handle
+    print 'rval2', rval2.shape, rval2.size
+
+    assert rval.dtype == rval2.dtype
+    assert rval.shape == rval2.shape
+    assert numpy.all(rval == rval2)
+    print 'ok'
+