# HG changeset patch
# User James Bergstra
# Date 1242339910 14400
# Node ID ee7026de9681fddfc59ab75f265aa0438a7f3e5e
# Parent  5ca1a8e859dbad82c2871cd52097a5a4020ab052
added filetensor.arraylike

Header parsing moves out of read() into module-level _read_int32() and
_read_header() so that the new arraylike class can share it.  arraylike
serves rank-`rank` blocks of a filetensor, seeking on every access.
The tests now use a tempfile name instead of /tmp/some_mat.

Review amendments (the content no longer matches the node id above):
* arraylike.__getitem__ accepts negative indices and raises IndexError for
  anything out of range, instead of seeking to a position before the data
* tearDown catches OSError, which is what os.remove raises
* test_arraylike opens the tensor file in binary mode

diff -r 5ca1a8e859db -r ee7026de9681 pylearn/io/filetensor.py
--- a/pylearn/io/filetensor.py	Thu May 14 17:00:22 2009 -0400
+++ b/pylearn/io/filetensor.py	Thu May 14 18:25:10 2009 -0400
@@ -46,6 +46,128 @@
         'int16': 0x1E3D4C56
         }
 
+def _read_int32(f):
+    """unpack a 4-byte integer from the current position in file f"""
+    s = f.read(4)
+    s_array = numpy.fromstring(s, dtype='int32')
+    return s_array.item()
+
+def _read_header(f, debug=False):
+    """Parse the filetensor header at the current position in file f.
+
+    :returns: data type, element size, rank, shape, size
+    """
+    #what is the data type of this matrix?
+    #magic_s = f.read(4)
+    #magic = numpy.fromstring(magic_s, dtype='int32')
+    magic = _read_int32(f)
+    magic_t, elsize = _magic_dtype[magic]
+    if debug:
+        print 'header magic', magic, magic_t, elsize
+    if magic_t == 'packed matrix':
+        raise NotImplementedError('packed matrix not supported')
+
+    #what is the rank of the tensor?
+    ndim = _read_int32(f)
+    if debug: print 'header ndim', ndim
+
+    #what are the dimensions of the tensor?
+    #(at least 3 entries are read from the file; only the first ndim are kept)
+    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
+    dim_size = _prod(dim)
+    if debug: print 'header dim', dim, dim_size
+
+    return magic_t, elsize, ndim, dim, dim_size
+
+class arraylike(object):
+    """Provide an array-like interface to the filetensor in f.
+
+    The rank parameter to __init__ controls how this object interprets the underlying tensor.
+    Its behaviour should be clear from the following example.
+    Suppose the underlying tensor is MxNxK.
+
+    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.
+
+    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.
+
+    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1.
+
+    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.
+
+
+    :note: Objects of this class generally require exclusive use of the underlying file handle, because
+    they call seek() every time you access an element.
+    """
+
+    f = None
+    """File-like object"""
+
+    magic_t = None
+    """numpy data type of array"""
+
+    elsize = None
+    """number of bytes per scalar element"""
+
+    ndim = None
+    """Rank of underlying tensor"""
+
+    dim = None
+    """tuple of array dimensions (aka shape)"""
+
+    dim_size = None
+    """number of scalars in the tensor (prod of dim)"""
+
+    f_start = None
+    """The file position of the first element of the tensor"""
+
+    readshape = None
+    """tuple of array dimensions of the block that we read"""
+
+    returnshape = None
+    """shape of each self[i]: readshape, left-padded with 1s when rank > ndim"""
+
+    readsize = None
+    """number of elements we must read for each block"""
+
+    def __init__(self, f, rank=0, debug=False):
+        """
+        :param f: file positioned at the start of a filetensor header
+        :param rank: number of dimensions of each self[i] (see class docstring)
+        :param debug: if True, print the parsed header and read parameters
+        """
+        self.f = f
+        self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug)
+        self.f_start = f.tell()
+
+        # rank <= ndim: each block is the trailing `rank` dims of the tensor.
+        # rank > ndim: read the whole tensor and left-pad its shape with 1s.
+        self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim)
+        padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim)
+        self.returnshape = padding + self.readshape
+        self.readsize = _prod(self.readshape)
+        if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize
+
+    def __len__(self):
+        """number of blocks: product of the leading dims not covered by readshape"""
+        return _prod(self.dim[:self.ndim-len(self.readshape)])
+
+    def __getitem__(self, idx):
+        """Read block number idx from the file; a negative idx counts from the end.
+
+        :raises IndexError: if idx is out of range
+        """
+        n_blocks = len(self)
+        # resolve a negative idx first: used as-is it would make seek() land in
+        # the header or before the start of the file instead of in the data
+        pos = idx + n_blocks if idx < 0 else idx
+        if not 0 <= pos < n_blocks:
+            raise IndexError(idx)
+        self.f.seek(self.f_start + pos * self.elsize * self.readsize)
+        return numpy.fromfile(self.f,
+                dtype=self.magic_t,
+                count=self.readsize).reshape(self.returnshape)
+
+
 #
 # TODO: implement item selection:
 #  e.g. load('some mat', subtensor=(:6, 2:5))
@@ -70,29 +192,7 @@
     particular type of subtensor is supported.
 
     """
-    def _read_int32(f):
-        s = f.read(4)
-        s_array = numpy.fromstring(s, dtype='int32')
-        return s_array.item()
-
-    #what is the data type of this matrix?
-    #magic_s = f.read(4)
-    #magic = numpy.fromstring(magic_s, dtype='int32')
-    magic = _read_int32(f)
-    magic_t, elsize = _magic_dtype[magic]
-    if debug:
-        print 'header magic', magic, magic_t, elsize
-    if magic_t == 'packed matrix':
-        raise NotImplementedError('packed matrix not supported')
-
-    #what is the rank of the tensor?
-    ndim = _read_int32(f)
-    if debug: print 'header ndim', ndim
-
-    #what are the dimensions of the tensor?
-    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
-    dim_size = _prod(dim)
-    if debug: print 'header dim', dim, dim_size
+    magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug)
 
     rval = None
     if subtensor is None:
diff -r 5ca1a8e859db -r ee7026de9681 pylearn/io/tests/test_filetensor.py
--- a/pylearn/io/tests/test_filetensor.py	Thu May 14 17:00:22 2009 -0400
+++ b/pylearn/io/tests/test_filetensor.py	Thu May 14 18:25:10 2009 -0400
@@ -3,21 +3,17 @@
 
 import numpy
 import unittest
-import os
+import os, tempfile
 
 class T(unittest.TestCase):
-    fname = '/tmp/some_mat'
-
     def setUp(self):
-        #TODO: test that /tmp/some_mat does not exist
-        try:
-            os.stat(self.fname)
-        except OSError:
-            return #assume file was not found
-        raise Exception('autotest file "%s" exists!' % self.fname)
+        self.fname = tempfile.mktemp()
 
     def tearDown(self):
-        os.remove(self.fname)
+        try:
+            os.remove(self.fname)
+        except OSError:
+            pass
 
     def test_file(self):
         gen = numpy.random.rand(1)
@@ -79,6 +75,34 @@
             passed = True
         f.close()
         self.failUnless(passed)
+
+    def test_arraylike(self):
+        fullshape = [10, 4, 3, 1, 8]
+        for ndim in range(len(fullshape)):
+            for rank in range(ndim+1):
+                print 'ndim, rank', ndim, rank
+                tt = numpy.asarray(numpy.random.RandomState(55).randn(*fullshape[:ndim]))
+                f = file(self.fname, 'wb')
+                filetensor.write(f, tt)
+                f.close()
+                a = filetensor.arraylike(open(self.fname, 'rb'), rank=rank, debug=True)
+                print 'len a', len(a)
+                list_a = list(a)
+                assert len(a) == len(list_a)
+
+                #WARNING: assuming that the readshape is correct
+                tt_flat = tt.reshape( (tt.size/filetensor._prod(a.readshape),) + a.readshape)
+
+                assert len(a) == len(tt_flat)
+                assert len(a) > 0
+                for a_i, t_i in zip(a, tt_flat):
+                    assert a_i.shape == t_i.shape
+                    assert a_i.dtype == t_i.dtype
+                    assert numpy.all(a_i == t_i)
+
+
+
+
 
 
 if __name__ == '__main__':