changeset 693:ee7026de9681

added filetensor.arraylike
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 14 May 2009 18:25:10 -0400
parents 5ca1a8e859db
children 69947f4e9c0e 4c24b2023f32
files pylearn/io/filetensor.py pylearn/io/tests/test_filetensor.py
diffstat 2 files changed, 136 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/io/filetensor.py	Thu May 14 17:00:22 2009 -0400
+++ b/pylearn/io/filetensor.py	Thu May 14 18:25:10 2009 -0400
@@ -46,6 +46,107 @@
         'int16': 0x1E3D4C56
         }
 
+def _read_int32(f):
+    """Read 4 bytes from the current position in file f and return them as one int32 value."""
+    s = f.read(4)
+    s_array = numpy.fromstring(s, dtype='int32')
+    return s_array.item()  # .item() only succeeds when exactly one int32 was decoded
+
+def _read_header(f, debug=False):
+    """Parse the filetensor header that starts at the current position of file f.
+    :returns: magic_t, elsize, ndim, dim, dim_size (data type, element size, rank, shape, size)
+    :raises NotImplementedError: if the header announces a 'packed matrix'
+    """
+    #what is the data type of this matrix?
+    #(the leading int32 is a magic number that _magic_dtype maps to a type name and element size)
+    magic = _read_int32(f)
+    magic_t, elsize = _magic_dtype[magic]
+    if debug: 
+        print 'header magic', magic, magic_t, elsize
+    if magic_t == 'packed matrix':
+        raise NotImplementedError('packed matrix not supported')
+
+    #what is the rank of the tensor?
+    ndim = _read_int32(f)
+    if debug: print 'header ndim', ndim
+
+    #what are the dimensions of the tensor? (at least 3 int32s are always consumed; only the first ndim are kept)
+    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
+    dim_size = _prod(dim)
+    if debug: print 'header dim', dim, dim_size
+
+    return magic_t, elsize, ndim, dim, dim_size
+
+class arraylike(object):
+    """Provide an array-like interface to the filetensor in f.
+
+    The rank parameter to __init__ controls how this object interprets the underlying tensor.
+    Its behaviour should be clear from the following example.
+    Suppose the underlying tensor is MxNxK.
+
+    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.
+
+    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.
+
+    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1.
+
+    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.
+
+    Only indices 0 <= i < len(self) are valid; any other index raises IndexError.
+    :note: Objects of this class generally require exclusive use of the underlying file handle, because
+    they call seek() every time you access an element.
+    """
+
+    f = None
+    """File-like object"""
+
+    magic_t = None
+    """numpy data type of array"""
+
+    elsize = None
+    """number of bytes per scalar element"""
+
+    ndim = None
+    """Rank of underlying tensor"""
+
+    dim = None
+    """array of tensor dimensions (aka shape), as returned by _read_header"""
+
+    dim_size = None
+    """number of scalars in the tensor (prod of dim)"""
+
+    f_start = None
+    """The file position of the first element of the tensor"""
+
+    readshape = None
+    """tuple of array dimensions of the block that we read"""
+
+    readsize = None
+    """number of elements we must read for each block"""
+
+    def __init__(self, f, rank=0, debug=False):
+        self.f = f
+        self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug)
+        self.f_start = f.tell()  # the header has just been consumed, so the data starts here
+
+        self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim)
+        padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim)
+        self.returnshape = padding + self.readshape  # shape of self[i]: readshape left-padded with 1s
+        self.readsize = _prod(self.readshape)
+        if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize
+
+    def __len__(self):
+        return _prod(self.dim[:self.ndim-len(self.readshape)])  # product of the leading dims not in a block
+
+    def __getitem__(self, idx):
+        if not 0 <= idx < len(self):  # negatives must be rejected too: they would seek before the data
+            raise IndexError(idx)
+        self.f.seek(self.f_start + idx * self.elsize * self.readsize)
+        return numpy.fromfile(self.f,
+                dtype=self.magic_t,
+                count=self.readsize).reshape(self.returnshape)
+
+
 #
 # TODO: implement item selection:
 #  e.g. load('some mat', subtensor=(:6, 2:5))
@@ -70,29 +171,7 @@
     particular type of subtensor is supported.
 
     """
-    def _read_int32(f):
-        s = f.read(4)
-        s_array = numpy.fromstring(s, dtype='int32')
-        return s_array.item()
-
-    #what is the data type of this matrix?
-    #magic_s = f.read(4)
-    #magic = numpy.fromstring(magic_s, dtype='int32')
-    magic = _read_int32(f)
-    magic_t, elsize = _magic_dtype[magic]
-    if debug: 
-        print 'header magic', magic, magic_t, elsize
-    if magic_t == 'packed matrix':
-        raise NotImplementedError('packed matrix not supported')
-
-    #what is the rank of the tensor?
-    ndim = _read_int32(f)
-    if debug: print 'header ndim', ndim
-
-    #what are the dimensions of the tensor?
-    dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim]
-    dim_size = _prod(dim)
-    if debug: print 'header dim', dim, dim_size
+    magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug)
 
     rval = None
     if subtensor is None:
--- a/pylearn/io/tests/test_filetensor.py	Thu May 14 17:00:22 2009 -0400
+++ b/pylearn/io/tests/test_filetensor.py	Thu May 14 18:25:10 2009 -0400
@@ -3,21 +3,17 @@
 import numpy
 
 import unittest
-import os
+import os, tempfile
 
 class T(unittest.TestCase):
-    fname = '/tmp/some_mat'
-
     def setUp(self):
-        #TODO: test that /tmp/some_mat does not exist
-        try:
-            os.stat(self.fname)
-        except OSError:
-            return #assume file was not found
-        raise Exception('autotest file "%s" exists!' % self.fname)
+        self.fname = tempfile.mktemp()  # fresh path per test; mktemp only returns a name, it creates no file
 
     def tearDown(self):
-        os.remove(self.fname)
+        try:
+            os.remove(self.fname)  # best-effort cleanup of the per-test temporary file
+        except OSError:  # os.remove raises OSError (not IOError) when the test never created the file
+            pass
 
     def test_file(self):
         gen = numpy.random.rand(1)
@@ -79,6 +75,34 @@
                 passed = True
         f.close()
         self.failUnless(passed)
+
+    def test_arraylike(self):
+        """Write tensors of rank 0..4 and check arraylike yields the same blocks for every rank <= ndim."""
+        fullshape = [10, 4, 3, 1, 8]
+        for ndim in range(len(fullshape)):
+            for rank in range(ndim+1):
+                print 'ndim, rank', ndim, rank
+                tt = numpy.asarray(numpy.random.RandomState(55).randn(*fullshape[:ndim]))
+                f = open(self.fname, 'wb')  # filetensors are binary data: text mode may translate bytes
+                filetensor.write(f, tt)
+                f.close()
+                a = filetensor.arraylike(open(self.fname, 'rb'), rank=rank, debug=True)
+                print 'len a', len(a)
+                list_a = list(a)
+                assert len(a) == len(list_a)
+
+                #WARNING: assuming that the readshape is correct
+                tt_flat = tt.reshape( (tt.size/filetensor._prod(a.readshape),) + a.readshape)
+                assert len(a) == len(tt_flat)
+                assert len(a) > 0
+                for a_i, t_i in zip(a, tt_flat):
+                    assert a_i.shape == t_i.shape
+                    assert a_i.dtype == t_i.dtype
+                    assert numpy.all(a_i == t_i)
+
+
+
+
         
 
 if __name__ == '__main__':