Mercurial > ift6266
view data_generation/transformations/filetensor.py @ 613:5e481b224117
fix the reading of PNIST dataset following Dumi compression of the data.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Thu, 06 Jan 2011 13:57:05 -0500 |
parents | 1f5937e9e530 |
children |
line wrap: on
line source
""" Read and write the matrix file format described at U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} The format is for dense tensors: - magic number indicating type and endianness - 4bytes - rank of tensor - int32 - dimensions - int32, int32, int32, ... - <data> The number of dimensions and rank is slightly tricky: - for scalar: rank=0, dimensions = [1, 1, 1] - for vector: rank=1, dimensions = [?, 1, 1] - for matrix: rank=2, dimensions = [?, ?, 1] For rank >= 3, the number of dimensions matches the rank exactly. @todo: add complex type support """ import sys import numpy def _prod(lst): p = 1 for l in lst: p *= l return p _magic_dtype = { 0x1E3D4C51 : ('float32', 4), #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? 0x1E3D4C53 : ('float64', 8), 0x1E3D4C54 : ('int32', 4), 0x1E3D4C55 : ('uint8', 1), 0x1E3D4C56 : ('int16', 2), } _dtype_magic = { 'float32': 0x1E3D4C51, #'packed matrix': 0x1E3D4C52, 'float64': 0x1E3D4C53, 'int32': 0x1E3D4C54, 'uint8': 0x1E3D4C55, 'int16': 0x1E3D4C56 } def _read_int32(f): """unpack a 4-byte integer from the current position in file f""" s = f.read(4) s_array = numpy.fromstring(s, dtype='int32') return s_array.item() def _read_header(f, debug=False): """ :returns: data type, element size, rank, shape, size """ #what is the data type of this matrix? #magic_s = f.read(4) #magic = numpy.fromstring(magic_s, dtype='int32') magic = _read_int32(f) magic_t, elsize = _magic_dtype[magic] if debug: print 'header magic', magic, magic_t, elsize if magic_t == 'packed matrix': raise NotImplementedError('packed matrix not supported') #what is the rank of the tensor? ndim = _read_int32(f) if debug: print 'header ndim', ndim #what are the dimensions of the tensor? dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] dim_size = _prod(dim) if debug: print 'header dim', dim, dim_size return magic_t, elsize, ndim, dim, dim_size class arraylike(object): """Provide an array-like interface to the filetensor in f. The rank parameter to __init__ controls how this object interprets the underlying tensor. Its behaviour should be clear from the following example. Suppose the underlying tensor is MxNxK. - If rank is 0, self[i] will be a scalar and len(self) == M*N*K. - If rank is 1, self[i] is a vector of length K, and len(self) == M*N. - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1. - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1. :note: Objects of this class generally require exclusive use of the underlying file handle, because they call seek() every time you access an element. """ f = None """File-like object""" magic_t = None """numpy data type of array""" elsize = None """number of bytes per scalar element""" ndim = None """Rank of underlying tensor""" dim = None """tuple of array dimensions (aka shape)""" dim_size = None """number of scalars in the tensor (prod of dim)""" f_start = None """The file position of the first element of the tensor""" readshape = None """tuple of array dimensions of the block that we read""" readsize = None """number of elements we must read for each block""" def __init__(self, f, rank=0, debug=False): self.f = f self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug) self.f_start = f.tell() if rank <= self.ndim: self.readshape = tuple(self.dim[self.ndim-rank:]) else: self.readshape = tuple(self.dim) #self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim) if rank <= self.ndim: padding = tuple() else: padding = (1,) * (rank - self.ndim) #padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim) self.returnshape = padding + self.readshape self.readsize = _prod(self.readshape) if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize def __len__(self): return _prod(self.dim[:self.ndim-len(self.readshape)]) def __getitem__(self, idx): if idx >= len(self): raise IndexError(idx) self.f.seek(self.f_start + idx * self.elsize * self.readsize) return numpy.fromfile(self.f, dtype=self.magic_t, count=self.readsize).reshape(self.returnshape) # # TODO: implement item selection: # e.g. load('some mat', subtensor=(:6, 2:5)) # # This function should be memory efficient by: # - allocating an output matrix at the beginning # - seeking through the file, reading subtensors from multiple places def read(f, subtensor=None, debug=False): """Load all or part of file 'f' into a numpy ndarray @param f: file from which to read @type f: file-like object If subtensor is not None, it should be like the argument to numpy.ndarray.__getitem__. The following two expressions should return equivalent ndarray objects, but the one on the left may be faster and more memory efficient if the underlying file f is big. read(f, subtensor) <===> read(f)[*subtensor] Support for subtensors is currently spotty, so check the code to see if your particular type of subtensor is supported. """ magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug) f_start = f.tell() rval = None if subtensor is None: rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) elif isinstance(subtensor, slice): if subtensor.step not in (None, 1): raise NotImplementedError('slice with step', subtensor.step) if subtensor.start not in (None, 0): bytes_per_row = _prod(dim[1:]) * elsize f.seek(f_start + subtensor.start * bytes_per_row) dim[0] = min(dim[0], subtensor.stop) - subtensor.start rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) else: raise NotImplementedError('subtensor access not written yet:', subtensor) return rval def write(f, mat): """Write a numpy.ndarray to file. @param f: file into which to write @type f: file-like object @param mat: array to write to file @type mat: numpy ndarray or compatible """ def _write_int32(f, i): i_array = numpy.asarray(i, dtype='int32') if 0: print 'writing int32', i, i_array i_array.tofile(f) try: _write_int32(f, _dtype_magic[str(mat.dtype)]) except KeyError: raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) _write_int32(f, len(mat.shape)) shape = mat.shape if len(shape) < 3: shape = list(shape) + [1] * (3 - len(shape)) if 0: print 'writing shape =', shape for sh in shape: _write_int32(f, sh) mat.tofile(f)