# HG changeset patch
# User Arnaud Bergeron
# Date 1264631737 18000
# Node ID faacc76d21c2c37f141320ff4e7cdcb8316ab06a
# Parent 64dac4aabc04c3e35e8890660c05016c64872cf7
Basic new pipeline script for the image transforms (includes a bonus patched
filetensor module from pylearn.io).

diff -r 64dac4aabc04 -r faacc76d21c2 transformations/filetensor.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transformations/filetensor.py	Wed Jan 27 17:35:37 2010 -0500
@@ -0,0 +1,232 @@
+"""
+Read and write the matrix file format described at
+U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}
+
+The format is for dense tensors:
+
+  - magic number indicating type and endianness - 4 bytes
+  - rank of tensor - int32
+  - dimensions - int32, int32, int32, ...
+  - the tensor data itself
+
+The relationship between the rank and the number of dimensions is slightly
+tricky:
+  - for scalar: rank=0, dimensions = [1, 1, 1]
+  - for vector: rank=1, dimensions = [?, 1, 1]
+  - for matrix: rank=2, dimensions = [?, ?, 1]
+
+For rank >= 3, the number of dimensions matches the rank exactly.
+
+@todo: add complex type support
+
+"""
+import sys
+import numpy
+
+def _prod(lst):
+    p = 1
+    for l in lst:
+        p *= l
+    return p
+
+_magic_dtype = {
+        0x1E3D4C51 : ('float32', 4),
+        #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix?
+        0x1E3D4C53 : ('float64', 8),
+        0x1E3D4C54 : ('int32', 4),
+        0x1E3D4C55 : ('uint8', 1),
+        0x1E3D4C56 : ('int16', 2),
+        }
+_dtype_magic = {
+        'float32': 0x1E3D4C51,
+        #'packed matrix': 0x1E3D4C52,
+        'float64': 0x1E3D4C53,
+        'int32': 0x1E3D4C54,
+        'uint8': 0x1E3D4C55,
+        'int16': 0x1E3D4C56
+        }
+
+def _read_int32(f):
+    """unpack a 4-byte integer from the current position in file f"""
+    s = f.read(4)
+    s_array = numpy.fromstring(s, dtype='int32')
+    return s_array.item()
+
+def _read_header(f, debug=False):
+    """
+    :returns: data type, element size, rank, shape, size
+    """
+    #what is the data type of this matrix?
+    magic = _read_int32(f)
+    magic_t, elsize = _magic_dtype[magic]
+    if debug:
+        print 'header magic', magic, magic_t, elsize
+    if magic_t == 'packed matrix':
+        raise NotImplementedError('packed matrix not supported')
+
+    #what is the rank of the tensor?
+    ndim = _read_int32(f)
+    if debug: print 'header ndim', ndim
+
+    #what are the dimensions of the tensor?
+    #the header always stores at least 3 dimensions, padded with 1s
+    dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
+    dim_size = _prod(dim)
+    if debug: print 'header dim', dim, dim_size
+
+    return magic_t, elsize, ndim, dim, dim_size
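+
+# Worked example of the on-disk layout (illustrative, not part of the
+# original module): for a 2x3 float32 matrix, _read_header() parses five
+# int32 words and leaves the file positioned at the data:
+#
+#   0x1E3D4C51    magic number -> ('float32', 4)
+#   2             rank
+#   2, 3, 1       dimensions (padded to 3 entries because rank < 3)
+#
+# after which come 2*3 = 6 float32 values in row-major (C) order, as
+# produced by numpy.ndarray.tofile() in write() below.
+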
+ """ + + f = None + """File-like object""" + + magic_t = None + """numpy data type of array""" + + elsize = None + """number of bytes per scalar element""" + + ndim = None + """Rank of underlying tensor""" + + dim = None + """tuple of array dimensions (aka shape)""" + + dim_size = None + """number of scalars in the tensor (prod of dim)""" + + f_start = None + """The file position of the first element of the tensor""" + + readshape = None + """tuple of array dimensions of the block that we read""" + + readsize = None + """number of elements we must read for each block""" + + def __init__(self, f, rank=0, debug=False): + self.f = f + self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug) + self.f_start = f.tell() + + if rank <= self.ndim: + self.readshape = tuple(self.dim[self.ndim-rank:]) + else: + self.readshape = tuple(self.dim) + + #self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim) + + if rank <= self.ndim: + padding = tuple() + else: + padding = (1,) * (rank - self.ndim) + + #padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim) + self.returnshape = padding + self.readshape + self.readsize = _prod(self.readshape) + if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize + + def __len__(self): + return _prod(self.dim[:self.ndim-len(self.readshape)]) + + def __getitem__(self, idx): + if idx >= len(self): + raise IndexError(idx) + self.f.seek(self.f_start + idx * self.elsize * self.readsize) + return numpy.fromfile(self.f, + dtype=self.magic_t, + count=self.readsize).reshape(self.returnshape) + + +# +# TODO: implement item selection: +# e.g. load('some mat', subtensor=(:6, 2:5)) +# +# This function should be memory efficient by: +# - allocating an output matrix at the beginning +# - seeking through the file, reading subtensors from multiple places +def read(f, subtensor=None, debug=False): + """Load all or part of file 'f' into a numpy ndarray + + @param f: file from which to read + @type f: file-like object + + If subtensor is not None, it should be like the argument to + numpy.ndarray.__getitem__. The following two expressions should return + equivalent ndarray objects, but the one on the left may be faster and more + memory efficient if the underlying file f is big. + + read(f, subtensor) <===> read(f)[*subtensor] + + Support for subtensors is currently spotty, so check the code to see if your + particular type of subtensor is supported. + + """ + magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug) + f_start = f.tell() + + rval = None + if subtensor is None: + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + elif isinstance(subtensor, slice): + if subtensor.step not in (None, 1): + raise NotImplementedError('slice with step', subtensor.step) + if subtensor.start not in (None, 0): + bytes_per_row = _prod(dim[1:]) * elsize + f.seek(f_start + subtensor.start * bytes_per_row) + dim[0] = min(dim[0], subtensor.stop) - subtensor.start + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + else: + raise NotImplementedError('subtensor access not written yet:', subtensor) + + return rval + +def write(f, mat): + """Write a numpy.ndarray to file. 
+def write(f, mat):
+    """Write a numpy.ndarray to file.
+
+    @param f: file into which to write
+    @type f: file-like object
+
+    @param mat: array to write to file
+    @type mat: numpy ndarray or compatible
+    """
+    def _write_int32(f, i):
+        i_array = numpy.asarray(i, dtype='int32')
+        if 0: print 'writing int32', i, i_array
+        i_array.tofile(f)
+
+    try:
+        _write_int32(f, _dtype_magic[str(mat.dtype)])
+    except KeyError:
+        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)
+
+    _write_int32(f, len(mat.shape))
+    shape = mat.shape
+    if len(shape) < 3:
+        #the format stores at least 3 dimensions; pad with 1s
+        shape = list(shape) + [1] * (3 - len(shape))
+    if 0: print 'writing shape =', shape
+    for sh in shape:
+        _write_int32(f, sh)
+    mat.tofile(f)
+
diff -r 64dac4aabc04 -r faacc76d21c2 transformations/pipeline.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transformations/pipeline.py	Wed Jan 27 17:35:37 2010 -0500
@@ -0,0 +1,51 @@
+from __future__ import with_statement
+
+import sys, os
+import numpy
+import filetensor as ft
+import random
+
+BATCH_SIZE = 100
+
+#import the transformation modules here and put instances in mods below
+
+mods = []
+
+# DANGER: HIGH VOLTAGE -- DO NOT EDIT BELOW THIS LINE
+# -----------------------------------------------------------
+
+train_data = open('/data/lisa/data/nist/by_class/all/all_train_data.ft', 'rb')
+
+dim = tuple(ft._read_header(train_data)[3])
+
+res_data = numpy.empty(dim)
+
+all_settings = ['complexity']
+
+for mod in mods:
+    all_settings += mod.get_settings_names()
+
+#one row of parameters per batch (round up for a partial last batch)
+params = numpy.empty(((dim[0] + BATCH_SIZE - 1) / BATCH_SIZE, len(all_settings)))
+
+for i in xrange(0, dim[0], BATCH_SIZE):
+    train_data.seek(0)   #rewind so ft.read() parses the header again
+    imgs = ft.read(train_data, slice(i, i + BATCH_SIZE))
+
+    complexity = random.random()
+    p = i / BATCH_SIZE
+    params[p, 0] = complexity
+    j = 1
+    for mod in mods:
+        par = mod.regenerate_parameters(complexity)
+        params[p, j:j + len(par)] = par
+        j += len(par)
+
+    for k in range(imgs.shape[0]):
+        c = imgs[k]
+        for mod in mods:
+            c = mod.transform_image(c)
+        res_data[i + k] = c
+
+with open(sys.argv[1], 'wb') as f:
+    ft.write(f, res_data)
+
+numpy.save(sys.argv[2], params)
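+
+# Usage sketch (illustrative; the output file names are hypothetical):
+#
+#   python pipeline.py transformed_data.ft transform_params.npy
+#
+# The first argument receives the transformed images in filetensor format,
+# the second the per-batch transformation parameters saved via numpy.save().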