view pylearn/dataset_ops/tinyimages.py @ 1420:7374d676c9b0

dataset_ops/tinyimages - added a tinyimages_op that gives access to the full dataset, not just patches.
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 04 Feb 2011 16:06:00 -0500
parents cc3e3e596500
children 93e5ce7ccd6d
line wrap: on
line source

"""I'm not sure where to put this code.

THIS IS NOT POLISHED LIBRARY CODE YET.

"""

__authors__ = "James Bergstra"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "bergstrj@iro.umontreal.ca"


import cPickle, logging, sys
import numpy
from pylearn.datasets import tinyimages, image_patches
import pylearn.preprocessing.pca
import theano
from pylearn.io import image_tiling

from .protocol import TensorFnDataset # protocol.py __init__.py
from .memo import memo


#
# This part of the file (until main()) is for generating a dataset of image patches from the
# tinyimages dataset.  These patches are used in the pretraining stage of the mcRBM training
# algorithm.
#
# Since the 'dataset' is properly seen as a cached-to-disk preprocessing derived from raw
# material in tinyimages, it is not a real dataset (with a standard disk location in the
# PYLEARN_DATA_ROOT root).
#
# Hopefully the upcoming pylearn library proposal will have a policy on how/where this sort of
# pre-processed data should be stored.  For now it is stored in the current working directory.
#

def tinyimages_op(s_idx):
    """Return symbolic tiny_images[s_idx]

    If s_idx is a scalar, the return value is a tensor3 of shape 32,32,3 and
    dtype uint8.
    If s_idx is a vector of len N, the return value
    is a tensor4 of shape N,32,32,3 and dtype uint8.
    """
    op = TensorFnDataset('uint8',
            bcast=(False, False, False),
            fn=tinyimages.get_memmapped_file,
            single_shape=(32,32,3))
    return op(s_idx)


_raw_patch_file = 'tinydataset_raw.npy'
_pca_file       = 'tinydataset_pca.pkl'
_whitened_file  = 'tinydataset_whitened.npy'

def normalize_channels(X, max_scale=5):
    """Map images from (0,1) to all reals so that each channel of each image has zero mean,
    [maximum] unit variance.  

    Channels will not be scaled by more than max_scale, so the output variance might be smaller
    than 1.
    """
    n_imgs,n_rows,n_cols,n_channels = X.shape
    X = X.copy()
    # ensure that we're working with floats on (0,1)
    if not  str(X.dtype).startswith('float'):
        raise TypeError()
    if X.min() < 0:
        raise ValueError('min out of bounds')
    if X.max() > 1:
        raise ValueError('max out of bounds')
    assert n_channels==3
    imaxscale = 1.0 / max_scale
    def centre(imgstack):
        a,b,c = imgstack.shape
        flat = imgstack.reshape((a,b*c))
        flat -= flat.mean(axis=1).reshape((a,1))
        flat /= numpy.maximum(flat.std(axis=1).reshape((a,1)),imaxscale)
        return flat.reshape((a,b,c))
    X[:,:,:,0]=centre(X[:,:,:,0])
    X[:,:,:,1]=centre(X[:,:,:,1])
    X[:,:,:,2]=centre(X[:,:,:,2])
    return X

def save_filters_orig(X, fname, min_dynamic_range=1e-8, data_path=None, img_shape=(8,8),
        tile_shape=None):
    """
    Save filters X (encoded as whitened images) in the original image space.
    """
    dct = load_pca_dct()
    pca = dct['eig_vals'], dct['eig_vecs']

    _img = image_tiling.tile_raster_images(
            pylearn.preprocessing.pca.pca_whiten_inverse(pca, X),
            img_shape=img_shape,
            min_dynamic_range=1e-6,
            tile_shape=tile_shape)
    image_tiling.save_tiled_raster_images(_img, fname)

def extract_patches(n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8), rng=numpy.random.RandomState(234)):
    """
    Extract a number of patches from each of the first TinyImages
    """
    R,C=patch_shape

    dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8')

    assert n_imgs < tinyimages.n_images

    image_stream = tinyimages.image_generator()

    i = 0
    while i < n_imgs:
        y = image_stream.next()
        yy = image_patches.extract_random_patches(
                y.reshape((1,32,32,3)),
                n_patches_per_image, 
                R,C,
                rng)
        ii = i*n_patches_per_image
        dataset[ii:ii+n_patches_per_image] = yy
        i += 1
    return dataset

def compute_pca_dct(X, use_only=100000, max_components=128, max_energy_fraction=.99):

    # each image channel is adjusted here
    ### X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255)


    # rasterize images
    X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3]))

    # switch to floats
    X = X.astype('float32')

    # subtract off each image mean (ignoring channels) #TODO: IS THIS GOOD IDEA?
    X = X - X.mean(axis=1).reshape((X.shape[0], 1))

    # subtract off global mean as part of pca
    data_mean = X.mean(axis=0)
    X = X - data_mean

    # calculating pca
    (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples(
            X, max_components, max_energy_fraction, x_centered=True)

    print "Keeping %i principle components" % len(eig_vals)

    return dict(
            mean=data_mean,
            eig_vecs=eig_vecs,
            eig_vals=eig_vals)

def whiten_patches(raw_patches, pca_dct):
    """
    Load the patches from sys.argv[1] and whiten them with sys.argv[2], saving them to
    sys.argv[3].
    """

    rval = numpy.empty((raw_patches.shape[0], len(pca_dct['eig_vals'])), dtype='float32')

    print 'allocated output of size', rval.shape

    b = 100 #batchsize
    i = 0
    while i < len(rval):
        xi = numpy.asarray(raw_patches[i:i+b], dtype='float32')
        # rasterize
        xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3]))
        # remove image mean
        xi = xi - xi.mean(axis=1).reshape((xi.shape[0], 1))
        # remove pixel means
        xi -= pca_dct['mean']
        rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((pca_dct['eig_vals'], pca_dct['eig_vecs']), xi)
        i += b
    return rval

def main(n_imgs=1000, n_patches_per_image=10, max_components=128, seed=234, patch_shape=(8,8)):
    if 0: #do this to render the dataset to the screen
        sys.exit(glviewer())

    rng = numpy.random.RandomState(seed)

    try:
        open(_raw_patch_file).close() #fails if file not present
        load_raw_patches = True
    except:
        load_raw_patches = False

    if load_raw_patches:
        print 'loading raw patches from', _raw_patch_file
        raw_patches = numpy.load(_raw_patch_file, mmap_mode='r')
    else:
        print 'extracting raw patches'
        raw_patches = extract_patches(rng=rng, n_imgs=n_imgs,
                n_patches_per_image=n_patches_per_image, patch_shape=patch_shape)
        rng.shuffle(raw_patches)
        print 'saving raw patches to', _raw_patch_file
        numpy.save(open(_raw_patch_file, 'wb'), raw_patches)

    try:
        open(_pca_file).close()
        load_pca = True
    except:
        load_pca = False

    if load_pca:
        print 'loading pca from', _pca_file
        pca_dct = cPickle.load(open(_pca_file))
    else:
        print 'computing pca'
        pca_dct = compute_pca_dct(raw_patches, max_components=max_components)
        print 'saving pca to', _pca_file
        cPickle.dump(pca_dct, open(_pca_file, 'wb'))

    try:
        open(_whitened_file).close()
        load_patches = True
    except:
        load_patches = False

    if load_patches:
        print 'loading whitened patches from', _whitened_file
        whitened_patches = numpy.load(_whitened_file, mmap_mode='r')
    else:
        print 'computing whitened data'
        whitened_patches = whiten_patches(raw_patches, pca_dct)
        print 'saving', _whitened_file
        numpy.save(_whitened_file, whitened_patches)

    return whitened_patches, pca_dct

#
# This part of the file defines an op-constructor that uses the pre-processed cache / dataset generated
#

@memo
def load_whitened(path=_whitened_file):
    """
    Replacement for dataset_ops.image_patches.ranzato_hinton_2010_op
    """
    try:
        return numpy.load(path, mmap_mode='r')
    except:
        print >> sys.stderr, "Maybe you need to run 'python pylearn.dataset_ops.tinyimages'?"
        raise

@memo
def load_pca_dct(path=_pca_file):
    return cPickle.load(open(path))

def tinydataset_op(s_idx,
        split='train', 
        fn=load_whitened):

    n_examples,n_dim = fn().shape 

    if split != 'train':
        raise NotImplementedError('train/test/valid splits for randomly sampled image patches?')

    op = TensorFnDataset('float32', bcast=(False,), fn=fn, single_shape=(n_dim,))
    x = op(s_idx%n_examples)
    return x


def save_filters(X, fname, tile_shape=None, img_shape=(8,8)):
    dct = load_pca_dct()
    eigs = dct['eig_vals'], dct['eig_vecs']
    mean = dct['mean']
    rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
    _img = image_tiling.tile_raster_images(
            (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
            img_shape=img_shape,
            min_dynamic_range=1e-6,
            tile_shape=tile_shape)
    image_tiling.save_tiled_raster_images(_img, fname)

def glviewer(split='train'):
    from glviewer import GlViewer
    #i = theano.tensor.iscalar()
    #f = theano.function([i], mnist(i, split, dtype='uint8', rasterized=False)[0])
    data = numpy.load(_raw_patch_file, mmap_mode='r')
    print 'RAW', data.shape
    data = numpy.load(_whitened_file, mmap_mode='r')
    print 'WHI', data.shape

    if 1: # check the raw data
        data = numpy.load(_raw_patch_file, mmap_mode='r')
        data = data.reshape((data.shape[0], data.shape[1]*data.shape[2], data.shape[3]))
        def f(i):
            j = i*5000
            jj = j + 5000
            return image_tiling.tile_raster_images(
                    (data[j:jj,:,0], data[j:jj,:,1], data[j:jj,:,2], None),
                    img_shape=(8,8))
    if 0: # check the whitened data
        dct = load_pca_dct()
        eigs = dct['eig_vals'], dct['eig_vecs']
        mean = dct['mean']
        data = numpy.load(_whitened_file, mmap_mode='r')
        def f(i):
            j = i*5000
            jj = j + 5000
            X = data[j:jj]
            print 'j', j, jj
            rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
            _img = image_tiling.tile_raster_images(
                    (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
                    img_shape=(8,8),
                    min_dynamic_range=1e-6)
            return _img
    GlViewer(f).main()



if __name__=='__main__':
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    sys.exit(main())