changeset 1285:976539956475

adding tinyimages
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 23 Sep 2010 19:12:08 -0400
parents 1817485d586d
children 8905186b176c
files pylearn/dataset_ops/image_patches.py pylearn/dataset_ops/tinyimages.py pylearn/datasets/config.py pylearn/datasets/tinyimages.py
diffstat 4 files changed, 313 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/dataset_ops/image_patches.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/dataset_ops/image_patches.py	Thu Sep 23 19:12:08 2010 -0400
@@ -30,7 +30,8 @@
 def image_patches(s_idx, dims,
         split='train', dtype=theano.config.floatX, rasterized=False,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=get_dataset):
     N,R,C=dims
 
     if split != 'train':
@@ -39,7 +40,7 @@
     if not rasterized:
         raise NotImplementedError()
 
-    op = TensorFnDataset(dtype, bcast=(False,), fn=(get_dataset, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
+    op = TensorFnDataset(dtype, bcast=(False,), fn=(fn, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
     x = op(s_idx%N)
     if x.ndim == 1:
         if not rasterized:
@@ -87,7 +88,8 @@
         split='train', 
         dtype=theano.config.floatX, rasterized=True,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=ranzato_hinton_2010_whitened_patches):
     N = 10240
 
     if split != 'train':
@@ -104,7 +106,7 @@
 
     op = TensorFnDataset(dtype,
             bcast=(False,), 
-            fn=ranzato_hinton_2010_whitened_patches,
+            fn=fn,
             single_shape=(105,))
     x = op(s_idx%N)
     return x
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/tinyimages.py	Thu Sep 23 19:12:08 2010 -0400
@@ -0,0 +1,273 @@
+"""I'm not sure where to put this code."""
+
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "bergstrj@iro.umontreal.ca"
+
+
+import cPickle, logging, sys
+import numpy
+from pylearn.datasets import tinyimages, image_patches
+import pylearn.preprocessing.pca
+import theano
+from pylearn.io import image_tiling
+
+from .protocol import TensorFnDataset # protocol.py __init__.py
+from .memo import memo
+
+
+#
+# This part of the file (until main()) is for generating a dataset of image patches from the
+# tinyimages dataset.  These patches are used in the pretraining stage of the mcRBM training
+# algorithm.
+#
+# Since this 'dataset' is really a cached-to-disk preprocessing derived from the raw
+# material in tinyimages, it is not treated as a real dataset (one with a standard disk
+# location under PYLEARN_DATA_ROOT).
+#
+# Hopefully the upcoming pylearn library proposal will have a policy on how/where this sort of
+# pre-processed data should be stored.  For now it is stored in the current working directory.
+#
+
+_raw_patch_file = 'tinydataset_raw.npy'
+_pca_file       = 'tinydataset_pca.pkl'
+_whitened_file  = 'tinydataset_whitened.npy'
+
+def normalize_channels(X, max_scale=5):
+    """Map images from (0,1) to all reals so that each channel of each image has zero mean,
+    and at most unit variance.
+
+    Channels will not be scaled by more than max_scale, so the output variance might be smaller
+    than 1.
+    """
+    n_imgs,n_rows,n_cols,n_channels = X.shape
+    X = X.copy()
+    # ensure that we're working with floats on (0,1)
+    if not  str(X.dtype).startswith('float'):
+        raise TypeError()
+    if X.min() < 0:
+        raise ValueError('min out of bounds')
+    if X.max() > 1:
+        raise ValueError('max out of bounds')
+    assert n_channels==3
+    imaxscale = 1.0 / max_scale
+    def centre(imgstack):
+        a,b,c = imgstack.shape
+        flat = imgstack.reshape((a,b*c))
+        flat -= flat.mean(axis=1).reshape((a,1))
+        flat /= numpy.maximum(flat.std(axis=1).reshape((a,1)),imaxscale)
+        # SHOULD CHANNEL VARIANCE BE STANDARDIZED?
+        #imgstack modified inplace
+    centre(X[:,:,:,0])
+    centre(X[:,:,:,1])
+    centre(X[:,:,:,2])
+    return X
+
+def save_filters(X, fname, min_dynamic_range=1e-3, data_path=None):
+    """
+    Save filters X (encoded as whitened images) in the original image space.
+    """
+    dct = load_pca_dct()
+    pca = dct['eig_vals'], dct['eig_vecs']
+
+    _img = image_tiling.tile_raster_images(
+            pylearn.preprocessing.pca.pca_whiten_inverse(pca, X),
+            img_shape=(8,8),
+            min_dynamic_range=1e-6)
+    image_tiling.save_tiled_raster_images(_img, fname)
+
+def extract_save_patches(path=_raw_patch_file, n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8)):
+    """
+    Generate the patch dataset and save it to `path` (default: the file named by _raw_patch_file).
+    """
+    R,C=patch_shape
+
+    dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8')
+    savefile = open(path, 'wb')
+
+    assert n_imgs < tinyimages.n_images
+
+    image_stream = tinyimages.image_generator()
+    rng = numpy.random.RandomState(234)
+
+    i = 0
+    while i < n_imgs:
+        y = image_stream.next()
+        yy = image_patches.extract_random_patches(
+                y.reshape((1,32,32,3)),
+                n_patches_per_image, 
+                R,C,
+                rng)
+        ii = i*n_patches_per_image
+        dataset[ii:ii+n_patches_per_image] = yy
+        i += 1
+
+    print 'saving'
+    numpy.save(savefile,dataset)
+
+def compute_save_pca(raw_path=_raw_patch_file, pca_path=_pca_file, use_only=100000, max_components=128, max_energy_fraction=.99):
+    """
+    Memmap the data file named by `raw_path` and save the pca to `pca_path`.
+    """
+
+    data = numpy.load(raw_path, mmap_mode='r')
+    ofile = open(pca_path, 'wb')
+
+    # each image channel is adjusted here
+    X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255)
+
+    # rasterize images
+    X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3]))
+
+    # subtract off global mean as part of pca
+    data_mean = X.mean(axis=0)
+    X = X - data_mean
+
+    # calculating pca
+    (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples(
+            X, max_components, max_energy_fraction, x_centered=True)
+    cPickle.dump(
+            dict(
+                mean=data_mean,
+                eig_vecs=eig_vecs,
+                eig_vals=eig_vals),
+            ofile)
+    ofile.close()
+
+def whiten_patches(raw_path=_raw_patch_file, pca_path=_pca_file, whitened_file=_whitened_file):
+    """
+    Load the patches from `raw_path` and whiten them with the pca stored in `pca_path`, saving
+    the result to `whitened_file`.
+    """
+    data = numpy.load(raw_path, mmap_mode='r')
+    dct = cPickle.load(open(pca_path))
+
+    rval = numpy.empty((data.shape[0], len(dct['eig_vals'])), dtype='float32')
+
+    print 'allocated output of size', rval.shape
+
+    b = 100
+    i = 0
+    while i < len(rval):
+        # each image channel is adjusted here
+        xi = normalize_channels(numpy.asarray(data[i:i+b], dtype='float32')/255)
+        #rasterize images
+        xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3]))
+        xi -= dct['mean']
+        rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((dct['eig_vals'], dct['eig_vecs']), xi)
+        i += b
+    print 'saving', whitened_file
+    numpy.save(whitened_file, rval)
+
+def main():
+    if 0: #do this to render the dataset to the screen
+        sys.exit(glviewer())
+    try:
+        open(_raw_patch_file).close()
+        assert 0 # force recomputation
+    except:
+        print 'saving patches'
+        extract_save_patches()
+
+    try:
+        open(_pca_file).close()
+        assert 0 # force recomputation
+    except:
+        print 'computing pca'
+        compute_save_pca()
+
+    try:
+        open(_whitened_file).close()
+        assert 0 # force recomputation
+    except:
+        print 'computing whitened data'
+        whiten_patches()
+
+#
+# This part of the file defines an op-constructor that uses the pre-processed dataset cache generated above.
+#
+
+@memo
+def load_whitened(path=_whitened_file):
+    """
+    Replacement for dataset_ops.image_patches.ranzato_hinton_2010_op
+    """
+    try:
+        return numpy.load(path, mmap_mode='r')
+    except:
+        print >> sys.stderr, "Maybe you need to run 'python pylearn.dataset_ops.tinyimages'?"
+        raise
+
+@memo
+def load_pca_dct(path=_pca_file):
+    return cPickle.load(open(path))
+
+def tinydataset_op(s_idx,
+        split='train', 
+        fn=load_whitened):
+
+    n_examples,n_dim = fn().shape 
+
+    if split != 'train':
+        raise NotImplementedError('train/test/valid splits for randomly sampled image patches?')
+
+    op = TensorFnDataset('float32', bcast=(False,), fn=fn, single_shape=(n_dim,))
+    x = op(s_idx%n_examples)
+    return x
+
+
+def save_filters(X, fname):
+    dct = load_pca_dct()
+    eigs = dct['eig_vals'], dct['eig_vecs']
+    mean = dct['mean']
+    rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
+    _img = image_tiling.tile_raster_images(
+            (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
+            img_shape=(8,8),
+            min_dynamic_range=1e-6)
+    image_tiling.save_tiled_raster_images(_img, fname)
+
+def glviewer(split='train'):
+    from glviewer import GlViewer
+    #i = theano.tensor.iscalar()
+    #f = theano.function([i], mnist(i, split, dtype='uint8', rasterized=False)[0])
+    data = numpy.load(_raw_patch_file, mmap_mode='r')
+    print 'RAW', data.shape
+    data = numpy.load(_whitened_file, mmap_mode='r')
+    print 'WHI', data.shape
+
+    if 1: # check the raw data
+        data = numpy.load(_raw_patch_file, mmap_mode='r')
+        data = data.reshape((data.shape[0], data.shape[1]*data.shape[2], data.shape[3]))
+        def f(i):
+            j = i*5000
+            jj = j + 5000
+            return image_tiling.tile_raster_images(
+                    (data[j:jj,:,0], data[j:jj,:,1], data[j:jj,:,2], None),
+                    img_shape=(8,8))
+    if 0: # check the whitened data
+        dct = load_pca_dct()
+        eigs = dct['eig_vals'], dct['eig_vecs']
+        mean = dct['mean']
+        data = numpy.load(_whitened_file, mmap_mode='r')
+        def f(i):
+            j = i*5000
+            jj = j + 5000
+            X = data[j:jj]
+            print 'j', j, jj
+            rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
+            _img = image_tiling.tile_raster_images(
+                    (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
+                    img_shape=(8,8),
+                    min_dynamic_range=1e-6)
+            return _img
+    GlViewer(f).main()
+
+
+
+if __name__=='__main__':
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+    sys.exit(main())
+
+
--- a/pylearn/datasets/config.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/datasets/config.py	Thu Sep 23 19:12:08 2010 -0400
@@ -5,12 +5,12 @@
 """
 
 import os, sys, logging
-_logger = logging.getLogger('pylearn.datasets.config')
-def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
-def info(*msg): _logger.info(' '.join(str(m) for m in msg))
-def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
-def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
-def error(*msg): _logger.error(' '.join(str(m) for m in msg))
+def _logger():  return logging.getLogger('pylearn.datasets.config')
+def debug(*msg): _logger().debug(' '.join(str(m) for m in msg))
+def info(*msg): _logger().info(' '.join(str(m) for m in msg))
+def warn(*msg): _logger().warn(' '.join(str(m) for m in msg))
+def warning(*msg): _logger().warning(' '.join(str(m) for m in msg))
+def error(*msg): _logger().error(' '.join(str(m) for m in msg))
 
 
 def env_get(key, default, key2 = None):
--- a/pylearn/datasets/tinyimages.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/datasets/tinyimages.py	Thu Sep 23 19:12:08 2010 -0400
@@ -5,10 +5,12 @@
 __license__ = "3-clause BSD License"
 __contact__ = "bergstrj@iro.umontreal.ca"
 
-import os, sys
+import logging, os, sys
 import PIL.Image
 import numpy
 
+logger = logging.getLogger('pylearn.datasets.tinyimages')
+
 def sorted_listdir(*path):
     r = os.listdir(os.path.join(*path))
     r.sort()
@@ -27,8 +29,7 @@
                 yield path, letter, label, img
 
 def load_image(path):
-    """
-    """
+    """Return the image at `path` as a numpy ndarray """
     rval = numpy.asarray(PIL.Image.open(path))
     return rval
 
@@ -40,10 +41,17 @@
     Be careful with this generator because the dataset in total is close to
     20GB!
     """
+    n_colour_conversions = 0
+    n_yielded = 0
     for p in iterate_over_filenames(path=_original):
-        y = load_image(*p)
-        assert y.shape == (32,32,3)
-        assert y.dtype == numpy.uint8
+        y = load_image(os.path.join(*p))
+        n_yielded += 1
+        if y.shape == (32,32):
+            logger.info("put %i'th/%i images in colour"%(n_colour_conversions, n_yielded))
+            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
+            n_colour_conversions += 1
+        assert y.shape == (32,32,3), (p,y.shape)
+        assert y.dtype == numpy.uint8, (p,y.dtype)
         yield y
 
 def load_first_N(N):
@@ -53,14 +61,20 @@
         yield it.next()
         i +=1
 
-if __name__ == '__main__':
-    if 0:
-        def iter_len(x):
-            i = 0
-            for xx in x:
-                i += 1
-            return i
-        print 'got %i files' % iter_len(iterate_over_filenames())
+n_images = 1608356 
+
+def main():
+    def iter_len(x):
+        i = 0
+        for xx in x:
+            i += 1
+        return i
+    n_files = iter_len(iterate_over_filenames())
+    print 'got %i files' % n_files
+    assert n_images == n_files
 
     for p in load_first_N(10):
         load_image(os.path.join(*p))
+
+if __name__ == '__main__':
+    sys.exit(main())