changeset 1286:8905186b176c

test_mcRBM - added code to iterate over tinyimages
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 23 Sep 2010 19:12:52 -0400
parents 976539956475
children 4fa2a32e8fde
files pylearn/algorithms/tests/test_mcRBM.py
diffstat 1 files changed, 16 insertions(+), 115 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/algorithms/tests/test_mcRBM.py	Thu Sep 23 19:12:08 2010 -0400
+++ b/pylearn/algorithms/tests/test_mcRBM.py	Thu Sep 23 19:12:52 2010 -0400
@@ -1,119 +1,9 @@
 from pylearn.algorithms.mcRBM import *
 import pylearn.datasets.cifar10
+import pylearn.dataset_ops.tinyimages
 
 import pylearn.dataset_ops.cifar10
 
-def _mar_train_patches(dtype):
-    R,C=16,16
-    train_data = pylearn.dataset_ops.cifar10.train_data_labels(dtype)[0][:40000]
-    #train_data shape is (40000, 3072)
-    train_data = train_data.reshape((40000,3,32,32)).transpose([0,2,3,1])
-    patches = train_data[:, :R, :C, :].reshape((40000, 3*R*C))
-    patches -= patches.mean(axis=0)
-    wpatches = numpy.dot(patches, d['pcatransf'].T)
-    return wpatches
-
-def mar_centered(s_idx, split, dtype='float64', rasterized=False, color='grey'):
-    """ 
-    Returns a pair (img, label) of theano expressions for cifar-10 samples
-
-    :param s_idx: the indexes
-
-    :param split:
-
-    :param dtype:
-
-    :param rasterized: return examples as vectors (True) or 28x28 matrices (False)
-
-    :param color: control how to deal with the color in the images'
-      - grey   greyscale (with luminance weighting)
-      - rgb    add a trailing dimension of length 3 with rgb colour channels
-
-    """
-
-    split_options = {'train':(train_data, train_labels),
-            'valid': (valid_data, valid_labels),
-            'test': (test_data, test_labels)}
-
-    if split not in split_options:
-        raise ValueError('invalid split option', (split, split_options.keys()))
-
-    color_options = ('grey', 'rgb')
-    if color not in color_options:
-        raise ValueError('invalid color option', (color, color_options))
-
-    x_fn, y_fn = split_options[split]
-
-    x_op = TensorFnDataset(dtype, (False,), (x_fn, (dtype,)), (3072,))
-    y_op = TensorFnDataset('int32', (), y_fn)
-
-    x = x_op(s_idx)
-    y = y_op(s_idx)
-
-    # Y = 0.3R + 0.59G + 0.11B from
-    # http://gimp-savvy.com/BOOK/index.html?node54.html
-    rgb_dtype = 'float32'
-    if dtype == 'float64':
-        rgb_dtype = dtype
-    r = numpy.asarray(.3, dtype=rgb_dtype)
-    g = numpy.asarray(.59, dtype=rgb_dtype)
-    b = numpy.asarray(.11, dtype=rgb_dtype)
-
-    if x.ndim == 1:
-        if rasterized:
-            if color=='grey':
-                x = r * x[:1024] + g * x[1024:2048] + b * x[2048:]
-                if dtype=='uint8':
-                    x = theano.tensor.cast(x, 'uint8')
-            elif color=='rgb':
-                # the strides aren't what you'd expect between channels,
-                # but theano is all about weird strides
-                x = x.reshape((3,32*32)).T
-            else:
-                raise NotImplemented('color', color)
-        else:
-            if color=='grey':
-                x = r * x[:1024] + g * x[1024:2048] + b * x[2048:]
-                if dtype=='uint8':
-                    x = theano.tensor.cast(x, 'uint8')
-                x = x.reshape((32,32))
-            elif color=='rgb':
-                # the strides aren't what you'd expect between channels,
-                # but theano is all about weird strides
-                x = x.reshape((3,32,32)).dimshuffle(1, 2, 0)
-            else:
-                raise NotImplemented('color', color)
-    elif x.ndim == 2:
-        N = x.shape[0] # symbolic
-        if rasterized:
-            if color=='grey':
-                x = r * x[:,:1024] + g * x[:,1024:2048] + b * x[:,2048:]
-                if dtype=='uint8':
-                    x = theano.tensor.cast(x, 'uint8')
-            elif color=='rgb':
-                # the strides aren't what you'd expect between channels,
-                # but theano is all about weird strides
-                x = x.reshape((N, 3,32*32)).dimshuffle(0, 2, 1)
-            else:
-                raise NotImplemented('color', color)
-        else:
-            if color=='grey':
-                x = r * x[:,:1024] + g * x[:,1024:2048] + b * x[:,2048:]
-                if dtype=='uint8':
-                    x = theano.tensor.cast(x, 'uint8')
-                x.reshape((N, 32, 32))
-            elif color=='rgb':
-                # the strides aren't what you'd expect between channels,
-                # but theano is all about weird strides
-                x = x.reshape((N,3,32,32)).dimshuffle(0, 2, 3, 1)
-            else:
-                raise NotImplemented('color', color)
-    else:
-        raise ValueError('x has too many dimensions', x.ndim)
-
-    return x, y
-
-
 def _default_rbm_alloc(n_I, n_K=256, n_J=100):
     return mcRBM.alloc(n_I, n_K, n_J)
 
@@ -141,6 +31,11 @@
         n_vis=96 # pca components
         epoch_size=batchsize*500
         n_patches=epoch_size*20
+    elif dataset=='tinyimages_patches':
+        R,C=8,8
+        n_vis=81
+        epoch_size=batchsize*500
+        n_patches=epoch_size*20
     else:
         R,C= 16,16 # the size of image patches
         n_vis=R*C
@@ -161,6 +56,8 @@
                         X),
                     img_shape=(R,C))
             image_tiling.save_tiled_raster_images(_img, fname)
+    elif dataset == 'tinyimages_patches':
+        tile = pylearn.dataset_ops.tinyimages.save_filters
     else:
         def tile(X, fname):
             _img = image_tiling.tile_raster_images(X,
@@ -169,13 +66,16 @@
             image_tiling.save_tiled_raster_images(_img, fname)
 
     batch_idx = TT.iscalar()
+    batch_range =batch_idx * batchsize + np.arange(batchsize)
 
     if dataset == 'MAR':
-        train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_idx * batchsize + np.arange(batchsize))
+        train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_range)
     elif dataset == 'cifar10patches8x8':
-        train_batch = pylearn.dataset_ops.cifar10.cifar10_patches(batch_idx * batchsize +
-                np.arange(batchsize), 'train', n_patches=n_patches, patch_size=(R,C),
+        train_batch = pylearn.dataset_ops.cifar10.cifar10_patches(
+                batch_range, 'train', n_patches=n_patches, patch_size=(R,C),
                 pca_components=n_vis)
+    elif dataset == 'tinyimages_patches':
+        train_batch = pylearn.dataset_ops.tinyimages.tinydataset_op(batch_range)
     else:
         train_batch = pylearn.dataset_ops.image_patches.image_patches(
                 s_idx = (batch_idx * batchsize + np.arange(batchsize)),
@@ -319,13 +219,14 @@
                 )
 
     if 1:
+        # pretraining settings
         rbm,smplr = test_reproduce_ranzato_hinton_2010(
                 as_unittest=False,
                 n_train_iters=60000,
                 rbm_alloc=lambda n_I : mcRBM_withP.alloc_topo_P(n_I, n_J=81),
                 trainer_alloc=mcRBMTrainer.alloc_for_P,
                 lr_per_example=0.05,
-                dataset='cifar10patches8x8',
+                dataset='tinyimages_patches',
                 l1_penalty=1e-3,
                 l1_penalty_start=30000,
                 #l1_penalty_start=350, #DEBUG