# HG changeset patch
# User James Bergstra
# Date 1285283528 14400
# Node ID 976539956475baf5c2c5b33bbfb8997f602fe5cb
# Parent  1817485d586d50c1cadb829b07d46390a0035c86
adding tinyimages

diff -r 1817485d586d -r 976539956475 pylearn/dataset_ops/image_patches.py
--- a/pylearn/dataset_ops/image_patches.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/dataset_ops/image_patches.py	Thu Sep 23 19:12:08 2010 -0400
@@ -30,7 +30,8 @@
 def image_patches(s_idx, dims,
         split='train', dtype=theano.config.floatX, rasterized=False,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=get_dataset):
     N,R,C=dims
 
     if split != 'train':
@@ -39,7 +40,7 @@
     if not rasterized:
         raise NotImplementedError()
 
-    op = TensorFnDataset(dtype, bcast=(False,), fn=(get_dataset, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
+    op = TensorFnDataset(dtype, bcast=(False,), fn=(fn, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
     x = op(s_idx%N)
     if x.ndim == 1:
         if not rasterized:
@@ -87,7 +88,8 @@
         split='train', dtype=theano.config.floatX,
         rasterized=True,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=ranzato_hinton_2010_whitened_patches):
     N = 10240
 
     if split != 'train':
@@ -104,7 +106,7 @@
 
     op = TensorFnDataset(dtype,
             bcast=(False,),
-            fn=ranzato_hinton_2010_whitened_patches,
+            fn=fn,
             single_shape=(105,))
     x = op(s_idx%N)
     return x

diff -r 1817485d586d -r 976539956475 pylearn/dataset_ops/tinyimages.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/tinyimages.py	Thu Sep 23 19:12:08 2010 -0400
@@ -0,0 +1,273 @@
+"""I'm not sure where to put this code."""
+
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "bergstrj@iro.umontreal.ca"
+
+
+import cPickle, logging, sys
+import numpy
+from pylearn.datasets import tinyimages, image_patches
+import pylearn.preprocessing.pca
+import theano
+from pylearn.io import image_tiling
+
+from .protocol import TensorFnDataset # protocol.py __init__.py
+from .memo import memo
+
+
+#
+# This part of the file (until main()) is for generating a dataset of image patches from the
+# tinyimages dataset.  These patches are used in the pretraining stage of the mcRBM training
+# algorithm.
+#
+# Since the 'dataset' is properly seen as a cached-to-disk preprocessing derived from raw
+# material in tinyimages, it is not a real dataset (with a standard disk location in the
+# PYLEARN_DATA_ROOT root).
+#
+# Hopefully the upcoming pylearn library proposal will have a policy on how/where this sort
+# of pre-processed data should be stored.  For now it is stored in the current working
+# directory.
+#
+
+_raw_patch_file = 'tinydataset_raw.npy'
+_pca_file = 'tinydataset_pca.pkl'
+_whitened_file = 'tinydataset_whitened.npy'
+
+def normalize_channels(X, max_scale=5):
+    """Map images from (0,1) to all reals so that each channel of each image has zero mean,
+    [maximum] unit variance.
+
+    Channels will not be scaled by more than max_scale, so the output variance might be
+    smaller than 1.
+ """ + n_imgs,n_rows,n_cols,n_channels = X.shape + X = X.copy() + # ensure that we're working with floats on (0,1) + if not str(X.dtype).startswith('float'): + raise TypeError() + if X.min() < 0: + raise ValueError('min out of bounds') + if X.max() > 1: + raise ValueError('max out of bounds') + assert n_channels==3 + imaxscale = 1.0 / max_scale + def centre(imgstack): + a,b,c = imgstack.shape + flat = imgstack.reshape((a,b*c)) + flat -= flat.mean(axis=1).reshape((a,1)) + flat /= numpy.maximum(flat.std(axis=1).reshape((a,1)),imaxscale) + # SHOULD CHANNEL VARIANCE BE STANDARDIZED? + #imgstack modified inplace + centre(X[:,:,:,0]) + centre(X[:,:,:,1]) + centre(X[:,:,:,2]) + return X + +def save_filters(X, fname, min_dynamic_range=1e-3, data_path=None): + """ + Save filters X (encoded as whitened images) in the original image space. + """ + dct = load_pca_dct() + pca = dct['eig_vals'], dct['eig_vecs'] + + _img = image_tiling.tile_raster_images( + pylearn.preprocessing.pca.pca_whiten_inverse(pca, X), + img_shape=(8,8), + min_dynamic_range=1e-6) + image_tiling.save_tiled_raster_images(_img, fname) + +def extract_save_patches(path=_raw_patch_file, n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8)): + """ + Generate the dataset and store it to the path named in _raw_patch_file + """ + R,C=patch_shape + + dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8') + savefile = open(path, 'wb') + + assert n_imgs < tinyimages.n_images + + image_stream = tinyimages.image_generator() + rng = numpy.random.RandomState(234) + + i = 0 + while i < n_imgs: + y = image_stream.next() + yy = image_patches.extract_random_patches( + y.reshape((1,32,32,3)), + n_patches_per_image, + R,C, + rng) + ii = i*n_patches_per_image + dataset[ii:ii+n_patches_per_image] = yy + i += 1 + + print 'saving' + numpy.save(savefile,dataset) + +def compute_save_pca(raw_path=_raw_patch_file, pca_path=_pca_file, use_only=100000, max_components=128, max_energy_fraction=.99): + """ + Memmap the data file named in `raw_path_file` and save the pca to `pca_path`. + """ + + data = numpy.load(raw_path, mmap_mode='r') + ofile = open(pca_path, 'wb') + + # each image channel is adjusted here + X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255) + + # rasterize images + X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3])) + + # subtract off global mean as part of pca + data_mean = X.mean(axis=0) + X = X - data_mean + + # calculating pca + (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples( + X, max_components, max_energy_fraction, x_centered=True) + cPickle.dump( + dict( + mean=data_mean, + eig_vecs=eig_vecs, + eig_vals=eig_vals), + ofile) + ofile.close() + +def whiten_patches(raw_path=_raw_patch_file, pca_path=_pca_file, whitened_file=_whitened_file): + """ + Load the patches from sys.argv[1] and whiten them with sys.argv[2], saving them to + sys.argv[3]. 
+ """ + data = numpy.load(raw_path, mmap_mode='r') + dct = cPickle.load(open(pca_path)) + + rval = numpy.empty((data.shape[0], len(dct['eig_vals'])), dtype='float32') + + print 'allocated output of size', rval.shape + + b = 100 + i = 0 + while i < len(rval): + # each image channel is adjusted here + xi = normalize_channels(numpy.asarray(data[i:i+b], dtype='float32')/255) + #rasterize images + xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3])) + xi -= dct['mean'] + rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((dct['eig_vals'], dct['eig_vecs']), xi) + i += b + print 'saving', whitened_file + numpy.save(whitened_file, rval) + +def main(): + if 0: #do this to render the dataset to the screen + sys.exit(glviewer()) + try: + open(_raw_patch_file).close() + assert 0 # force recomputation + except: + print 'saving patches' + extract_save_patches() + + try: + open(_pca_file).close() + assert 0 # force recomputation + except: + print 'computing pca' + compute_save_pca() + + try: + open(_whitened_file).close() + assert 0 # force recomputation + except: + print 'computing whitened data' + whiten_patches() + +# +# This part of the file defines an op-constructor that uses the pre-processed cache / dataset generated +# + +@memo +def load_whitened(path=_whitened_file): + """ + Replacement for dataset_ops.image_patches.ranzato_hinton_2010_op + """ + try: + return numpy.load(path, mmap_mode='r') + except: + print >> sys.stderr, "Maybe you need to run 'python pylearn.dataset_ops.tinyimages'?" + raise + +@memo +def load_pca_dct(path=_pca_file): + return cPickle.load(open(path)) + +def tinydataset_op(s_idx, + split='train', + fn=load_whitened): + + n_examples,n_dim = fn().shape + + if split != 'train': + raise NotImplementedError('train/test/valid splits for randomly sampled image patches?') + + op = TensorFnDataset('float32', bcast=(False,), fn=fn, single_shape=(n_dim,)) + x = op(s_idx%n_examples) + return x + + +def save_filters(X, fname): + dct = load_pca_dct() + eigs = dct['eig_vals'], dct['eig_vecs'] + mean = dct['mean'] + rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean + _img = image_tiling.tile_raster_images( + (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None), + img_shape=(8,8), + min_dynamic_range=1e-6) + image_tiling.save_tiled_raster_images(_img, fname) + +def glviewer(split='train'): + from glviewer import GlViewer + #i = theano.tensor.iscalar() + #f = theano.function([i], mnist(i, split, dtype='uint8', rasterized=False)[0]) + data = numpy.load(_raw_patch_file, mmap_mode='r') + print 'RAW', data.shape + data = numpy.load(_whitened_file, mmap_mode='r') + print 'WHI', data.shape + + if 1: # check the raw data + data = numpy.load(_raw_patch_file, mmap_mode='r') + data = data.reshape((data.shape[0], data.shape[1]*data.shape[2], data.shape[3])) + def f(i): + j = i*5000 + jj = j + 5000 + return image_tiling.tile_raster_images( + (data[j:jj,:,0], data[j:jj,:,1], data[j:jj,:,2], None), + img_shape=(8,8)) + if 0: # check the whitened data + dct = load_pca_dct() + eigs = dct['eig_vals'], dct['eig_vecs'] + mean = dct['mean'] + data = numpy.load(_whitened_file, mmap_mode='r') + def f(i): + j = i*5000 + jj = j + 5000 + X = data[j:jj] + print 'j', j, jj + rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean + _img = image_tiling.tile_raster_images( + (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None), + img_shape=(8,8), + min_dynamic_range=1e-6) + return _img + GlViewer(f).main() + + + +if __name__=='__main__': + 
diff -r 1817485d586d -r 976539956475 pylearn/datasets/config.py
--- a/pylearn/datasets/config.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/datasets/config.py	Thu Sep 23 19:12:08 2010 -0400
@@ -5,12 +5,12 @@
 """
 
 import os, sys, logging
 
-_logger = logging.getLogger('pylearn.datasets.config')
-def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
-def info(*msg): _logger.info(' '.join(str(m) for m in msg))
-def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
-def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
-def error(*msg): _logger.error(' '.join(str(m) for m in msg))
+def _logger(): return logging.getLogger('pylearn.datasets.config')
+def debug(*msg): _logger().debug(' '.join(str(m) for m in msg))
+def info(*msg): _logger().info(' '.join(str(m) for m in msg))
+def warn(*msg): _logger().warn(' '.join(str(m) for m in msg))
+def warning(*msg): _logger().warning(' '.join(str(m) for m in msg))
+def error(*msg): _logger().error(' '.join(str(m) for m in msg))
 
 def env_get(key, default, key2 = None):
diff -r 1817485d586d -r 976539956475 pylearn/datasets/tinyimages.py
--- a/pylearn/datasets/tinyimages.py	Wed Sep 15 17:49:21 2010 -0400
+++ b/pylearn/datasets/tinyimages.py	Thu Sep 23 19:12:08 2010 -0400
@@ -5,10 +5,12 @@
 __license__ = "3-clause BSD License"
 __contact__ = "bergstrj@iro.umontreal.ca"
 
-import os, sys
+import logging, os, sys
 import PIL.Image
 import numpy
 
+logger = logging.getLogger('pylearn.datasets.tinyimages')
+
 def sorted_listdir(*path):
     r = os.listdir(os.path.join(*path))
     r.sort()
@@ -27,8 +29,7 @@
         yield path, letter, label, img
 
 def load_image(path):
-    """
-    """
+    """Return the image at `path` as a numpy ndarray """
     rval = numpy.asarray(PIL.Image.open(path))
     return rval
 
@@ -40,10 +41,17 @@
     Be careful with this generator because the dataset in total is close to
     20GB!
     """
+    n_colour_conversions = 0
+    n_yielded = 0
     for p in iterate_over_filenames(path=_original):
-        y = load_image(*p)
-        assert y.shape == (32,32,3)
-        assert y.dtype == numpy.uint8
+        y = load_image(os.path.join(*p))
+        n_yielded += 1
+        if y.shape == (32,32):
+            logger.info("converting greyscale image %i (of %i yielded) to colour"%(
+                n_colour_conversions, n_yielded))
+            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
+            n_colour_conversions += 1
+        assert y.shape == (32,32,3), (p,y.shape)
+        assert y.dtype == numpy.uint8, (p,y.dtype)
         yield y
 
 def load_first_N(N):
@@ -53,14 +61,20 @@
         yield it.next()
         i +=1
 
-if __name__ == '__main__':
-    if 0:
-        def iter_len(x):
-            i = 0
-            for xx in x:
-                i += 1
-            return i
-        print 'got %i files' % iter_len(iterate_over_filenames())
+n_images = 1608356
+
+def main():
+    def iter_len(x):
+        i = 0
+        for xx in x:
+            i += 1
+        return i
+    n_files = iter_len(iterate_over_filenames())
+    print 'got %i files' % n_files
+    assert n_images == n_files
 
     for p in load_first_N(10):
         load_image(os.path.join(*p))
+
+if __name__ == '__main__':
+    sys.exit(main())
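The point of threading the new fn argument through image_patches() and
ranzato_hinton_2010_op() above is to make the dataset-loading callable
swappable.  A minimal sketch of how a caller might use the hook; toy_patches is
a hypothetical module-level stand-in loader (not a pylearn API), mirroring how
get_dataset is passed in the unmodified code:

    import numpy
    import theano
    import theano.tensor
    from pylearn.dataset_ops.image_patches import image_patches

    def toy_patches(N, R, C, dtype, center, unitvar):
        # receives the same (N,R,C,dtype,center,unitvar) tuple that
        # image_patches forwards to TensorFnDataset
        rng = numpy.random.RandomState(42)
        return rng.rand(N, R*C).astype(dtype)   # (N, R*C) rasterized patches

    s_idx = theano.tensor.lscalar('s_idx')
    x = image_patches(s_idx, (100, 8, 8), rasterized=True, fn=toy_patches)
    f = theano.function([s_idx], x)
    print f(3).shape   # one rasterized 8x8 RGB-free toy patch: (64,)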