# HG changeset patch # User James Bergstra # Date 1286732633 14400 # Node ID ebcb76b388176fabfb880a028e0c1fd015fdc942 # Parent 881bce55a2037391906d81c6d23700f03d368edc tinyimages - added main script to whiten patches diff -r 881bce55a203 -r ebcb76b38817 pylearn/dataset_ops/tinyimages.py --- a/pylearn/dataset_ops/tinyimages.py Sun Oct 10 13:39:28 2010 -0400 +++ b/pylearn/dataset_ops/tinyimages.py Sun Oct 10 13:43:53 2010 -0400 @@ -1,4 +1,8 @@ -"""I'm not sure where to put this code.""" +"""I'm not sure where to put this code. + +THIS IS NOT POLISHED LIBRARY CODE YET. + +""" __authors__ = "James Bergstra" __copyright__ = "(c) 2010, Universite de Montreal" @@ -76,19 +80,17 @@ min_dynamic_range=1e-6) image_tiling.save_tiled_raster_images(_img, fname) -def extract_save_patches(path=_raw_patch_file, n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8)): +def extract_patches(n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8), rng=numpy.random.RandomState(234)): """ - Generate the dataset and store it to the path named in _raw_patch_file + Extract a number of patches from each of the first TinyImages """ R,C=patch_shape dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8') - savefile = open(path, 'wb') assert n_imgs < tinyimages.n_images image_stream = tinyimages.image_generator() - rng = numpy.random.RandomState(234) i = 0 while i < n_imgs: @@ -101,24 +103,23 @@ ii = i*n_patches_per_image dataset[ii:ii+n_patches_per_image] = yy i += 1 - - print 'saving' - numpy.save(savefile,dataset) + return dataset -def compute_save_pca(raw_path=_raw_patch_file, pca_path=_pca_file, use_only=100000, max_components=128, max_energy_fraction=.99): - """ - Memmap the data file named in `raw_path_file` and save the pca to `pca_path`. - """ - - data = numpy.load(raw_path, mmap_mode='r') - ofile = open(pca_path, 'wb') +def compute_pca_dct(X, use_only=100000, max_components=128, max_energy_fraction=.99): # each image channel is adjusted here - X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255) + ### X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255) + # rasterize images X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3])) + # switch to floats + X = X.astype('float32') + + # subtract off each image mean (ignoring channels) #TODO: IS THIS GOOD IDEA? + X = X - X.mean(axis=1).reshape((X.shape[0], 1)) + # subtract off global mean as part of pca data_mean = X.mean(axis=0) X = X - data_mean @@ -126,62 +127,92 @@ # calculating pca (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples( X, max_components, max_energy_fraction, x_centered=True) - cPickle.dump( - dict( - mean=data_mean, - eig_vecs=eig_vecs, - eig_vals=eig_vals), - ofile) - ofile.close() + + print "Keeping %i principle components" % len(eig_vals) -def whiten_patches(raw_path=_raw_patch_file, pca_path=_pca_file, whitened_file=_whitened_file): + return dict( + mean=data_mean, + eig_vecs=eig_vecs, + eig_vals=eig_vals) + +def whiten_patches(raw_patches, pca_dct): """ Load the patches from sys.argv[1] and whiten them with sys.argv[2], saving them to sys.argv[3]. """ - data = numpy.load(raw_path, mmap_mode='r') - dct = cPickle.load(open(pca_path)) - rval = numpy.empty((data.shape[0], len(dct['eig_vals'])), dtype='float32') + rval = numpy.empty((raw_patches.shape[0], len(pca_dct['eig_vals'])), dtype='float32') print 'allocated output of size', rval.shape - b = 100 + b = 100 #batchsize i = 0 while i < len(rval): - # each image channel is adjusted here - xi = normalize_channels(numpy.asarray(data[i:i+b], dtype='float32')/255) - #rasterize images + xi = numpy.asarray(raw_patches[i:i+b], dtype='float32') + # rasterize xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3])) - xi -= dct['mean'] - rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((dct['eig_vals'], dct['eig_vecs']), xi) + # remove image mean + xi = xi - xi.mean(axis=1).reshape((xi.shape[0], 1)) + # remove pixel means + xi -= pca_dct['mean'] + rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((pca_dct['eig_vals'], pca_dct['eig_vecs']), xi) i += b - print 'saving', whitened_file - numpy.save(whitened_file, rval) + return rval -def main(): +def main(n_imgs=1000, n_patches_per_image=10, max_components=128, seed=234): if 0: #do this to render the dataset to the screen sys.exit(glviewer()) + + rng = numpy.random.RandomState(seed) + try: - open(_raw_patch_file).close() - assert 0 # force recomputation + open(_raw_patch_file).close() #fails if file not present + load_raw_patches = True except: - print 'saving patches' - extract_save_patches() + load_raw_patches = False + + if load_raw_patches: + print 'loading raw patches from', _raw_patch_file + raw_patches = numpy.load(_raw_patch_file, mmap_mode='r') + else: + print 'extracting raw patches' + raw_patches = extract_patches(rng=rng, n_imgs=n_imgs, + n_patches_per_image=n_patches_per_image) + rng.shuffle(raw_patches) + print 'saving raw patches to', _raw_patch_file + numpy.save(open(_raw_patch_file, 'wb'), raw_patches) try: open(_pca_file).close() - assert 0 # force recomputation + load_pca = True except: + load_pca = False + + if load_pca: + print 'loading pca from', _pca_file + pca_dct = cPickle.load(open(_pca_file)) + else: print 'computing pca' - compute_save_pca() + pca_dct = compute_pca_dct(raw_patches, max_components=max_components) + print 'saving pca to', _pca_file + cPickle.dump(pca_dct, open(_pca_file, 'wb')) try: open(_whitened_file).close() - assert 0 # force recomputation + load_patches = True except: + load_patches = False + + if load_patches: + print 'loading whitened patches from', _whitened_file + whitened_patches = numpy.load(_whitened_file, mmap_mode='r') + else: print 'computing whitened data' - whiten_patches() + whitened_patches = whiten_patches(raw_patches, pca_dct) + print 'saving', _whitened_file + numpy.save(_whitened_file, whitened_patches) + + return whitened_patches, pca_dct # # This part of the file defines an op-constructor that uses the pre-processed cache / dataset generated