# HG changeset patch
# User James Bergstra <bergstrj@iro.umontreal.ca>
# Date 1286732633 14400
# Node ID ebcb76b388176fabfb880a028e0c1fd015fdc942
# Parent  881bce55a2037391906d81c6d23700f03d368edc
tinyimages - added main script to whiten patches

diff -r 881bce55a203 -r ebcb76b38817 pylearn/dataset_ops/tinyimages.py
--- a/pylearn/dataset_ops/tinyimages.py	Sun Oct 10 13:39:28 2010 -0400
+++ b/pylearn/dataset_ops/tinyimages.py	Sun Oct 10 13:43:53 2010 -0400
@@ -1,4 +1,8 @@
-"""I'm not sure where to put this code."""
+"""I'm not sure where to put this code.
+
+THIS IS NOT POLISHED LIBRARY CODE YET.
+
+"""
 
 __authors__ = "James Bergstra"
 __copyright__ = "(c) 2010, Universite de Montreal"
@@ -76,19 +80,17 @@
             min_dynamic_range=1e-6)
     image_tiling.save_tiled_raster_images(_img, fname)
 
-def extract_save_patches(path=_raw_patch_file, n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8)):
+def extract_patches(n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8), rng=numpy.random.RandomState(234)):
     """
-    Generate the dataset and store it to the path named in _raw_patch_file
+    Extract `n_patches_per_image` patches of shape `patch_shape` from each of `n_imgs` TinyImages
     """
     R,C=patch_shape
 
     dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8')
-    savefile = open(path, 'wb')
 
     assert n_imgs < tinyimages.n_images
 
     image_stream = tinyimages.image_generator()
-    rng = numpy.random.RandomState(234)
 
     i = 0
     while i < n_imgs:
@@ -101,24 +103,23 @@
         ii = i*n_patches_per_image
         dataset[ii:ii+n_patches_per_image] = yy
         i += 1
-
-    print 'saving'
-    numpy.save(savefile,dataset)
+    return dataset
 
-def compute_save_pca(raw_path=_raw_patch_file, pca_path=_pca_file, use_only=100000, max_components=128, max_energy_fraction=.99):
-    """
-    Memmap the data file named in `raw_path_file` and save the pca to `pca_path`.
-    """
-
-    data = numpy.load(raw_path, mmap_mode='r')
-    ofile = open(pca_path, 'wb')
+def compute_pca_dct(X, use_only=100000, max_components=128, max_energy_fraction=.99):
 
     # each image channel is adjusted here
-    X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255)
+    ### X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255)
+
 
     # rasterize images
     X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3]))
 
+    # switch to floats
+    X = X.astype('float32')
+
+    # subtract each image's own mean (one scalar over all pixels and channels)  # TODO: is this a good idea?
+    X = X - X.mean(axis=1).reshape((X.shape[0], 1))
+
     # subtract off global mean as part of pca
     data_mean = X.mean(axis=0)
     X = X - data_mean
@@ -126,62 +127,92 @@
     # calculating pca
     (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples(
             X, max_components, max_energy_fraction, x_centered=True)
-    cPickle.dump(
-            dict(
-                mean=data_mean,
-                eig_vecs=eig_vecs,
-                eig_vals=eig_vals),
-            ofile)
-    ofile.close()
+
+    print "Keeping %i principle components" % len(eig_vals)
 
-def whiten_patches(raw_path=_raw_patch_file, pca_path=_pca_file, whitened_file=_whitened_file):
+    return dict(
+            mean=data_mean,
+            eig_vecs=eig_vecs,
+            eig_vals=eig_vals)
+
+def whiten_patches(raw_patches, pca_dct):
     """
     Load the patches from sys.argv[1] and whiten them with sys.argv[2], saving them to
     sys.argv[3].
     """
-    data = numpy.load(raw_path, mmap_mode='r')
-    dct = cPickle.load(open(pca_path))
 
-    rval = numpy.empty((data.shape[0], len(dct['eig_vals'])), dtype='float32')
+    rval = numpy.empty((raw_patches.shape[0], len(pca_dct['eig_vals'])), dtype='float32')
 
     print 'allocated output of size', rval.shape
 
-    b = 100
+    b = 100 #batchsize
     i = 0
     while i < len(rval):
-        # each image channel is adjusted here
-        xi = normalize_channels(numpy.asarray(data[i:i+b], dtype='float32')/255)
-        #rasterize images
+        xi = numpy.asarray(raw_patches[i:i+b], dtype='float32')
+        # rasterize
         xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3]))
-        xi -= dct['mean']
-        rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((dct['eig_vals'], dct['eig_vecs']), xi)
+        # remove image mean
+        xi = xi - xi.mean(axis=1).reshape((xi.shape[0], 1))
+        # remove pixel means
+        xi -= pca_dct['mean']
+        rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((pca_dct['eig_vals'], pca_dct['eig_vecs']), xi)
         i += b
-    print 'saving', whitened_file
-    numpy.save(whitened_file, rval)
+    return rval
 
-def main():
+def main(n_imgs=1000, n_patches_per_image=10, max_components=128, seed=234):
     if 0: #do this to render the dataset to the screen
         sys.exit(glviewer())
+
+    rng = numpy.random.RandomState(seed)
+
     try:
-        open(_raw_patch_file).close()
-        assert 0 # force recomputation
+        open(_raw_patch_file).close() #fails if file not present
+        load_raw_patches = True
     except:
-        print 'saving patches'
-        extract_save_patches()
+        load_raw_patches = False
+
+    if load_raw_patches:
+        print 'loading raw patches from', _raw_patch_file
+        raw_patches = numpy.load(_raw_patch_file, mmap_mode='r')
+    else:
+        print 'extracting raw patches'
+        raw_patches = extract_patches(rng=rng, n_imgs=n_imgs,
+                n_patches_per_image=n_patches_per_image)
+        rng.shuffle(raw_patches)
+        print 'saving raw patches to', _raw_patch_file
+        numpy.save(open(_raw_patch_file, 'wb'), raw_patches)
 
     try:
         open(_pca_file).close()
-        assert 0 # force recomputation
+        load_pca = True
     except:
+        load_pca = False
+
+    if load_pca:
+        print 'loading pca from', _pca_file
+        pca_dct = cPickle.load(open(_pca_file))
+    else:
         print 'computing pca'
-        compute_save_pca()
+        pca_dct = compute_pca_dct(raw_patches, max_components=max_components)
+        print 'saving pca to', _pca_file
+        cPickle.dump(pca_dct, open(_pca_file, 'wb'))
 
     try:
         open(_whitened_file).close()
-        assert 0 # force recomputation
+        load_patches = True
     except:
+        load_patches = False
+
+    if load_patches:
+        print 'loading whitened patches from', _whitened_file
+        whitened_patches = numpy.load(_whitened_file, mmap_mode='r')
+    else:
         print 'computing whitened data'
-        whiten_patches()
+        whitened_patches = whiten_patches(raw_patches, pca_dct)
+        print 'saving', _whitened_file
+        numpy.save(_whitened_file, whitened_patches)
+
+    return whitened_patches, pca_dct
 
 #
 # This part of the file defines an op-constructor that uses the pre-processed cache / dataset generated