changeset 1454:283fb236f104

datasets/tinyimages can generate a shuffled file
author James Bergstra <bergstrj@iro.umontreal.ca>
date Mon, 04 Apr 2011 19:00:53 -0400
parents d9dd09a2ee90
children 93e5ce7ccd6d
files pylearn/datasets/tinyimages.py
diffstat 1 files changed, 38 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/tinyimages.py	Mon Apr 04 18:59:18 2011 -0400
+++ b/pylearn/datasets/tinyimages.py	Mon Apr 04 19:00:53 2011 -0400
@@ -22,6 +22,8 @@
 _tinyimages_root='/data/lisa/data/tinyimages'
 _original=_tinyimages_root+'/tinyimages/original'
 _npy_file=_tinyimages_root+'/tinyimages.npy'
+_shuffled_npy_file=_tinyimages_root+'/tinyimages_shuffled.npy'
+_shuffled_npy_seed=12345
 _README_file=_tinyimages_root+'/README.txt'
 _README = """
 TinyImages is a dataset of 32x32 RGB images.
@@ -89,15 +91,14 @@
 def arrange_first_N_into_tiling(R,C, filename):
     R=int(R)
     C=int(C)
-    A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))],
-            dtype='float32')
-    print A.shape
-    A.shape = (R*C, 32*32,3)
-    pylearn.io.image_tiling.save_tiled_raster_images(
-        pylearn.io.image_tiling.tile_raster_images(
-            (A[:,:,0], A[:,:,1], A[:,:,2], None),
-            (32,32)),
-        filename)
+    if 1:
+        A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))])
+    elif 0:
+        A = get_memmapped_file(R*C)
+    else:
+        A = get_shuffled_memmapped_file(R*C)
+    pylearn.io.image_tiling.tile_slices_to_image_uint8(A,
+            tile_shape=(R,C)).save(filename)
 
 
 n_images = 1608356
@@ -107,22 +108,48 @@
             dtype='uint8',
             mode='r',
             shape=(N,32,32,3))
+def get_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file):
+    return get_memmapped_file(N, filename)
 
-def rebuild_numpy_file(N=n_images, filename=_npy_file):
+def rebuild_memmapped_file(N=n_images, filename=_npy_file):
     shp = (N,32,32,3)
     print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
     open(_README_file, 'w').write(_README)
     mmap = numpy.memmap(filename,
             dtype='uint8',
-            mode='w+',
+            mode='w+', #create or overwrite file for R/W
             shape=shp)
     ig = image_generator()
     for ii in xrange(N):
         mmap[ii] = ig.next()
     mmap.flush()
 
+def rebuild_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file,
+        seed=_shuffled_npy_seed,
+        orig_filename=_npy_file):
+    try:
+        orig = get_memmapped_file(N, orig_filename)
+    except IOError:
+        print >> sys.stderr, "pylearn.datasets.tinyimages: rebuild un-shuffled file first"
+        raise
+    shp = orig.shape
+    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
+    mmap = numpy.memmap(filename,
+            dtype='uint8',
+            mode='w+',#create or overwrite file for R/W
+            shape=shp)
+    idxlist = numpy.arange(orig.shape[0])
+    numpy.random.RandomState(seed).shuffle(idxlist)
+    assert idxlist[0] != 0
+    for i0, i1 in enumerate(idxlist):
+        mmap[i0] = orig[i1]
+        if not i0 % 10000:
+            print>> sys.stderr, "%i/%i"%(i0, len(idxlist))
+    mmap.flush()
+
 def main(argv=[]):
     if argv:
+        print "Saving images to ", argv[2]
         arrange_first_N_into_tiling( argv[0], argv[1], argv[2])
     else:
         def iter_len(x):