changeset 1415:234e5e48d60d

Added the datasets.tinyimages.rebuild_numpy_file function to build a 5GB memory-mappable file of all images
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 03 Feb 2011 18:07:04 -0500
parents 2b82c5a11512
children 28b2f17991aa
files pylearn/datasets/tinyimages.py
diffstat 1 files changed, 44 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/tinyimages.py	Thu Feb 03 13:50:27 2011 -0500
+++ b/pylearn/datasets/tinyimages.py	Thu Feb 03 18:07:04 2011 -0500
@@ -1,4 +1,5 @@
-"""Code for loading the tinyimages dataset."""
+"""Code for loading the tinyimages dataset.
+"""
 
 __authors__ = "James Bergstra"
 __copyright__ = "(c) 2010, Universite de Montreal"
@@ -18,7 +19,28 @@
     r.sort()
     return r
 
-_original='/data/lisa/data/tinyimages/tinyimages/original'
+_tinyimages_root='/data/lisa/data/tinyimages'
+_original=_tinyimages_root+'/tinyimages/original'
+_npy_file=_tinyimages_root+'/tinyimages.npy'
+_README_file=_tinyimages_root+'/README.txt'
+_README = """
+TinyImages is a dataset of 32x32 RGB images.
+This database contains 1608356 images, although there are something like
+80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/
+
+The database was downloaded from ***
+The dataset is described in ***.
+
+The large numpy in this directory is a mem-mappable tensor of the form:
+    [n_images, rows, cols, channels].
+The elements are unsigned integers from 0 to 255, that mean the conventional
+channel pixel intensity.
+
+The numpy file is generated by calling
+    pylearn.datasets.tinyimages.rebuild_numpy_file()
+
+"""
+
 
 def iterate_over_filenames(path=_original):
     """
@@ -78,7 +100,26 @@
         filename)
 
 
-n_images = 1608356 
+n_images = 1608356
+
+def get_memmapped_file(N=n_images, filename=_npy_file):
+    return numpy.memmap(filename,
+            dtype='uint8',
+            mode='r',
+            shape=(N,32,32,3))
+
+def rebuild_numpy_file(N=n_images, filename=_npy_file):
+    shp = (N,32,32,3)
+    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
+    open(_README_file, 'w').write(_README)
+    mmap = numpy.memmap(filename,
+            dtype='uint8',
+            mode='w+',
+            shape=shp)
+    ig = image_generator()
+    for ii in xrange(N):
+        mmap[ii] = ig.next()
+    mmap.flush()
 
 def main(argv=[]):
     if argv: