# HG changeset patch
# User James Bergstra
# Date 1296774424 18000
# Node ID 234e5e48d60df61b12079a1d93cb44fea6f540e0
# Parent 2b82c5a11512c21d978449d0cb1811b4538d5b22
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images

diff -r 2b82c5a11512 -r 234e5e48d60d pylearn/datasets/tinyimages.py
--- a/pylearn/datasets/tinyimages.py	Thu Feb 03 13:50:27 2011 -0500
+++ b/pylearn/datasets/tinyimages.py	Thu Feb 03 18:07:04 2011 -0500
@@ -1,4 +1,5 @@
-"""Code for loading the tinyimages dataset."""
+"""Code for loading the tinyimages dataset.
+"""
 
 __authors__ = "James Bergstra"
 __copyright__ = "(c) 2010, Universite de Montreal"
@@ -18,7 +19,28 @@
     r.sort()
     return r
 
-_original='/data/lisa/data/tinyimages/tinyimages/original'
+_tinyimages_root='/data/lisa/data/tinyimages'
+_original=_tinyimages_root+'/tinyimages/original'
+_npy_file=_tinyimages_root+'/tinyimages.npy'
+_README_file=_tinyimages_root+'/README.txt'
+_README = """
+TinyImages is a dataset of 32x32 RGB images.
+This database contains 1608356 images, although there are something like
+80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/
+
+The database was downloaded from ***
+The dataset is described in ***.
+
+The large numpy in this directory is a mem-mappable tensor of the form:
+  [n_images, rows, cols, channels].
+The elements are unsigned integers from 0 to 255, that mean the conventional
+channel pixel intensity.
+
+The numpy file is generated by calling
+    pylearn.datasets.tinyimages.rebuild_numpy_file()
+
+"""
+
 
 def iterate_over_filenames(path=_original):
     """
@@ -78,7 +100,52 @@
         filename)
 
 
-n_images = 1608356
+n_images = 1608356
+
+def get_memmapped_file(N=n_images, filename=_npy_file):
+    """Return a read-only memory map of the tinyimages array.
+
+    The file is a raw uint8 buffer written through numpy.memmap by
+    rebuild_numpy_file(); despite the '.npy' extension of the default
+    filename it has no .npy header, so map it rather than numpy.load() it.
+
+    :param N: number of images to map (length of the first dimension).
+    :param filename: path of the file to map.
+    :returns: numpy.memmap of dtype uint8 and shape (N, 32, 32, 3).
+    """
+    return numpy.memmap(filename,
+            dtype='uint8',
+            mode='r',
+            shape=(N,32,32,3))
+
+def rebuild_numpy_file(N=n_images, filename=_npy_file):
+    """Rebuild the memory-mappable image file and its README.
+
+    Writes _README to _README_file, then creates (or overwrites) `filename`
+    as a uint8 array of shape (N, 32, 32, 3) and fills it with the first N
+    items produced by image_generator().
+
+    :param N: number of images to store.
+    :param filename: path of the file to (re)create.
+    """
+    shp = (N,32,32,3)
+    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
+    # Close the README explicitly instead of relying on garbage collection
+    # of the temporary file object to flush it to disk.
+    readme = open(_README_file, 'w')
+    try:
+        readme.write(_README)
+    finally:
+        readme.close()
+    images = numpy.memmap(filename,
+            dtype='uint8',
+            mode='w+',
+            shape=shp)
+    ig = image_generator()
+    # ig.next() propagates StopIteration if fewer than N images are available.
+    for ii in xrange(N):
+        images[ii] = ig.next()
+    images.flush()
 
 def main(argv=[]):
     if argv: