Mercurial > pylearn
changeset 1415:234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Thu, 03 Feb 2011 18:07:04 -0500 |
parents | 2b82c5a11512 |
children | 28b2f17991aa |
files | pylearn/datasets/tinyimages.py |
diffstat | 1 files changed, 44 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/pylearn/datasets/tinyimages.py Thu Feb 03 13:50:27 2011 -0500
+++ b/pylearn/datasets/tinyimages.py Thu Feb 03 18:07:04 2011 -0500
@@ -1,4 +1,5 @@
-"""Code for loading the tinyimages dataset."""
+"""Code for loading the tinyimages dataset.
+"""
 
 __authors__ = "James Bergstra"
 __copyright__ = "(c) 2010, Universite de Montreal"
@@ -18,7 +19,28 @@
     r.sort()
     return r
 
-_original='/data/lisa/data/tinyimages/tinyimages/original'
+_tinyimages_root='/data/lisa/data/tinyimages'
+_original=_tinyimages_root+'/tinyimages/original'
+_npy_file=_tinyimages_root+'/tinyimages.npy'
+_README_file=_tinyimages_root+'/README.txt'
+_README = """
+TinyImages is a dataset of 32x32 RGB images.
+This database contains 1608356 images, although there are something like
+80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/
+
+The database was downloaded from ***
+The dataset is described in ***.
+
+The large numpy in this directory is a mem-mappable tensor of the form:
+    [n_images, rows, cols, channels].
+The elements are unsigned integers from 0 to 255, that mean the conventional
+channel pixel intensity.
+
+The numpy file is generated by calling
+    pylearn.datasets.tinyimages.rebuild_numpy_file()
+
+"""
+
 
 def iterate_over_filenames(path=_original):
     """
@@ -78,7 +100,26 @@
             filename)
 
 
-n_images = 1608356
+n_images = 1608356
+
+def get_memmapped_file(N=n_images, filename=_npy_file):
+    return numpy.memmap(filename,
+            dtype='uint8',
+            mode='r',
+            shape=(N,32,32,3))
+
+def rebuild_numpy_file(N=n_images, filename=_npy_file):
+    shp = (N,32,32,3)
+    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
+    open(_README_file, 'w').write(_README)
+    mmap = numpy.memmap(filename,
+            dtype='uint8',
+            mode='w+',
+            shape=shp)
+    ig = image_generator()
+    for ii in xrange(N):
+        mmap[ii] = ig.next()
+    mmap.flush()
 
 def main(argv=[]):
     if argv: