pylearn/datasets/tinyimages.py @ 1524:9d21919e2332 (autopep8)
author: Frederic Bastien <nouiz@nouiz.org>
date: Fri, 02 Nov 2012 13:02:18 -0400
"""Code for loading the tinyimages dataset. """ __authors__ = "James Bergstra" __copyright__ = "(c) 2010, Universite de Montreal" __license__ = "3-clause BSD License" __contact__ = "bergstrj@iro.umontreal.ca" import logging, os, sys import PIL.Image import numpy import pylearn.io.image_tiling logger = logging.getLogger('pylearn.datasets.tinyimages') def sorted_listdir(*path): r = os.listdir(os.path.join(*path)) r.sort() return r _tinyimages_root='/data/lisa/data/tinyimages' _original=_tinyimages_root+'/tinyimages/original' _npy_file=_tinyimages_root+'/tinyimages.npy' _shuffled_npy_file=_tinyimages_root+'/tinyimages_shuffled.npy' _shuffled_npy_seed=12345 _README_file=_tinyimages_root+'/README.txt' _README = """ TinyImages is a dataset of 32x32 RGB images. This database contains 1608356 images, although there are something like 80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/ The database was downloaded from *** The dataset is described in ***. The large numpy in this directory is a mem-mappable tensor of the form: [n_images, rows, cols, channels]. The elements are unsigned integers from 0 to 255, that mean the conventional channel pixel intensity. The numpy file is generated by calling pylearn.datasets.tinyimages.rebuild_numpy_file() """ def iterate_over_filenames(path=_original): """ Generate (root, letter, label, filename) tuples for each image file in the dataset. """ for letter in sorted_listdir(path): for label in sorted_listdir(path, letter): for img in sorted_listdir(path, letter, label): yield path, letter, label, img def load_image(path): """Return the image at `path` as a numpy ndarray """ rval = numpy.asarray(PIL.Image.open(path)) return rval def image_generator(path=_original): """ Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image in the dataset. Be careful with this generator because the dataset in total is close to 20GB! 
""" n_colour_conversions = 0 n_yielded = 0 for p in iterate_over_filenames(path=_original): y = load_image(os.path.join(*p)) n_yielded += 1 if y.shape == (32,32): logger.info("put %i'th/%i images in colour"%(n_colour_conversions, n_yielded)) y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy() n_colour_conversions += 1 assert y.shape == (32,32,3), (p,y.shape) assert y.dtype == numpy.uint8, (p,y.dtype) yield y def load_first_N(N): i = 0 it = iterate_over_filenames() while i < N: yield it.next() i +=1 def arrange_first_N_into_tiling(R,C, fileroot): R=int(R) C=int(C) A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))]) pylearn.io.image_tiling.tile_slices_to_image_uint8(A, tile_shape=(R,C)).save(fileroot+'_from_imgs.png') A = get_memmapped_file(R*C) pylearn.io.image_tiling.tile_slices_to_image_uint8(A, tile_shape=(R,C)).save(fileroot+'_memmapped.png') A = get_shuffled_memmapped_file(R*C) pylearn.io.image_tiling.tile_slices_to_image_uint8(A, tile_shape=(R,C)).save(fileroot+'_shuffled.png') n_images = 1608356 def get_memmapped_file(N=n_images, filename=_npy_file): return numpy.memmap(filename, dtype='uint8', mode='r', shape=(N,32,32,3)) def get_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file): return get_memmapped_file(N, filename) def rebuild_memmapped_file(N=n_images, filename=_npy_file): shp = (N,32,32,3) print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes' open(_README_file, 'w').write(_README) mmap = numpy.memmap(filename, dtype='uint8', mode='w+', #create over overwrite file for R/W shape=shp) ig = image_generator() for ii in xrange(N): mmap[ii] = ig.next() mmap.flush() def rebuild_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file, seed=_shuffled_npy_seed, orig_filename=_npy_file): try: orig = get_memmapped_file(N, orig_filename) except IOError: print >> sys.stderr, "pylearn.datasets.tinyimages: rebuild un-shuffled file first" raise shp = orig.shape print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes' mmap = numpy.memmap(filename, dtype='uint8', mode='w+',#create over overwrite file for R/W shape=shp) idxlist = numpy.arange(orig.shape[0]) numpy.random.RandomState(seed).shuffle(idxlist) assert idxlist[0] != 0 for i0, i1 in enumerate(idxlist): mmap[i0] = orig[i1] if not i0 % 10000: print>> sys.stderr, "%i/%i"%(i0, len(idxlist)) mmap.flush() def main(argv=[]): if argv: print "Saving images to ", argv[2] arrange_first_N_into_tiling( argv[0], argv[1], argv[2]) else: def iter_len(x): i = 0 for xx in x: i += 1 return i n_files = iter_len(iterate_over_filenames()) print 'got %i files' % n_files assert n_images == n_files for p in load_first_N(10): load_image(os.path.join(*p)) if __name__ == '__main__': sys.exit(main(sys.argv[1:]))