view pylearn/datasets/tinyimages.py @ 1415:234e5e48d60d

added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 03 Feb 2011 18:07:04 -0500
parents 785aeb7a4df2
children 283fb236f104
line wrap: on
line source

"""Code for loading the tinyimages dataset.
"""

__authors__ = "James Bergstra"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "bergstrj@iro.umontreal.ca"

import logging, os, sys
import PIL.Image
import numpy

import pylearn.io.image_tiling

logger = logging.getLogger('pylearn.datasets.tinyimages')

def sorted_listdir(*path):
    """Return the directory entries of ``os.path.join(*path)``, sorted.

    Sorting makes iteration order over the dataset deterministic.
    """
    return sorted(os.listdir(os.path.join(*path)))

# Filesystem layout of the local copy of the dataset.
_tinyimages_root='/data/lisa/data/tinyimages'
# Root of the original directory tree: <root>/<letter>/<label>/<image file>.
_original=_tinyimages_root+'/tinyimages/original'
# Mem-mappable uint8 tensor built by rebuild_numpy_file().
_npy_file=_tinyimages_root+'/tinyimages.npy'
# README written next to the .npy file by rebuild_numpy_file().
_README_file=_tinyimages_root+'/README.txt'
# Contents of that README (the *** placeholders are intentional in SOURCE).
_README = """
TinyImages is a dataset of 32x32 RGB images.
This database contains 1608356 images, although there are something like
80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/

The database was downloaded from ***
The dataset is described in ***.

The large numpy in this directory is a mem-mappable tensor of the form:
    [n_images, rows, cols, channels].
The elements are unsigned integers from 0 to 255, that mean the conventional
channel pixel intensity.

The numpy file is generated by calling
    pylearn.datasets.tinyimages.rebuild_numpy_file()

"""


def iterate_over_filenames(path=_original):
    """Yield a ``(root, letter, label, filename)`` tuple for every image
    file in the dataset.

    The tree is walked in sorted order at every level, so the sequence of
    tuples is deterministic across runs.
    """
    for letter_dir in sorted_listdir(path):
        for label_dir in sorted_listdir(path, letter_dir):
            for image_name in sorted_listdir(path, letter_dir, label_dir):
                yield path, letter_dir, label_dir, image_name

def load_image(path):
    """Open the image file at `path` and return it as a numpy ndarray."""
    img = PIL.Image.open(path)
    return numpy.asarray(img)

def image_generator(path=_original):
    """
    Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image
    in the dataset rooted at `path`.

    Greyscale images (stored on disk as (32,32)) are replicated across three
    channels so that every yielded array has shape (32,32,3).

    Be careful with this generator because the dataset in total is close to
    20GB!
    """
    n_colour_conversions = 0
    n_yielded = 0
    # BUG FIX: this previously iterated over `_original` unconditionally,
    # silently ignoring the `path` argument.
    for p in iterate_over_filenames(path=path):
        y = load_image(os.path.join(*p))
        n_yielded += 1
        if y.shape == (32,32):
            logger.info("put %i'th/%i images in colour"%(n_colour_conversions, n_yielded))
            # replicate the single greyscale plane into three channels
            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
            n_colour_conversions += 1
        assert y.shape == (32,32,3), (p,y.shape)
        assert y.dtype == numpy.uint8, (p,y.dtype)
        yield y

def load_first_N(N):
    """Yield the first `N` ``(root, letter, label, filename)`` tuples from
    `iterate_over_filenames`.

    Note: despite the name this yields filename tuples, not loaded images.
    """
    it = iterate_over_filenames()
    i = 0
    while i < N:
        # builtin next() instead of it.next(): idiomatic, and works on
        # Python 2.6+ as well as Python 3
        yield next(it)
        i += 1


def arrange_first_N_into_tiling(R,C, filename):
    """Save the first R*C dataset images to `filename` as an R-by-C mosaic.

    :param R: number of tile rows (coerced to int)
    :param C: number of tile columns (coerced to int)
    :param filename: path where the tiled raster image is written
    """
    R=int(R)
    C=int(C)
    # zip() against xrange(R*C) truncates the (huge) image generator to the
    # first R*C images.  NOTE(review): the float32 conversion is presumably
    # what tile_raster_images expects -- confirm against
    # pylearn.io.image_tiling.
    A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))],
            dtype='float32')
    print A.shape
    # flatten each 32x32 image to one row per channel; tile_raster_images
    # takes an (R, G, B, alpha) tuple of 2-D arrays
    A.shape = (R*C, 32*32,3)
    pylearn.io.image_tiling.save_tiled_raster_images(
        pylearn.io.image_tiling.tile_raster_images(
            (A[:,:,0], A[:,:,1], A[:,:,2], None),
            (32,32)),
        filename)


# Total number of images in this local copy of the dataset (see _README);
# main() re-verifies this count against the directory tree.
n_images = 1608356

def get_memmapped_file(N=n_images, filename=_npy_file):
    """Open the pre-built image tensor as a read-only numpy memmap.

    The returned array has shape ``(N, 32, 32, 3)`` and dtype uint8; it is
    backed by `filename` (built by `rebuild_numpy_file`).
    """
    shape = (N, 32, 32, 3)
    return numpy.memmap(filename, dtype='uint8', mode='r', shape=shape)

def rebuild_numpy_file(N=n_images, filename=_npy_file):
    """Rebuild the mem-mappable uint8 image tensor on disk.

    Writes `_README_file` describing the layout, then streams every image
    from `image_generator` into a ``(N, 32, 32, 3)`` uint8 memmap at
    `filename` (about 5GB for the full dataset).
    """
    shp = (N,32,32,3)
    # sys.stderr.write instead of the py2-only `print >>` statement
    sys.stderr.write("pylearn.datasets.tinyimages rebuilding %s %s %s gigabytes\n"
            % (filename, shp, N*32*32*3 / float(1024**3)))
    # BUG FIX: the file handle was previously leaked
    # (open(_README_file, 'w').write(_README) without a close)
    with open(_README_file, 'w') as readme:
        readme.write(_README)
    mmap = numpy.memmap(filename,
            dtype='uint8',
            mode='w+',
            shape=shp)
    ig = image_generator()
    for ii in xrange(N):
        # builtin next() instead of ig.next()
        mmap[ii] = next(ig)
    mmap.flush()

def main(argv=[]):
    """Command-line entry point.

    With arguments, ``argv = (R, C, filename)``: write an RxC tiling of the
    first R*C images to `filename`.  With no arguments: walk the whole
    dataset tree, check the file count against `n_images`, and smoke-test
    loading the first 10 images.

    NOTE(review): mutable default argument ``argv=[]`` -- harmless here
    because argv is never mutated, but non-idiomatic.
    """
    if argv:
        arrange_first_N_into_tiling( argv[0], argv[1], argv[2])
    else:
        def iter_len(x):
            # count the elements of an arbitrary iterable
            i = 0
            for xx in x:
                i += 1
            return i
        n_files = iter_len(iterate_over_filenames())
        print 'got %i files' % n_files
        # sanity check: the on-disk tree must match the advertised count
        # (NOTE(review): assert is stripped under -O)
        assert n_images == n_files

        for p in load_first_N(10):
            load_image(os.path.join(*p))


# Script entry point: forwards command-line arguments (minus the program
# name) to main() and uses its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))