# view pylearn/datasets/tinyimages.py @ 1380:785aeb7a4df2
#
# added a fn to datasets/tiny_images to output a mosaic of images from the dataset
# author James Bergstra <bergstrj@iro.umontreal.ca>
# date Fri, 03 Dec 2010 09:09:00 -0500
# parents 976539956475
# children 234e5e48d60d
# line wrap: on
# line source

"""Code for loading the tinyimages dataset."""

__authors__ = "James Bergstra"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "bergstrj@iro.umontreal.ca"

import logging, os, sys
import PIL.Image
import numpy

import pylearn.io.image_tiling

# Module-level logger; `image_generator` reports greyscale-to-colour
# conversions on it at INFO level.
logger = logging.getLogger('pylearn.datasets.tinyimages')

def sorted_listdir(*path):
    """
    Return the entries of a directory as a sorted list.

    The positional arguments are path components; they are joined with
    `os.path.join` to form the directory that gets listed.
    """
    directory = os.path.join(*path)
    return sorted(os.listdir(directory))

# Default root directory of the dataset; used as the default `path` argument
# of `iterate_over_filenames` and `image_generator`.
_original='/data/lisa/data/tinyimages/tinyimages/original'

def iterate_over_filenames(path=_original):
    """
    Walk the three-level directory tree under `path` and generate one
    (path, letter, label, filename) tuple per entry at the deepest level.

    Every level is listed in sorted order.  Joining the members of a tuple
    with `os.path.join` gives the full path of the corresponding file.
    """
    for letter_dir in sorted_listdir(path):
        label_dirs = sorted_listdir(path, letter_dir)
        for label_dir in label_dirs:
            file_names = sorted_listdir(path, letter_dir, label_dir)
            for file_name in file_names:
                yield (path, letter_dir, label_dir, file_name)

def load_image(path):
    """Open the image file at `path` with PIL and return it as a numpy ndarray."""
    image = PIL.Image.open(path)
    return numpy.asarray(image)

def image_generator(path=_original):
    """
    Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image
    in the dataset rooted at `path`.

    Images are visited in the order produced by `iterate_over_filenames`.
    An image that loads with shape (32,32) is expanded to three identical
    channels so that every yielded array has shape (32,32,3).

    Be careful with this generator because the dataset in total is close to
    20GB!

    :param path: root directory of the dataset (defaults to `_original`).
    :raises AssertionError: if an image does not end up with shape (32,32,3)
        and dtype uint8.
    """
    n_colour_conversions = 0
    n_yielded = 0
    # Honour the caller's `path` rather than always reading the default
    # dataset location.
    for p in iterate_over_filenames(path=path):
        y = load_image(os.path.join(*p))
        n_yielded += 1
        if y.shape == (32,32):
            # Count this conversion before logging so that both numbers in
            # the message include the current image.
            n_colour_conversions += 1
            logger.info("put %i'th/%i images in colour",
                    n_colour_conversions, n_yielded)
            # Stack the single channel three times -> (3,32,32), then move
            # the channel axis last -> (32,32,3).
            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
        assert y.shape == (32,32,3), (p,y.shape)
        assert y.dtype == numpy.uint8, (p,y.dtype)
        yield y

def load_first_N(N):
    """
    Generate the first `N` (root, letter, label, filename) tuples produced by
    `iterate_over_filenames()` for the default dataset location.

    Despite the name, no image data is read here: only the filename tuples
    are yielded, and the caller decides whether to load them.  If the dataset
    holds fewer than `N` files the generator simply ends early.
    """
    if N <= 0:
        return
    # A plain `for` loop ends cleanly when the underlying iterator runs out
    # and avoids the Python-2-only `.next()` method.
    for i, p in enumerate(iterate_over_filenames()):
        yield p
        # Stop right after the N'th item so that no extra filename is pulled
        # from the underlying iterator.
        if i + 1 >= N:
            return


def arrange_first_N_into_tiling(R,C, filename):
    """
    Save a mosaic of the first R*C images of the dataset to `filename`.

    `R` and `C` are converted with `int()`, so strings (e.g. command-line
    arguments) are accepted.  They are used only through their product: the
    number of images taken from `image_generator()`.  The layout of the
    mosaic itself is left to `pylearn.io.image_tiling.tile_raster_images`.

    :param R: number of rows (int or int-like string).
    :param C: number of columns (int or int-like string).
    :param filename: destination passed to `save_tiled_raster_images`.
    :raises ValueError: from the reshape if fewer than R*C images are
        available.
    """
    # Local import: nothing else in this module needs itertools.
    import itertools

    R=int(R)
    C=int(C)
    n_tiles = R*C
    # islice pulls exactly n_tiles images from the generator, so no image
    # beyond the ones shown is loaded from disk.
    first_images = list(itertools.islice(image_generator(), n_tiles))
    # Converting to float32 already copies the data, so the per-image arrays
    # need no explicit copy.
    A = numpy.asarray(first_images, dtype='float32')
    # Parenthesised single argument: prints the same text whether `print` is
    # a statement or a function.
    print(A.shape)
    # Flatten each 32x32 image to a row of 1024 pixels, keeping the channel
    # axis last.
    A.shape = (n_tiles, 32*32, 3)
    # One 2-d array per channel is handed over; the fourth entry is None.
    # NOTE(review): presumably that slot is an absent alpha channel -- confirm
    # against pylearn.io.image_tiling.tile_raster_images.
    pylearn.io.image_tiling.save_tiled_raster_images(
        pylearn.io.image_tiling.tile_raster_images(
            (A[:,:,0], A[:,:,1], A[:,:,2], None),
            (32,32)),
        filename)


# Expected number of image files under the default dataset root; `main`
# asserts that the directory walk finds exactly this many.
n_images = 1608356

def main(argv=None):
    """
    Command-line entry point.

    With arguments `R C filename`, write a mosaic of the first R*C images to
    `filename` (see `arrange_first_N_into_tiling`).  With no arguments, run a
    sanity check on the default dataset location: count the image files,
    assert that the count equals `n_images`, and load the first 10 images.

    :param argv: list of command-line arguments without the program name;
        None or an empty list selects the sanity check.
    :returns: None on success, 2 if too few arguments were given.
    :raises AssertionError: if the number of files differs from `n_images`.
    """
    if argv:
        if len(argv) < 3:
            # Report a usage error instead of failing with a bare IndexError.
            sys.stderr.write('usage: %s R C filename\n' % sys.argv[0])
            return 2
        arrange_first_N_into_tiling( argv[0], argv[1], argv[2])
    else:
        # Count the files without building a list of every filename tuple.
        n_files = sum(1 for _ in iterate_over_filenames())
        # Parenthesised single argument: prints the same text whether `print`
        # is a statement or a function.
        print('got %i files' % n_files)
        assert n_images == n_files

        # Smoke test: make sure the first few images can actually be decoded.
        for p in load_first_N(10):
            load_image(os.path.join(*p))


# When run as a script, forward the command-line arguments (without the
# program name) to main() and use its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))