
"""Code for loading the tinyimages dataset.
"""

__authors__ = "James Bergstra"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "bergstrj@iro.umontreal.ca"

import logging, os, sys
import PIL.Image
import numpy

import pylearn.io.image_tiling

logger = logging.getLogger('pylearn.datasets.tinyimages')

def sorted_listdir(*path):
    """Return the contents of os.path.join(*path), sorted alphabetically
    for a deterministic traversal order."""
    r = os.listdir(os.path.join(*path))
    r.sort()
    return r

_tinyimages_root='/data/lisa/data/tinyimages'
_original=_tinyimages_root+'/tinyimages/original'
_npy_file=_tinyimages_root+'/tinyimages.npy'
_shuffled_npy_file=_tinyimages_root+'/tinyimages_shuffled.npy'
_shuffled_npy_seed=12345
_README_file=_tinyimages_root+'/README.txt'
_README = """
TinyImages is a dataset of 32x32 RGB images.
This copy of the database contains 1608356 images; the full dataset of
roughly 80 million images lives at http://groups.csail.mit.edu/vision/TinyImages/

The database was downloaded from ***
The dataset is described in ***.

The large numpy file in this directory is a memory-mappable tensor of shape:
    [n_images, rows, cols, channels].
The elements are unsigned 8-bit integers from 0 to 255, representing
conventional pixel channel intensities.

The numpy file is generated by calling
    pylearn.datasets.tinyimages.rebuild_memmapped_file()

"""


def iterate_over_filenames(path=_original):
    """
    Generate (root, letter, label, filename) tuples for each image file in the
    dataset.
    """
    for letter in sorted_listdir(path):
        for label in sorted_listdir(path, letter):
            for img in sorted_listdir(path, letter, label):
                yield path, letter, label, img
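
# Example usage (a sketch; assumes the dataset directory at `_original`
# exists and is readable):
#
#     for root, letter, label, fname in iterate_over_filenames():
#         print os.path.join(root, letter, label, fname)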

def load_image(path):
    """Return the image at `path` as a numpy ndarray """
    rval = numpy.asarray(PIL.Image.open(path))
    return rval

def image_generator(path=_original):
    """
    Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image
    in the dataset.

    Be careful with this generator because the dataset in total is close to
    20GB!
    """
    n_colour_conversions = 0
    n_yielded = 0
    for p in iterate_over_filenames(path=path):
        y = load_image(os.path.join(*p))
        n_yielded += 1
        if y.shape == (32,32):
            # greyscale image: replicate the single channel three times
            # to get an RGB array of shape (32,32,3)
            logger.info("converting %i'th of %i images to colour"%(
                n_colour_conversions, n_yielded))
            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
            n_colour_conversions += 1
        assert y.shape == (32,32,3), (p,y.shape)
        assert y.dtype == numpy.uint8, (p,y.dtype)
        yield y
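
# Example usage (a sketch): the generator is lazy, so only the images
# actually requested are read from disk:
#
#     ig = image_generator()
#     first_ten = [ig.next() for _ in xrange(10)]  # ten (32,32,3) uint8 arrays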

def load_first_N(N):
    """Yield the (root, letter, label, filename) tuples of the first `N`
    images in the dataset."""
    i = 0
    it = iterate_over_filenames()
    while i < N:
        yield it.next()
        i += 1


def arrange_first_N_into_tiling(R, C, fileroot):
    """Save R-by-C tilings of the first R*C images, built from the raw image
    files, the memmapped file, and the shuffled memmapped file, so the three
    sources can be compared visually."""
    R = int(R)
    C = int(C)
    A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))])
    pylearn.io.image_tiling.tile_slices_to_image_uint8(A,
            tile_shape=(R,C)).save(fileroot+'_from_imgs.png')
    A = get_memmapped_file(R*C)
    pylearn.io.image_tiling.tile_slices_to_image_uint8(A,
            tile_shape=(R,C)).save(fileroot+'_memmapped.png')
    A = get_shuffled_memmapped_file(R*C)
    pylearn.io.image_tiling.tile_slices_to_image_uint8(A,
            tile_shape=(R,C)).save(fileroot+'_shuffled.png')


n_images = 1608356  # total number of images in this copy of the dataset

def get_memmapped_file(N=n_images, filename=_npy_file):
    """Return the first `N` images as a read-only memmapped uint8 ndarray
    of shape (N, 32, 32, 3)."""
    return numpy.memmap(filename,
            dtype='uint8',
            mode='r',
            shape=(N,32,32,3))

def get_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file):
    """Like `get_memmapped_file`, but read from the row-shuffled copy."""
    return get_memmapped_file(N, filename)
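
# Example usage (a sketch; assumes the .npy files have already been built
# by the rebuild_* functions below):
#
#     x = get_shuffled_memmapped_file()
#     print x.shape                               # (1608356, 32, 32, 3)
#     batch = x[:100].astype('float32') / 255.0   # only this slice is read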

def rebuild_memmapped_file(N=n_images, filename=_npy_file):
    """Read all `N` images from the original image files and write them into
    a fresh uint8 memmapped tensor at `filename` (also rewrites the README
    file)."""
    shp = (N,32,32,3)
    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
    open(_README_file, 'w').write(_README)
    mmap = numpy.memmap(filename,
            dtype='uint8',
            mode='w+', # create or overwrite the file for reading/writing
            shape=shp)
    ig = image_generator()
    for ii in xrange(N):
        mmap[ii] = ig.next()
    mmap.flush()

def rebuild_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file,
        seed=_shuffled_npy_seed,
        orig_filename=_npy_file):
    """Write a row-shuffled copy of the memmapped file at `orig_filename` to
    `filename`, using a fixed seed so the permutation is reproducible."""
    try:
        orig = get_memmapped_file(N, orig_filename)
    except IOError:
        print >> sys.stderr, "pylearn.datasets.tinyimages: rebuild un-shuffled file first"
        raise
    shp = orig.shape
    print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes'
    mmap = numpy.memmap(filename,
            dtype='uint8',
            mode='w+', # create or overwrite the file for reading/writing
            shape=shp)
    idxlist = numpy.arange(orig.shape[0])
    numpy.random.RandomState(seed).shuffle(idxlist)
    # sanity check: the shuffle should have moved the first element
    assert idxlist[0] != 0
    for i0, i1 in enumerate(idxlist):
        mmap[i0] = orig[i1]
        if not i0 % 10000:
            print >> sys.stderr, "%i/%i" % (i0, len(idxlist))
    mmap.flush()
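
# The shuffled order is reproducible from the seed alone, so the permutation
# can be reconstructed without touching the image files (a sketch):
#
#     idxlist = numpy.arange(n_images)
#     numpy.random.RandomState(_shuffled_npy_seed).shuffle(idxlist)
#     # now shuffled_file[i] == original_file[idxlist[i]]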

def main(argv=[]):
    if argv:
        # expects exactly three arguments: n_rows, n_cols, output fileroot
        R, C, fileroot = argv
        print "Saving images to", fileroot
        arrange_first_N_into_tiling(R, C, fileroot)
    else:
        def iter_len(x):
            i = 0
            for xx in x:
                i += 1
            return i
        n_files = iter_len(iterate_over_filenames())
        print 'got %i files' % n_files
        assert n_images == n_files

        for p in load_first_N(10):
            load_image(os.path.join(*p))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))