# HG changeset patch
# User James Bergstra
# Date 1285197744 14400
# Node ID 86d802226a97a6b5766aef1fa959584bd6776698
# Parent d9f93923765fd52f5ff468661ed708897d28327a
# added tinyimages support code in datasets
#
# Based on pylearn/datasets/tinyimages.py as added (new file) by
# diff -r d9f93923765f -r 86d802226a97, Wed Sep 22 19:22:24 2010 -0400.
"""Code for loading the tinyimages dataset."""

__authors__ = "James Bergstra"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "bergstrj@iro.umontreal.ca"

import os
import sys

import numpy


def sorted_listdir(*path):
    """
    Return the sorted list of entry names in the directory obtained by
    joining the `path` components with os.path.join.
    """
    return sorted(os.listdir(os.path.join(*path)))


_original = '/data/lisa/data/tinyimages/tinyimages/original'


def iterate_over_filenames(path=_original):
    """
    Generate (root, letter, label, filename) tuples for each image file in the
    dataset.

    `path` is the dataset root; it is expected to contain three directory
    levels (root/letter/label/filename).  Entries are visited in sorted order
    at every level, so the iteration order is deterministic.
    """
    for letter in sorted_listdir(path):
        for label in sorted_listdir(path, letter):
            for img in sorted_listdir(path, letter, label):
                yield path, letter, label, img


def load_image(path):
    """
    Open the image file at `path` with PIL and return its pixel data as a
    numpy ndarray.
    """
    # PIL is imported here rather than at module level so that the filename
    # helpers in this module remain usable when PIL is not installed.
    import PIL.Image
    return numpy.asarray(PIL.Image.open(path))


def image_generator(path=_original):
    """
    Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image
    in the dataset rooted at `path`.

    Be careful with this generator because the dataset in total is close to
    20GB!
    """
    # Walk the requested root (not unconditionally the default one), and join
    # each (root, letter, label, filename) tuple into the single file path
    # that load_image expects.
    for parts in iterate_over_filenames(path=path):
        y = load_image(os.path.join(*parts))
        assert y.shape == (32, 32, 3)
        assert y.dtype == numpy.uint8
        yield y


def load_first_N(N, path=_original):
    """
    Generate the first `N` (root, letter, label, filename) tuples of the
    dataset rooted at `path`.

    Fewer than `N` tuples are generated if the dataset holds fewer files;
    nothing is generated (and the filesystem is not touched) if N <= 0.
    """
    if N <= 0:
        return
    # Iterating with a for-loop (rather than calling next() by hand) ends the
    # generator cleanly when the dataset runs out before N entries.
    for count, entry in enumerate(iterate_over_filenames(path), 1):
        yield entry
        if count >= N:
            return


if __name__ == '__main__':
    # Counting the files walks every directory of the dataset, so it is
    # switched off by default.
    if 0:
        print('got %i files' % sum(1 for _ in iterate_over_filenames()))

    # Smoke test: load the first few images.
    for p in load_first_N(10):
        load_image(os.path.join(*p))