changeset 1228:86d802226a97

added tinyimages support code in datasets
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 22 Sep 2010 19:22:24 -0400
parents d9f93923765f
children 515033d4d3bf
files pylearn/datasets/tinyimages.py
diffstat 1 files changed, 66 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/tinyimages.py	Wed Sep 22 19:22:24 2010 -0400
@@ -0,0 +1,66 @@
+"""Code for loading the tinyimages dataset."""
+
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "bergstrj@iro.umontreal.ca"
+
+import os, sys
+import PIL.Image
+import numpy
+
+def sorted_listdir(*path):
+    """Return the sorted list of entry names in the directory os.path.join(*path)."""
+    entries = os.listdir(os.path.join(*path))
+    return sorted(entries)
+
+_original='/data/lisa/data/tinyimages/tinyimages/original'  # default dataset root for the path= arguments below
+
+def iterate_over_filenames(path=_original):
+    """Yield a (root, letter, label, filename) tuple per entry under path/letter/label.
+
+    Each level is visited in sorted order; entries are not checked to be images."""
+    for letter_dir in sorted_listdir(path):
+        for label_dir in sorted_listdir(path, letter_dir):
+            parent = (path, letter_dir, label_dir)
+            for fname in sorted_listdir(*parent):
+                yield parent + (fname,)
+
+def load_image(path):
+    """Open the image file at `path` with PIL and return it as a numpy ndarray."""
+    # numpy.asarray converts the PIL image object into an array of its pixels
+    img = PIL.Image.open(path)
+    return numpy.asarray(img)
+
+def image_generator(path=_original):
+    """
+    Generate numpy ndarrays of shape (32,32,3) and dtype 'uint8' for each image
+    under `path`, in the order produced by iterate_over_filenames.
+
+    Be careful with this generator because the dataset in total is close to
+    20GB!  An AssertionError is raised for an image of any other shape/dtype.
+    """
+    for p in iterate_over_filenames(path=path):  # honour the caller's path
+        y = load_image(os.path.join(*p))  # p is (root, letter, label, filename)
+        assert y.shape == (32,32,3)
+        assert y.dtype == numpy.uint8
+        yield y
+
+def load_first_N(N):
+    """Yield the first N tuples of iterate_over_filenames() (fewer if it runs out)."""
+    import itertools  # function-scope import; the module does not import itertools
+    # islice ends cleanly at exhaustion; it.next() was Python-2-only and leaked StopIteration
+    for entry in itertools.islice(iterate_over_filenames(), max(N, 0)):
+        yield entry
+
+if __name__ == '__main__':
+    # Manual smoke test; it needs the dataset to be present on disk.
+    if 0:
+        # Disabled by default: counting walks every directory of the dataset.
+        def iter_len(x):
+            return sum(1 for _ in x)
+        # single-argument print(...) parses on both Python 2 and Python 3
+        print('got %i files' % iter_len(iterate_over_filenames()))
+
+    for p in load_first_N(10):
+        load_image(os.path.join(*p))