Mercurial > pylearn
annotate pylearn/datasets/tinyimages.py @ 1524:9d21919e2332
autopep8
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Fri, 02 Nov 2012 13:02:18 -0400 |
parents | 31d8c6a0a70d |
children |
rev | line source |
---|---|
1415
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
1 """Code for loading the tinyimages dataset. |
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
2 """ |
1228
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
3 |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
4 __authors__ = "James Bergstra" |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
5 __copyright__ = "(c) 2010, Universite de Montreal" |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
6 __license__ = "3-clause BSD License" |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
7 __contact__ = "bergstrj@iro.umontreal.ca" |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
8 |
1285
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
9 import logging, os, sys |
1228
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
10 import PIL.Image |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
11 import numpy |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
12 |
1380
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
13 import pylearn.io.image_tiling |
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
14 |
1285
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
# Module-level logger; image_generator() reports greyscale-to-colour
# conversions through it at INFO level.
logger = logging.getLogger('pylearn.datasets.tinyimages')
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
16 |
1228
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def sorted_listdir(*path):
    """Return the entry names of a directory as a sorted list.

    The positional arguments are path components; they are combined with
    os.path.join to form the directory that gets listed.
    """
    directory = os.path.join(*path)
    return sorted(os.listdir(directory))
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
21 |
1415
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
# Site-specific location of the dataset; every other path hangs off it.
_tinyimages_root = '/data/lisa/data/tinyimages'
# Directory tree of the original image files (default for
# iterate_over_filenames and image_generator).
_original = '%s/tinyimages/original' % _tinyimages_root
# Mem-mappable file with the images in file order.
_npy_file = '%s/tinyimages.npy' % _tinyimages_root
# Mem-mappable file with the same images in shuffled order, and the seed
# used by default to produce that order.
_shuffled_npy_file = '%s/tinyimages_shuffled.npy' % _tinyimages_root
_shuffled_npy_seed = 12345
# README written next to the numpy files by rebuild_memmapped_file().
_README_file = '%s/README.txt' % _tinyimages_root
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
# Text written to _README_file by rebuild_memmapped_file().  It names the
# function that regenerates the numpy file, so it must be kept in sync with
# the actual function name in this module.
_README = """
TinyImages is a dataset of 32x32 RGB images.
This database contains 1608356 images, although there are something like
80 million of them here: http://groups.csail.mit.edu/vision/TinyImages/

The database was downloaded from ***
The dataset is described in ***.

The large numpy in this directory is a mem-mappable tensor of the form:
[n_images, rows, cols, channels].
The elements are unsigned integers from 0 to 255, that mean the conventional
channel pixel intensity.

The numpy file is generated by calling
pylearn.datasets.tinyimages.rebuild_memmapped_file()

"""
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
45 |
1228
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
46 |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def iterate_over_filenames(path=_original):
    """
    Walk the three-level directory tree rooted at `path` in sorted order and
    yield one (root, letter, label, filename) tuple per image file.

    Joining the four elements of a tuple with os.path.join gives the full
    path of the file.
    """
    for letter_dir in sorted_listdir(path):
        label_dirs = sorted_listdir(path, letter_dir)
        for label_dir in label_dirs:
            filenames = sorted_listdir(path, letter_dir, label_dir)
            for filename in filenames:
                yield path, letter_dir, label_dir, filename
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
56 |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def load_image(path):
    """Open the image file at `path` with PIL and return it as a numpy ndarray."""
    return numpy.asarray(PIL.Image.open(path))
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
61 |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def image_generator(path=_original):
    """
    Generate numpy ndarrays of size (32,32,3) and dtype 'uint8' for each image
    found under `path`, in the order given by iterate_over_filenames.

    Images that load as a single (32,32) channel are put in colour by
    replicating that channel three times.

    Be careful with this generator because the dataset in total is close to
    20GB!

    :param path: root of the image directory tree to walk
    :raises AssertionError: if an image is not (32,32,3) uint8 after the
        optional colour conversion
    """
    n_colour_conversions = 0
    n_yielded = 0
    # Honour the caller's `path` (it used to be ignored in favour of the
    # module default).
    for p in iterate_over_filenames(path=path):
        y = load_image(os.path.join(*p))
        n_yielded += 1
        if y.shape == (32, 32):
            logger.info("put %i'th/%i images in colour",
                        n_colour_conversions, n_yielded)
            # Stack the grey channel 3 times, then move the channel axis last.
            y = numpy.asarray([y, y, y]).transpose((1, 2, 0)).copy()
            n_colour_conversions += 1
        assert y.shape == (32, 32, 3), (p, y.shape)
        assert y.dtype == numpy.uint8, (p, y.dtype)
        yield y
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
82 |
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def load_first_N(N):
    """
    Yield the first N tuples produced by iterate_over_filenames().

    Despite the name, this yields (root, letter, label, filename) tuples, not
    loaded images.  If fewer than N files exist, the generator simply ends
    after the last one.
    """
    filenames = iterate_over_filenames()
    n_done = 0
    while n_done < N:
        try:
            entry = next(filenames)
        except StopIteration:
            # End cleanly rather than letting StopIteration escape the
            # generator body.
            return
        yield entry
        n_done += 1
86d802226a97
added tinyimages support code in datasets
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
89 |
1380
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
90 |
1458
31d8c6a0a70d
fixed backward logic regarding shuffled in tinyimages_op
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1454
diff
changeset
|
def arrange_first_N_into_tiling(R,C, fileroot):
    """
    Save three R-by-C mosaics of the first R*C images, one per data source:

    - fileroot + '_from_imgs.png'  : images read through image_generator()
    - fileroot + '_memmapped.png'  : images from get_memmapped_file()
    - fileroot + '_shuffled.png'   : images from get_shuffled_memmapped_file()

    R and C are passed through int(), so strings (e.g. command-line
    arguments) are accepted.
    """
    n_rows = int(R)
    n_cols = int(C)
    n_tiles = n_rows * n_cols
    grid = (n_rows, n_cols)

    def save_tiling(images, suffix):
        # Tile `images` into one mosaic and save it under fileroot + suffix.
        mosaic = pylearn.io.image_tiling.tile_slices_to_image_uint8(
            images, tile_shape=grid)
        mosaic.save(fileroot + suffix)

    from_files = numpy.asarray(
        [img.copy() for img, _ in zip(image_generator(), xrange(n_tiles))])
    save_tiling(from_files, '_from_imgs.png')
    save_tiling(get_memmapped_file(n_tiles), '_memmapped.png')
    save_tiling(get_shuffled_memmapped_file(n_tiles), '_shuffled.png')
1380
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
103 |
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
104 |
1415
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
# Number of images in the local dataset (the same count is quoted in
# _README).  It is the default first dimension of the memmapped files, and
# main() asserts that it equals the number of files found on disk.
n_images = 1608356
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
106 |
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
def get_memmapped_file(N=n_images, filename=_npy_file):
    """
    Open `filename` read-only as a numpy memmap of dtype uint8 and shape
    (N,32,32,3), i.e. [n_images, rows, cols, channels].
    """
    shape = (N, 32, 32, 3)
    return numpy.memmap(filename, dtype='uint8', mode='r', shape=shape)
1454
283fb236f104
datasets/tinyimages can generate a shuffled file
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1415
diff
changeset
|
def get_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file):
    """
    Same as get_memmapped_file, but defaulting to the shuffled numpy file.
    """
    shuffled = get_memmapped_file(N, filename)
    return shuffled
1415
234e5e48d60d
added datasets.tinyimages.rebuild_numpy_file method to build 5GB memmappable file of all images
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1380
diff
changeset
|
114 |
1454
283fb236f104
datasets/tinyimages can generate a shuffled file
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1415
diff
changeset
|
def rebuild_memmapped_file(N=n_images, filename=_npy_file):
    """
    (Re)create the mem-mappable numpy file holding the first N images.

    Writes _README to _README_file, creates or overwrites `filename` as a
    uint8 memmap of shape (N,32,32,3), and fills it with the first N arrays
    from image_generator().

    :param N: number of images to store
    :param filename: path of the file to create or overwrite
    """
    shp = (N, 32, 32, 3)
    sys.stderr.write("pylearn.datasets.tinyimages rebuilding %s %s %s gigabytes\n"
                     % (filename, shp, N*32*32*3 / float(1024**3)))
    # Close the README deterministically instead of relying on garbage
    # collection of the file object.
    with open(_README_file, 'w') as readme:
        readme.write(_README)
    out = numpy.memmap(filename,
            dtype='uint8',
            mode='w+',  # create or overwrite file for R/W
            shape=shp)
    images = image_generator()
    for ii in xrange(N):
        out[ii] = next(images)
    out.flush()
1285
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
127 |
1454
283fb236f104
datasets/tinyimages can generate a shuffled file
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1415
diff
changeset
|
def rebuild_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file,
        seed=_shuffled_npy_seed,
        orig_filename=_npy_file):
    """
    (Re)create the shuffled mem-mappable file from the un-shuffled one.

    Opens `orig_filename` via get_memmapped_file(N, ...), creates or
    overwrites `filename` with the same shape, and copies the images in the
    order given by shuffling arange(N) with numpy.random.RandomState(seed).
    Progress is written to stderr every 10000 images.

    :param N: number of images in the un-shuffled file
    :param filename: path of the shuffled file to create or overwrite
    :param seed: seed of the RandomState that defines the permutation
    :param orig_filename: path of the existing un-shuffled file
    :raises IOError: re-raised (after a hint on stderr) when the un-shuffled
        file cannot be opened
    """
    try:
        orig = get_memmapped_file(N, orig_filename)
    except IOError:
        sys.stderr.write("pylearn.datasets.tinyimages: rebuild un-shuffled file first\n")
        raise
    shp = orig.shape
    sys.stderr.write("pylearn.datasets.tinyimages rebuilding %s %s %s gigabytes\n"
                     % (filename, shp, N*32*32*3 / float(1024**3)))
    shuffled = numpy.memmap(filename,
            dtype='uint8',
            mode='w+',  # create or overwrite file for R/W
            shape=shp)
    idxlist = numpy.arange(orig.shape[0])
    numpy.random.RandomState(seed).shuffle(idxlist)
    # Sanity check only: a valid permutation may leave index 0 in place (and
    # always does for a single image), so report it instead of aborting.
    if len(idxlist) > 1 and idxlist[0] == 0:
        logger.warning("shuffle with seed %s left the first image in place",
                       seed)
    n_total = len(idxlist)
    for i0, i1 in enumerate(idxlist):
        shuffled[i0] = orig[i1]
        if not i0 % 10000:
            sys.stderr.write("%i/%i\n" % (i0, n_total))
    shuffled.flush()
283fb236f104
datasets/tinyimages can generate a shuffled file
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1415
diff
changeset
|
150 |
1380
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
def main(argv=None):
    """
    Command-line entry point.

    With arguments ``R C fileroot``: save R-by-C mosaics of the first R*C
    images via arrange_first_N_into_tiling.

    Without arguments: count the image files on disk, check the count against
    n_images, and load the first 10 images as a smoke test.

    :param argv: list of command-line arguments (without the program name)
    :returns: 1 when arguments are given but fewer than three; None otherwise
    """
    if argv:
        if len(argv) < 3:
            sys.stderr.write("usage: tinyimages.py [R C fileroot]\n")
            return 1
        sys.stdout.write("Saving images to  %s\n" % argv[2])
        arrange_first_N_into_tiling(argv[0], argv[1], argv[2])
    else:
        # Count lazily; the file list is far too long to materialise.
        n_files = sum(1 for _ in iterate_over_filenames())
        sys.stdout.write('got %i files\n' % n_files)
        assert n_images == n_files

        for p in load_first_N(10):
            load_image(os.path.join(*p))
785aeb7a4df2
added a fn to datasets/tiny_images to output a mosaic of images from the dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1285
diff
changeset
|
167 |
1285
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
168 |
976539956475
adding tinyimages
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
1228
diff
changeset
|
if __name__ == '__main__':
    # Run as a script: hand the command-line arguments (minus the program
    # name) to main() and use its return value as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)