annotate pylearn/datasets/cifar10.py @ 1484:83d3c9ee6d65

* changed MNIST dataset to use config.get_filepath_in_roots mechanism
author gdesjardins
date Tue, 05 Jul 2011 11:01:51 -0400
parents 5ae77ac21609
children
rev   line source
855
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
1 """
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
2 Various routines to load/access CIFAR-10 data.
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
3 """
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
4 from __future__ import absolute_import
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
5
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
6 import os
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
7 import numpy
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
8 import cPickle
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
9
936
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
10 import logging
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
11 _logger = logging.getLogger('pylearn.datasets.cifar10')
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
12
855
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
13 from pylearn.datasets.config import data_root # config
936
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
14 from pylearn.datasets.dataset import Dataset # dataset.py
855
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
15
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
def unpickle(file):
    """Load one pickled CIFAR-10 batch file and return its contents.

    :param file: name of the batch file (e.g. 'data_batch_1' or
        'test_batch'), looked up under
        <data_root()>/cifar10/cifar-10-batches-py/.
    :returns: the object stored in the pickle.  The callers in this module
        index it with the 'data' and 'labels' keys.

    Any exception raised by `open` or `cPickle.load` propagates to the
    caller; the file handle is closed in either case.

    NOTE(review): unpickling can execute arbitrary code, so the data root
    should only ever contain trusted files.
    """
    fname = os.path.join(data_root(),
                         'cifar10',
                         'cifar-10-batches-py',
                         file)
    _logger.info('loading file %s' % fname)
    fo = open(fname, 'rb')
    # try/finally (rather than `with`) keeps this valid on old interpreters
    # while still releasing the handle when unpickling fails.
    try:
        return cPickle.load(fo)
    finally:
        fo.close()
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
26
936
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
27 class cifar10(object):
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
28 """
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
29
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
30 This class gives access to meta-data of cifar10 dataset.
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
31 The constructor loads it from <data>/cifar10/cifar-10-batches-py/
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
32 where <data> is the pylearn data root (os.getenv('PYLEARN_DATA_ROOT')).
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
33
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
34 Attributes:
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
35
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
36 self.img_shape - the unrasterized image shape of each row in all.x
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
37 self.img_size - the number of pixels in (aka length of) each row
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
38 self.n_classes - the number of labels in the dataset (10)
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
39
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
40 self.all.x matrix - all train and test images as rasterized rows
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
41 self.all.y vector - all train and test labels as integers
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
42 self.train.x matrix - first ntrain rows of all.x
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
43 self.train.y matrix - first ntrain elements of all.y
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
44 self.valid.x matrix - rows ntrain to ntrain+nvalid of all.x
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
45 self.valid.y vector - elements ntrain to ntrain+nvalid of all.y
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
46 self.test.x matrix - rows ntrain+valid to end of all.x
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
47 self.test.y vector - elements ntrain+valid to end of all.y
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
48
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
49 """
855
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
50
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
51 def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
52 assert ntrain + nvalid <= 50000
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
53 assert ntest <= 10000
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
54
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
55 self.img_shape = (3,32,32)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
56 self.img_size = numpy.prod(self.img_shape)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
57 self.n_classes = 10
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
58
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
59 lenx = numpy.ceil((ntrain + nvalid) / 10000.)*10000
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
60 x = numpy.zeros((lenx,self.img_size), dtype=dtype)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
61 y = numpy.zeros(lenx, dtype=dtype)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
62
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
63 fnames = ['data_batch_%i'%i for i in range(1,6)]
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
64
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
65 # load train and validation data
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
66 nloaded = 0
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
67 for i, fname in enumerate(fnames):
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
68 data = unpickle(fname)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
69 x[i*10000:(i+1)*10000, :] = data['data']
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
70 y[i*10000:(i+1)*10000] = data['labels']
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
71
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
72 nloaded += 10000
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
73 if nloaded >= ntrain + nvalid + ntest: break;
936
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
74
f732ec90e249 added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 855
diff changeset
75 self.all = Dataset.Obj(x=x, y=y)
855
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
76
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
77 self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
78 self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
79 y=y[ntrain:ntrain+nvalid])
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
80
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
81 # load test data
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
82 data = unpickle('test_batch')
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
83 self.test = Dataset.Obj(x=data['data'][0:ntest],
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
84 y=data['labels'][0:ntest])
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
85
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
86 def preprocess(self, x):
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
87 return numpy.float64( x *1.0 / 255.0)
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
88
553bf0861fb5 adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff changeset
def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
    """Return a `cifar10` instance built with small default split sizes.

    The arguments are forwarded unchanged to the `cifar10` constructor; the
    defaults are 1000 training, 200 validation and 200 test examples.
    """
    small_dataset = cifar10(dtype=dtype,
                            ntrain=ntrain,
                            nvalid=nvalid,
                            ntest=ntest)
    return small_dataset
951
5d70dfc70ec0 added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 936
diff changeset
91
1281
5ae77ac21609 extended cifar10.tile_rasterized_examples to work for patches too
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 951
diff changeset
def tile_rasterized_examples(X, img_shape=(32,32)):
    """Returns an ndarray that is ready to be passed to `image_tiling.save_tiled_raster_images`

    This function is for the `x` matrices in the cifar dataset, or for the weight matrices
    (filters) used to multiply them.

    :param X: 2-d array whose rows each hold three consecutive planes of
        img_shape[0] * img_shape[1] values (passed on as r, g, b in order).
    :param img_shape: (rows, cols) of a single plane; use a smaller shape
        for patches.
    :returns: the value produced by `tile_raster_images`.
    """
    plane_len = img_shape[0] * img_shape[1]
    # every row must hold exactly three planes
    assert plane_len * 3 == X.shape[1], (plane_len, X.shape)

    as_float = X.astype('float32')
    red = as_float[:, :plane_len]
    green = as_float[:, plane_len:plane_len * 2]
    blue = as_float[:, plane_len * 2:]

    # imported here, right before use, as in the original code
    from pylearn.io.image_tiling import tile_raster_images
    # the fourth entry is left as None (presumably the alpha plane -- confirm
    # against tile_raster_images)
    planes = (red, green, blue, None)
    return tile_raster_images(planes, img_shape=img_shape)
5d70dfc70ec0 added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 936
diff changeset
107
5d70dfc70ec0 added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 936
diff changeset
108