Mercurial > pylearn
annotate pylearn/datasets/cifar10.py @ 1484:83d3c9ee6d65
* changed MNIST dataset to use config.get_filepath_in_roots mechanism
author | gdesjardins |
---|---|
date | Tue, 05 Jul 2011 11:01:51 -0400 |
parents | 5ae77ac21609 |
children |
rev | line source |
---|---|
855
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
1 """ |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
2 Various routines to load/access CIFAR-10 data. |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
3 """ |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
4 from __future__ import absolute_import |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
5 |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
6 import os |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
7 import numpy |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
8 import cPickle |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
9 |
936
f732ec90e249
added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
855
diff
changeset
|
10 import logging |
f732ec90e249
added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
855
diff
changeset
|
11 _logger = logging.getLogger('pylearn.datasets.cifar10') |
f732ec90e249
added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
855
diff
changeset
|
12 |
855
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
13 from pylearn.datasets.config import data_root # config |
936
f732ec90e249
added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
855
diff
changeset
|
14 from pylearn.datasets.dataset import Dataset # dataset.py |
855
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
15 |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
def unpickle(file):
    """Load one pickled CIFAR-10 batch file and return its contents.

    :param file: name of a file inside
        <data_root()>/cifar10/cifar-10-batches-py/ (the callers in this
        module pass 'data_batch_1' ... 'data_batch_5' and 'test_batch').
    :returns: whatever object was pickled in that file; the callers in this
        module index it with the keys 'data' and 'labels'.
    :raises IOError: if the file cannot be opened.
    """
    fname = os.path.join(data_root(),
                         'cifar10',
                         'cifar-10-batches-py',
                         file)
    # Let the logging module do the formatting, so nothing is formatted
    # when INFO records are filtered out.
    _logger.info('loading file %s', fname)
    # NOTE(security): unpickling can execute arbitrary code; this must only
    # ever be pointed at trusted dataset files.
    # The `with` block guarantees the handle is closed even if load() raises.
    with open(fname, 'rb') as fo:
        return cPickle.load(fo)
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
26 |
936
f732ec90e249
added code comments and "all" attribute to datasets.cifar10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
855
diff
changeset
|
class cifar10(object):
    """

    This class gives access to the cifar10 dataset and its meta-data.
    The constructor loads it from <data>/cifar10/cifar-10-batches-py/
    where <data> is the pylearn data root (os.getenv('PYLEARN_DATA_ROOT')).

    Attributes:

    self.img_shape - the unrasterized image shape of each row in all.x
    self.img_size - the number of pixels in (aka length of) each row
    self.n_classes - the number of labels in the dataset (10)

    self.all.x matrix - the rows of every training batch that was loaded
                        (enough whole batches of 10000 images to cover
                        ntrain+nvalid); test images are NOT included
    self.all.y vector - the labels matching the rows of all.x
    self.train.x matrix - first ntrain rows of all.x
    self.train.y vector - first ntrain elements of all.y
    self.valid.x matrix - rows ntrain to ntrain+nvalid of all.x
    self.valid.y vector - elements ntrain to ntrain+nvalid of all.y
    self.test.x matrix - first ntest rows of the separate 'test_batch' file
    self.test.y vector - first ntest labels of the separate 'test_batch' file

    Note: labels are stored with the same `dtype` as the images.

    """

    # Number of examples stored in each 'data_batch_N' file, as assumed by
    # the row offsets used when filling `x` and `y` below.
    _batch_rows = 10000

    @staticmethod
    def _num_batches(nrows, batch_rows=10000):
        """Return how many whole batches are needed to hold `nrows` rows.

        The result is a plain Python int (ceiling division) so that it can
        be used directly as an array dimension and as a `range` bound.
        """
        return int(-(-int(nrows) // batch_rows))

    def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
        """Load the requested number of train/valid/test examples.

        :param dtype: numpy dtype used for the image and label arrays.
        :param ntrain: number of training examples (taken first).
        :param nvalid: number of validation examples (taken right after the
            training examples); ntrain + nvalid must not exceed 50000.
        :param ntest: number of test examples, at most 10000.
        :raises AssertionError: if the requested sizes exceed the limits above.
        """
        assert ntrain + nvalid <= 50000
        assert ntest <= 10000

        self.img_shape = (3,32,32)
        self.img_size = numpy.prod(self.img_shape)
        self.n_classes = 10

        # Only whole batch files are loaded, so the buffers are sized to the
        # next multiple of the batch size.  The count must be an integer:
        # a float shape (as numpy.ceil would give) is rejected by numpy.zeros.
        batch_rows = self._batch_rows
        n_batches = self._num_batches(ntrain + nvalid, batch_rows)
        lenx = n_batches * batch_rows
        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
        y = numpy.zeros(lenx, dtype=dtype)

        # load train and validation data
        # Exactly `n_batches` files are read, so a batch is never assigned
        # to rows beyond the end of the buffers (the test-set size plays no
        # part in how many training batches are needed).
        for i in range(n_batches):
            data = unpickle('data_batch_%i' % (i + 1))
            start = i * batch_rows
            stop = start + batch_rows
            x[start:stop, :] = data['data']
            y[start:stop] = data['labels']

        self.all = Dataset.Obj(x=x, y=y)

        self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
        self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
                                 y=y[ntrain:ntrain+nvalid])

        # load test data
        # Converted to arrays of the requested dtype so that the test split
        # has the same container type and dtype as train/valid.
        data = unpickle('test_batch')
        self.test = Dataset.Obj(
                x=numpy.asarray(data['data'][0:ntest], dtype=dtype),
                y=numpy.asarray(data['labels'][0:ntest], dtype=dtype))

    def preprocess(self, x):
        """Return `x` divided by 255 as float64 (multiplied by 1.0 first so
        that integer input is promoted to floating point before dividing)."""
        return numpy.float64( x *1.0 / 255.0)
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
88 |
553bf0861fb5
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and
desjagui@opale.iro.umontreal.ca
parents:
diff
changeset
|
def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
    """Build a small `cifar10` instance.

    Same arguments as the `cifar10` constructor, but with defaults of
    1000 training, 200 validation and 200 test examples.
    """
    small = cifar10(dtype=dtype, ntrain=ntrain, nvalid=nvalid, ntest=ntest)
    return small
951
5d70dfc70ec0
added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
936
diff
changeset
|
91 |
1281
5ae77ac21609
extended cifar10.tile_rasterized_examples to work for patches too
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
951
diff
changeset
|
def tile_rasterized_examples(X, img_shape=(32,32)):
    """Tile rasterized colour examples into a single image array.

    Each row of `X` is split into three consecutive, equally sized planes
    (taken here as red, green, blue) of img_shape[0]*img_shape[1] values
    each, and the planes are handed to
    `pylearn.io.image_tiling.tile_raster_images` with no fourth channel.

    Meant for the `x` matrices of the cifar dataset, or for weight matrices
    (filters) applied to them; `img_shape` may be smaller than (32,32) when
    the rows are patches.

    :returns: the value returned by `tile_raster_images`.
    """
    plane_len = img_shape[0] * img_shape[1]
    assert plane_len * 3 == X.shape[1], (plane_len, X.shape)

    as_float = X.astype('float32')
    red = as_float[:, :plane_len]
    green = as_float[:, plane_len:plane_len * 2]
    blue = as_float[:, plane_len * 2:]

    # Imported only once the shape check has passed, as in the original.
    from pylearn.io.image_tiling import tile_raster_images
    return tile_raster_images((red, green, blue, None), img_shape=img_shape)
5d70dfc70ec0
added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
936
diff
changeset
|
107 |
5d70dfc70ec0
added comments and image-rendering code to cifar-10
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
936
diff
changeset
|
108 |