diff datasets/smallNorb.py @ 505:74b3e65f5f24

added smallNorb dataset, switched to PYLEARN_DATA_ROOT
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 29 Oct 2008 17:09:04 -0400
parents
children eda3d576ee97
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/smallNorb.py	Wed Oct 29 17:09:04 2008 -0400
@@ -0,0 +1,102 @@
+import os
+import numpy
+from ..filetensor import read
+from .config import data_root
+
+#Path = '/u/bergstrj/pub/data/smallnorb'
+#Path = '/home/fringant2/lisa/louradoj/data/smallnorb'
+#Path = '/home/louradou/data/norb'
+
+class Paths(object):
+    """File-related operations on smallNorb
+    """
+    def __init__(self):
+        smallnorb = [data_root(), 'smallnorb']
+        self.train_dat = os.path.join(*\
+                smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat'])
+        self.test_dat = os.path.join(*\
+                smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat'])
+        self.train_cat = os.path.join(*\
+                smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat'])
+        self.test_cat = os.path.join(*\
+                smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat'])
+        self.train_info = os.path.join(*\
+                smallnorb + ['smallnorb-5x46789x9x18x6x2x96x96-training-info.mat'])
+        self.test_info = os.path.join(*\
+                smallnorb + ['smallnorb-5x01235x9x18x6x2x96x96-testing-info.mat'])
+
+    def load_append_train_test(self, normalize_pixels=True, downsample_amt=1, dtype='float64'):
+
+        def downsample(dataset):
+            return dataset[:, 0, ::downsample_amt, ::downsample_amt]
+        samples = downsample(read(open(self.train_dat)))
+        samples = numpy.vstack((samples, downsample(read(open(self.test_dat)))))
+        samples = numpy.asarray(samples, dtype=dtype)
+        if normalize_pixels:
+            samples *= (1.0 / 255.0)
+
+        labels = read(open(self.train_cat))
+        labels = numpy.hstack((labels, read(open(self.test_cat))))
+
+        infos = read(open(self.train_info))
+        infos = numpy.vstack((infos, read(open(self.test_info))))
+
+        return samples, labels, infos
+    
+def smallnorb_iid(ntrain=29160, nvalid=9720, ntest=9720, dtype='float64', normalize_pixels=True):
+    """Variation of the smallNorb task in which we randomly shuffle all the object instances
+    together before dividing into train/valid/test.
+
+    The default train/valid/test sizes correspond to 60/20/20 split of the entire dataset.
+
+    :returns: 5, (train_x, train_labels), (valid_x, valid_labels), (test_x, test_labels) 
+
+    """
+    # cut from /u/louradoj/theano/hpu/expcode1.py
+    rng = numpy.random.RandomState(1)        
+    samples, labels, infos = Paths().load_append_train_test(downsample_amt=3, dtype=dtype, normalize_pixels=normalize_pixels)
+
+    nsamples = samples.shape[0]
+    if ntrain + nvalid + ntest > nsamples:
+        raise Exception("ntrain+nvalid+ntest exceeds number of samples (%i)" % nsamples, 
+                (ntrain, nvalid, ntest))
+    i0 = 0
+    i1 = ntrain
+    i2 = ntrain + nvalid
+    i3 = ntrain + nvalid + ntest
+
+    indices = rng.permutation(nsamples)
+    train_rows = indices[i0:i1]
+    valid_rows = indices[i1:i2]
+    test_rows = indices[i2:i3]
+
+    n_labels = 5
+
+    def _pick_rows(rows):
+        a = numpy.array([ samples[i].flatten() for i in train_rows])
+        b = numpy.array([ [labels[i]] for i in train_rows])
+        return a, b
+
+    return n_labels, [_pick_rows(r) for r in (train_rows, valid_rows, test_rows)]
+
+def smallnorb_azSplit():
+    # cut from /u/louradoj/theano/hpu/expcode1.py
+    # WARNING NOT NECESSARILY WORKING CODE
+
+    samples, labels, infos = _load_append_train_test()
+    train_rows, valid_rows, test_rows = [], [], []
+    train_rows_azimuth = []
+    for instance in range(10):
+        az_min = 4*instance
+        az_max = 4*instance + 18
+        train_rows_azimuth.append( [a % 36 for a in range(az_min,az_max,2)] )
+    #print "train_rows_azimuth", train_rows_azimuth
+    for i, info in enumerate(infos):
+        if info[2] in train_rows_azimuth[info[0]]:
+            train_rows.append(i)
+        elif info[2] / 2 % 2 == 0:
+            test_rows.append(i)
+        else:
+            valid_rows.append(i)
+
+    return [_pick_rows(samples, labels, r) for r in (train_rows, valid_rows, test_rows)]