# view datasets/smallNorb.py @ 534:eaa5ad4089a1
#
# Another bugfix in pylearn.embeddings.length()
# author Joseph Turian <turian@gmail.com>
# date Tue, 18 Nov 2008 03:49:37 -0500
# parents 60b7dd5be860
# children
# line wrap: on
# line source

import os
import numpy
from ..filetensor import read
from .config import data_root

#Path = '/u/bergstrj/pub/data/smallnorb'
#Path = '/home/fringant2/lisa/louradoj/data/smallnorb'
#Path = '/home/louradou/data/norb'

class Paths(object):
    """Locations of the smallNorb data files, plus a loader for them.

    Every attribute is a path of the form
    ``<data_root()>/smallnorb/<file name>``; there is one ``dat``, one
    ``cat`` and one ``info`` file for each of the training and testing sets.
    """
    def __init__(self):
        base = os.path.join(data_root(), 'smallnorb')
        train_stem = 'smallnorb-5x46789x9x18x6x2x96x96-training-'
        test_stem = 'smallnorb-5x01235x9x18x6x2x96x96-testing-'

        self.train_dat = os.path.join(base, train_stem + 'dat.mat')
        self.test_dat = os.path.join(base, test_stem + 'dat.mat')
        self.train_cat = os.path.join(base, train_stem + 'cat.mat')
        self.test_cat = os.path.join(base, test_stem + 'cat.mat')
        self.train_info = os.path.join(base, train_stem + 'info.mat')
        self.test_info = os.path.join(base, test_stem + 'info.mat')

    def load_append_train_test(self, normalize_pixels=True, downsample_amt=1, dtype='uint8'):
        """Load the smallNorb training and testing files and concatenate them.

        The training data always comes first in each returned array.

        :param normalize_pixels: when True, multiply the samples in place by
            1/255.  This only makes sense in conjunction with dtype=float32
            or dtype=float64.
        :param downsample_amt: stride applied to the last two axes of the
            ``dat`` arrays (1 keeps every pixel).
        :param dtype: numpy dtype the samples are converted to.

        :returns: (samples, labels, infos).  `samples` keeps only index 0 of
            the second axis of the ``dat`` arrays (presumably the first image
            of each stereo pair -- confirm against the file format).

        """
        def load(path):
            # The files hold binary tensors, so open them in binary mode, and
            # close the handle explicitly instead of leaking it.
            # NOTE(review): assumes read() returns a fully materialized array
            # that does not need the file to stay open -- confirm.
            f = open(path, 'rb')
            try:
                return read(f)
            finally:
                f.close()

        def downsample(dataset):
            return dataset[:, 0, ::downsample_amt, ::downsample_amt]

        samples = numpy.vstack((downsample(load(self.train_dat)),
                                downsample(load(self.test_dat))))
        samples = numpy.asarray(samples, dtype=dtype)
        if normalize_pixels:
            # NOTE(review): this scaling is in place, so with an integer dtype
            # (including the default 'uint8') the scaled values cannot be
            # stored; callers should pass a float dtype when normalizing.
            samples *= (1.0 / 255.0)

        labels = numpy.hstack((load(self.train_cat), load(self.test_cat)))
        infos = numpy.vstack((load(self.train_info), load(self.test_info)))

        return samples, labels, infos
    
def smallnorb_iid(ntrain=29160, nvalid=9720, ntest=9720, dtype='float64', normalize_pixels=True):
    """Variation of the smallNorb task in which we randomly shuffle all the object instances
    together before dividing into train/valid/test.

    The default train/valid/test sizes correspond to 60/20/20 split of the entire dataset.

    The shuffle uses a RandomState seeded with 1, so the same split is produced
    on every call.  Images are loaded with a downsampling stride of 3 and each
    one is flattened into a single row.

    :param ntrain: number of rows in the training split
    :param nvalid: number of rows in the validation split
    :param ntest: number of rows in the test split
    :param dtype: numpy dtype of the returned samples
    :param normalize_pixels: forwarded to Paths.load_append_train_test

    :returns: [(train_x, train_labels), (valid_x, valid_labels), (test_x, test_labels)]

    :raises ValueError: if ntrain + nvalid + ntest exceeds the number of samples

    """
    # cut from /u/louradoj/theano/hpu/expcode1.py
    rng = numpy.random.RandomState(1)
    samples, labels, _ = Paths().load_append_train_test(
            downsample_amt=3, dtype=dtype, normalize_pixels=normalize_pixels)

    nsamples = samples.shape[0]
    if ntrain + nvalid + ntest > nsamples:
        # ValueError is an Exception subclass, so existing handlers still match.
        raise ValueError("ntrain+nvalid+ntest exceeds number of samples (%i)" % nsamples,
                (ntrain, nvalid, ntest))

    indices = rng.permutation(nsamples)
    train_end = ntrain
    valid_end = train_end + nvalid
    test_end = valid_end + ntest
    splits = (indices[:train_end],
              indices[train_end:valid_end],
              indices[valid_end:test_end])

    # Explicit row length, so that an empty split still reshapes to (0, n_features).
    n_features = int(numpy.prod(samples.shape[1:]))

    def _pick_rows(rows):
        # Integer-array indexing copies the selected rows in one step.
        x = samples[rows].reshape((len(rows), n_features))
        y = labels[rows]
        return x, y

    return [_pick_rows(rows) for rows in splits]

def smallnorb_azSplit(dtype='float64', normalize_pixels=True, downsample_amt=3):
    """Split smallNorb into train/valid/test according to the info columns.

    For instance k (0..9) the training set takes the rows whose info[2] value
    is one of (4*k + 2*j) % 36 for j in 0..8.  Every other row goes to the test
    set when info[2] // 2 is even, and to the validation set otherwise.
    (info[0] is used as the instance index and info[2] presumably holds the
    azimuth -- confirm against the dataset description.)

    NOTE(review): the original body called an undefined loader and an
    undefined row picker, so the intended loading options are unknown.  The
    defaults here copy what smallnorb_iid uses -- confirm before relying on
    them.

    :param dtype: numpy dtype of the returned samples
    :param normalize_pixels: forwarded to Paths.load_append_train_test
    :param downsample_amt: forwarded to Paths.load_append_train_test

    :returns: [(train_x, train_labels), (valid_x, valid_labels), (test_x, test_labels)]
        with every image flattened into one row.

    """
    # cut from /u/louradoj/theano/hpu/expcode1.py
    # WARNING NOT NECESSARILY WORKING CODE

    samples, labels, infos = Paths().load_append_train_test(
            downsample_amt=downsample_amt, dtype=dtype,
            normalize_pixels=normalize_pixels)

    train_rows_azimuth = [
            [a % 36 for a in range(4 * instance, 4 * instance + 18, 2)]
            for instance in range(10)]

    train_rows, valid_rows, test_rows = [], [], []
    for i, info in enumerate(infos):
        if info[2] in train_rows_azimuth[info[0]]:
            train_rows.append(i)
        # Floor division keeps the rule independent of the Python version's
        # '/' semantics.
        elif info[2] // 2 % 2 == 0:
            test_rows.append(i)
        else:
            valid_rows.append(i)

    # Explicit row length, so that an empty split still reshapes to (0, n_features).
    n_features = int(numpy.prod(samples.shape[1:]))

    def _pick_rows(rows):
        # An explicit integer dtype keeps an empty row list usable as an index.
        idx = numpy.asarray(rows, dtype=int)
        return samples[idx].reshape((len(rows), n_features)), labels[idx]

    return [_pick_rows(r) for r in (train_rows, valid_rows, test_rows)]