view pylearn/datasets/utlc.py @ 1477:48efafaaf7fb

Add function for loading the transfer labels of utlc
author Pascal Lamblin <lamblinp@iro.umontreal.ca>
date Sat, 21 May 2011 01:03:10 -0400

"""
Users should use the load_ndarray_dataset or load_sparse_dataset function.

See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.

See the end of this file for an example.
"""

import cPickle
import gzip
import os

import numpy
import theano

import pylearn.io.filetensor as ft
import config

def load_ndarray_dataset(name, normalize=True, transfer=False,
                         normalize_on_the_fly=False, randomize_valid=False,
                         randomize_test=False):
    """ Load the train,valid,test data for the dataset `name`
        and return it in ndarray format.

        We suppose the data was created with ift6266h11/pretraitement/to_npy.py that
        shuffle the train. So the train should already be shuffled.

    :param normalize: If True, we normalize the train dataset
                      before returning it
    :param transfer: If True also return the transfer labels
    :param normalize_on_the_fly: If True, we return a Theano Variable that will give
                                 as output the normalized value. If the user only
                                 take a subtensor of that variable, Theano optimization
                                 should make that we will only have in memory the subtensor
                                 portion that is computed in normalized form. We store
                                 the original data in shared memory in its original dtype.

                                 This is usefull to have the original data in its original
                                 dtype in memory to same memory. Especialy usefull to
                                 be able to use rita and harry with 1G per jobs.
    :param randomize_valid: Do we randomize the order of the valid set?
                            We always use the same random order
                            If False, return in the same order as downloaded on the web
    :param randomize_test: Do we randomize the order of the test set?
                           We always use the same random order
                           If False, return in the same order as downloaded on the web
    """
    assert not (normalize and normalize_on_the_fly), "Can't normalize in two ways at the same time!"

    assert name in ['avicenna','harry','rita','sylvester','ule']
    common = os.path.join('UTLC','filetensor',name+'_')
    trname,vname,tename = [config.get_filepath_in_roots(common+subset+'.ft.gz',
                                                        common+subset+'.ft')
                           for subset in ['train','valid','test']]

    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)
    if randomize_valid:
        rng = numpy.random.RandomState([1,2,3,4])
        perm = rng.permutation(valid.shape[0])
        valid = valid[perm]
    if randomize_test:
        rng = numpy.random.RandomState([1,2,3,4])
        perm = rng.permutation(test.shape[0])
        test = test[perm]

    if normalize or normalize_on_the_fly:
        if normalize_on_the_fly:
            # Shared variables of the original type
            train = theano.shared(train, borrow=True, name=name+"_train")
            valid = theano.shared(valid, borrow=True, name=name+"_valid")
            test = theano.shared(test, borrow=True, name=name+"_test")
            # Symbolic variables cast into floatX
            train = theano.tensor.cast(train, theano.config.floatX)
            valid = theano.tensor.cast(valid, theano.config.floatX)
            test = theano.tensor.cast(test, theano.config.floatX)
        else:
            train = numpy.asarray(train, theano.config.floatX)
            valid = numpy.asarray(valid, theano.config.floatX)
            test = numpy.asarray(test, theano.config.floatX)

        if name == "ule":
            train /= 255
            valid /= 255
            test /= 255
        elif name in ["avicenna", "sylvester"]:
            if name == "avicenna":
                train_mean = 514.62154022835455
                train_std = 6.829096494224145
            else:
                train_mean = 403.81889927027686
                train_std = 96.43841050784053
            train -= train_mean
            valid -= train_mean
            test -= train_mean
            train /= train_std
            valid /= train_std
            test /= train_std
        elif name == "harry":
            std = 0.69336046033925791  # train.std(), slow to compute
            train /= std
            valid /= std
            test /= std
        elif name == "rita":
            v = numpy.asarray(230, dtype=theano.config.floatX)
            train /= v
            valid /= v
            test /= v
        else:
            raise Exception("This dataset don't have its normalization defined")
    if transfer:
        transfer = load_ndarray_transfer(name)
        return train, valid, test, transfer
    else:
        return train, valid, test
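# A hedged usage sketch (not part of the original file; the dataset name and
# slice size below are illustrative).  With normalize_on_the_fly=True the
# returned values are symbolic Theano variables, so a function must be
# compiled to obtain actual numbers, and only the requested subtensor is
# materialized in floatX:
#
#   train, valid, test = load_ndarray_dataset('rita', normalize=False,
#                                             normalize_on_the_fly=True)
#   get_batch = theano.function([], train[0:100])
#   batch = get_batch()  # only rows 0..99 are computed in normalized form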

def load_sparse_dataset(name, normalize=True, transfer=False,
                        randomize_valid=False,
                        randomize_test=False):
    """ Load the train,valid,test data for the dataset `name`
        and return it in sparse format.

        We suppose the data was created with ift6266h11/pretraitement/to_npy.py that
        shuffle the train. So the train should already be shuffled.

    :param normalize: If True, we normalize the train dataset
                      before returning it
    :param transfer: If True also return the transfer label
    :param randomize_valid: see same option for load_ndarray_dataset
    :param randomize_test: see same option for load_ndarray_dataset

    """
    assert name in ['harry','terry','ule']
    common = os.path.join('UTLC','sparse',name+'_')
    trname,vname,tename = [config.get_filepath_in_roots(common+subset+'.npy.gz',
                                                        common+subset+'.npy')
                           for subset in ['train','valid','test']]
    train = load_sparse(trname)
    valid = load_sparse(vname)
    test = load_sparse(tename)

    # The data should already be in CSR format, which supports
    # this type of indexing.
    if randomize_valid:
        rng = numpy.random.RandomState([1,2,3,4])
        perm = rng.permutation(valid.shape[0])
        valid = valid[perm]
    if randomize_test:
        rng = numpy.random.RandomState([1,2,3,4])
        perm = rng.permutation(test.shape[0])
        test = test[perm]

    if normalize:
        if name == "ule":
            train = train.astype(theano.config.floatX) / 255
            valid = valid.astype(theano.config.floatX) / 255
            test = test.astype(theano.config.floatX) / 255
        elif name == "harry":
            train = train.astype(theano.config.floatX)
            valid = valid.astype(theano.config.floatX)
            test = test.astype(theano.config.floatX)
            std = 0.69336046033925791  # train.std(), slow to compute
            train = train / std
            valid = valid / std
            test = test / std
        elif name == "terry":
            train = train.astype(theano.config.floatX)
            valid = valid.astype(theano.config.floatX)
            test = test.astype(theano.config.floatX)
            train = train / 300
            valid = valid / 300
            test = test / 300
        else:
            raise Exception("This dataset don't have its normalization defined")
    if transfer:
        transfer = load_filetensor(os.path.join(config.data_root(),"UTLC","filetensor",name+"_transfer.ft"))
        return train, valid, test, transfer
    else:
        return train, valid, test
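# A hedged usage sketch (not part of the original file; the dataset name is
# illustrative).  The returned objects are scipy.sparse matrices in CSR
# format, so row slicing works directly:
#
#   train, valid, test = load_sparse_dataset('terry', normalize=True)
#   first_rows = train[0:10]           # CSR supports row indexing
#   dense_rows = first_rows.toarray()  # densify only a small slice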

def load_ndarray_transfer(name):
    """
    Load the transfer labels for the training set of data set `name`.

    It will be returned in ndarray format.
    """
    assert name in ['avicenna','harry','rita','sylvester','terry','ule']
    transfer = load_filetensor(os.path.join(config.data_root(), 'UTLC',
        'filetensor', name+'_transfer.ft'))
    return transfer
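# A hedged usage sketch (not part of the original file): the transfer labels
# are aligned row-for-row with the training set, as checked in the test
# block at the end of this file.
#
#   transfer = load_ndarray_transfer('ule')
#   train, valid, test = load_ndarray_dataset('ule')
#   assert train.shape[0] == transfer.shape[0]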

def load_ndarray_label(name):
    """ Load the train,valid,test data for the dataset `name`
        and return it in ndarray format.

        This is only available for the toy dataset ule.
    """
    assert name in ['ule']
    trname,vname,tename = [os.path.join(config.data_root(),
                                        'UTLC','filetensor',
                                        name+'_'+subset+'.ft')
                           for subset in ['trainl','validl','testl']]
    trainl = load_filetensor(trname)
    validl = load_filetensor(vname)
    testl = load_filetensor(tename)
    return trainl, validl, testl

def load_filetensor(fname):
    """Read a filetensor file, falling back to a gzipped version
    (fname + '.gz') if `fname` does not exist."""
    f = None
    try:
        if not os.path.exists(fname):
            # Fall back to the gzipped version of the file
            fname = fname+'.gz'
            assert os.path.exists(fname)
            f = gzip.open(fname)
            f = gzip.open(fname)
        elif fname.endswith('.gz'):
            f = gzip.open(fname)
        else:
            f = open(fname)
        d = ft.read(f)
    finally:
        if f:
            f.close()

    return d

def load_sparse(fname):
    """Unpickle a scipy.sparse matrix, falling back to a gzipped version
    (fname + '.gz') if `fname` does not exist."""
    f = None
    try:
        if not os.path.exists(fname):
            # Fall back to the gzipped version of the file
            fname = fname+'.gz'
            assert os.path.exists(fname)
            f = gzip.open(fname)
            f = gzip.open(fname)
        elif fname.endswith('.gz'):
            f = gzip.open(fname)
        else:
            f = open(fname)
        d = cPickle.load(f)
    finally:
        if f:
            f.close()
    return d

if __name__ == '__main__':
    import numpy
    import scipy.sparse

    # Test loading of transfer data
    train, valid, test, transfer = load_ndarray_dataset("ule", normalize=True, transfer=True)
    assert train.shape[0]==transfer.shape[0]

    for name in ['avicenna','harry','rita','sylvester','ule']:
        train, valid, test = load_ndarray_dataset(name, normalize=True)
        print name,"dtype, max, min, mean, std"
        print train.dtype, train.max(), train.min(), train.mean(), train.std()
        assert isinstance(train, numpy.ndarray)
        assert isinstance(valid, numpy.ndarray)
        assert isinstance(test, numpy.ndarray)
        assert train.shape[1]==test.shape[1]==valid.shape[1]

    # Test loading of transfer data
    train, valid, test, transfer = load_sparse_dataset("ule", normalize=True, transfer=True)
    assert train.shape[0]==transfer.shape[0]

    for name in ['harry','terry','ule']:
        train, valid, test = load_sparse_dataset(name, normalize=True)
        nb_elem = numpy.prod(train.shape)
        mi = train.data.min()
        ma = train.data.max()
        mi = min(0, mi)
        ma = max(0, ma)
        su = train.data.sum()
        mean = float(su)/nb_elem
        print name,"dtype, max, min, mean, nb non-zero, nb element, %sparse"
        print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem-float(train.nnz))/nb_elem
        print name,"max, min, mean, std (all stats on non-zero element)"
        print train.data.max(), train.data.min(), train.data.mean(), train.data.std()
        assert scipy.sparse.issparse(train)
        assert scipy.sparse.issparse(valid)
        assert scipy.sparse.issparse(test)
        assert train.shape[1]==test.shape[1]==valid.shape[1]