view pylearn/datasets/utlc.py @ 1426:4988f8ea0836

in utlc.py, implement a parameter that returns a Theano variable representing the normalized data. Useful for on-the-fly normalization.
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 08 Feb 2011 12:33:33 -0500
parents 68fdb895f53f
children a36d3a406c59
line wrap: on
line source

""" 
Users should use the load_ndarray_dataset or load_sparse_dataset functions.

See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.

See the end of this file for an example.
"""

import cPickle
import gzip
import os

import numpy
import theano

import pylearn.io.filetensor as ft
import config

def load_ndarray_dataset(name, normalize=True, transfer=False, normalize_on_the_fly=False):
    """ Load the train, valid and test data for the dataset `name`
        and return it in ndarray format.

    :param name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'
    :param normalize: If True, we normalize the train dataset
                      before returning it
    :param transfer: If True also return the transfer label
    :param normalize_on_the_fly: If True, we return a Theano Variable that will give
                                 as output the normalized value. If the user only
                                 takes a subtensor of that variable, Theano optimization
                                 should make it so that only the requested subtensor
                                 portion is computed in normalized form. We store
                                 the original data in shared memory in its original dtype.

                                 This is useful to keep the original data in its original
                                 dtype in memory to save memory. Especially useful to
                                 be able to use rita and harry with 1G per job.
    :return: (train, valid, test) or (train, valid, test, transfer) when
             `transfer` is True.  Elements are numpy ndarrays, or Theano
             variables when `normalize_on_the_fly` is True.
    """
    # The two normalization modes are mutually exclusive: callers must pass
    # normalize=False to request on-the-fly normalization.
    assert not (normalize and normalize_on_the_fly), "Can't normalize in 2 way at the same time!"
    
    assert name in ['avicenna','harry','rita','sylvester','ule']
    # Each split may exist compressed (.ft.gz) or uncompressed (.ft) under
    # any of the configured data roots; config picks the first that exists.
    common = os.path.join('UTLC','filetensor',name+'_')
    trname,vname,tename = [config.get_filepath_in_roots(common+subset+'.ft.gz',
                                                        common+subset+'.ft')
                           for subset in ['train','valid','test']]

    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)

    if normalize or normalize_on_the_fly:
        if normalize_on_the_fly:
            # Wrap the raw data (original dtype) in shared variables.  The
            # augmented assignments below then build symbolic expressions
            # (rebinding the locals) instead of mutating arrays in place.
            train = theano.shared(train, borrow=True, name=name+"_train")
            valid = theano.shared(valid, borrow=True, name=name+"_valid")
            test = theano.shared(test, borrow=True, name=name+"_test")
        else:
            # Cast up-front so the in-place arithmetic below happens in the
            # configured float dtype.
            train = numpy.asarray(train, theano.config.floatX)
            valid = numpy.asarray(valid, theano.config.floatX)
            test = numpy.asarray(test, theano.config.floatX)

        # Per-dataset normalization.  All statistics below were precomputed
        # on the *train* split and are applied to all three splits.
        if name == "ule":
            # Pixel data in [0, 255] scaled to [0, 1].
            train /= 255
            valid /= 255
            test /= 255
        elif name in ["avicenna", "sylvester"]:
            # Standardize with precomputed train mean/std.
            if name == "avicenna":
                train_mean = 514.62154022835455
                train_std = 6.829096494224145
            else:
                train_mean = 403.81889927027686
                train_std = 96.43841050784053
            train -= train_mean
            valid -= train_mean
            test -= train_mean
            train /= train_std
            valid /= train_std
            test /= train_std
        elif name == "harry":
            # Scale by the precomputed train std (no centering).
            std = 0.69336046033925791#train.std()slow to compute
            train /= std
            valid /= std
            test /= std  
        elif name == "rita":
            # Scale by a fixed constant (approximately the data max).
            v = numpy.asarray(230, dtype=theano.config.floatX)
            train /= v
            valid /= v
            test /= v
        else:
            raise Exception("This dataset don't have its normalization defined")
    if transfer:
        transfer = load_filetensor(os.path.join(config.data_root(),"UTLC","filetensor",name+"_transfer.ft"))
        return train, valid, test, transfer
    else:
        return train, valid, test

def load_sparse_dataset(name, normalize=True, transfer=False):
    """ Load the train, valid and test splits of the dataset `name`
        and return them in scipy sparse format.
    
    :param normalize: If True, we normalize the train dataset
                      before returning it
    :param transfer: If True also return the transfer label
    """
    assert name in ['harry','terry','ule']
    paths = [os.path.join(config.data_root(), 'UTLC', 'sparse',
                          name+'_'+subset+'.npy')
             for subset in ['train', 'valid', 'test']]
    train, valid, test = [load_sparse(p) for p in paths]
    if normalize:
        # Every supported dataset is normalized by a single scalar divisor,
        # applied after casting to the configured float dtype.
        if name == "ule":
            divisor = 255
        elif name == "harry":
            divisor = 0.69336046033925791#train.std()slow to compute
        elif name == "terry":
            divisor = 300
        else:
            raise Exception("This dataset don't have its normalization defined")
        train = train.astype(theano.config.floatX) / divisor
        valid = valid.astype(theano.config.floatX) / divisor
        test = test.astype(theano.config.floatX) / divisor
    if transfer:
        transfer = load_sparse(os.path.join(config.data_root(), "UTLC",
                                            "sparse", name+"_transfer.npy"))
        return train, valid, test, transfer
    return train, valid, test
    
def load_ndarray_label(name):
    """ Load the train, valid and test labels of dataset `name`
        and return them in ndarray format.
    
        Labels are only available for the toy dataset 'ule'.
    """
    assert name in ['ule']
    base = os.path.join(config.data_root(), 'UTLC', 'filetensor')
    labels = [load_filetensor(os.path.join(base, name+'_'+subset+'.ft'))
              for subset in ['trainl', 'validl', 'testl']]
    return tuple(labels)

def load_filetensor(fname):
    """ Read one filetensor file and return its content.

    If `fname` does not exist, `fname + '.gz'` is tried instead.  Files
    ending in '.gz' are transparently decompressed.

    :param fname: path to a '.ft' or '.ft.gz' filetensor file
    :return: the data read by pylearn.io.filetensor.read
    """
    if not os.path.exists(fname):
        # Fall back to the gzip-compressed variant of the same file.
        fname = fname + '.gz'
        assert os.path.exists(fname)
    f = None
    try:
        if fname.endswith('.gz'):
            f = gzip.open(fname)
        else:
            # Filetensor files are binary: open in binary mode so reads are
            # not corrupted on platforms that translate line endings.
            f = open(fname, 'rb')
        d = ft.read(f)
    finally:
        if f:
            f.close()

    return d

def load_sparse(fname):
    """ Load a pickled object (a scipy sparse matrix) from `fname`.

    If `fname` does not exist, `fname + '.gz'` is tried instead.  Files
    ending in '.gz' are transparently decompressed (consistent with
    load_filetensor).

    :param fname: path to a pickle file, possibly gzip-compressed
    :return: the unpickled object
    """
    if not os.path.exists(fname):
        # Fall back to the gzip-compressed variant of the same file.
        fname = fname + '.gz'
        assert os.path.exists(fname)
    f = None
    try:
        if fname.endswith('.gz'):
            f = gzip.open(fname)
        else:
            # Pickles are binary: open in binary mode for portability.
            f = open(fname, 'rb')
        # NOTE(review): unpickling executes arbitrary code if the file is
        # untrusted; these dataset files are assumed to come from a trusted
        # data root.
        d = cPickle.load(f)
    finally:
        if f:
            f.close()
    return d

if __name__ == '__main__':
    # Smoke test: load every dataset and print basic statistics so a human
    # can eyeball that normalization produced sane values.  Requires the
    # UTLC data to be present under the configured data root.
    import numpy
    import scipy.sparse

    # Test loading of transfer data
    train, valid, test, transfer = load_ndarray_dataset("ule", normalize=True, transfer=True)
    assert train.shape[0]==transfer.shape[0]

    for name in ['avicenna','harry','rita','sylvester','ule']:
        train, valid, test = load_ndarray_dataset(name, normalize=True)
        print name,"dtype, max, min, mean, std"
        print train.dtype, train.max(), train.min(), train.mean(), train.std()
        assert isinstance(train, numpy.ndarray)
        assert isinstance(valid, numpy.ndarray)
        assert isinstance(test, numpy.ndarray)
        # All splits must share the same feature dimensionality.
        assert train.shape[1]==test.shape[1]==valid.shape[1]

    # Test loading of transfer data
    train, valid, test, transfer = load_sparse_dataset("ule", normalize=True, transfer=True)
    assert train.shape[0]==transfer.shape[0]

    for name in ['harry','terry','ule']:
        train, valid, test = load_sparse_dataset(name, normalize=True)
        nb_elem = numpy.prod(train.shape)
        # .data holds only the explicitly stored (non-zero) entries, so the
        # global min/max must be widened to include the implicit zeros.
        mi = train.data.min()
        ma = train.data.max()
        mi = min(0, mi)
        ma = max(0, ma)
        su = train.data.sum()
        # Mean over *all* elements, counting the implicit zeros.
        mean = float(su)/nb_elem
        print name,"dtype, max, min, mean, nb non-zero, nb element, %sparse"
        print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem-float(train.nnz))/nb_elem
        print name,"max, min, mean, std (all stats on non-zero element)"
        print train.data.max(), train.data.min(), train.data.mean(), train.data.std()
        assert scipy.sparse.issparse(train)
        assert scipy.sparse.issparse(valid)
        assert scipy.sparse.issparse(test)
        assert train.shape[1]==test.shape[1]==valid.shape[1]