view pylearn/datasets/utlc.py @ 1408:2993b2a5c1af

allow to load UTLC transfer label data.
author Frederic Bastien <nouiz@nouiz.org>
date Fri, 28 Jan 2011 11:00:11 -0500
parents 6003f733a994
children e7844692e6e2
line wrap: on
line source

""" 
Users should call the load_ndarray_dataset or load_sparse_dataset function.

See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.

See the end of this file for an example.
"""

import cPickle
import gzip
import os

import numpy
import theano

import pylearn.io.filetensor as ft
import config

def load_ndarray_dataset(name, normalize=True, transfer=False):
    """ Load the train, valid and test subsets of dataset `name` and
        return them in ndarray format.

    Every subset is read with `load_filetensor` from
    ``<config.data_root()>/UTLC/filetensor/<name>_<subset>.ft``.

    :param name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'
    :param normalize: If True, the three subsets are cast to a floating
                      point dtype and rescaled before being returned:

                      - ule: divided by 255
                      - avicenna, sylvester: the mean of the train subset
                        is subtracted, then divided by its std
                      - harry: divided by a hard-coded constant
                      - rita: divided by the max of the train subset

                      The statistics always come from the train subset.
    :param transfer: If True also return the transfer label
                     (read from ``<name>_transfer.ft``)
    :return: ``(train, valid, test)``, or
             ``(train, valid, test, transfer labels)`` when `transfer`
             is true
    """
    assert name in ['avicenna','harry','rita','sylvester','ule']

    def subset_path(subset):
        # <data_root>/UTLC/filetensor/<name>_<subset>.ft
        return os.path.join(config.data_root(), 'UTLC', 'filetensor',
                            name + '_' + subset + '.ft')

    train_path = subset_path('train')
    valid_path = subset_path('valid')
    test_path = subset_path('test')
    train = load_filetensor(train_path)
    valid = load_filetensor(valid_path)
    test = load_filetensor(test_path)

    if normalize:
        # Phase 1: pick the floating point dtype and cast the subsets.
        if name in ("harry", "rita"):
            # float32 is forced, as otherwise the data is too big to be
            # kept completely in memory
            dtype = "float32"
        elif name in ("ule", "avicenna", "sylvester"):
            dtype = theano.config.floatX
        else:
            raise Exception("This dataset don't have its normalization defined")
        train = numpy.asarray(train, dtype)
        valid = numpy.asarray(valid, dtype)
        test = numpy.asarray(test, dtype)

        # Phase 2: rescale with the rule attached to this dataset.
        if name == "ule":
            train = train / 255
            valid = valid / 255
            test = test / 255
        elif name == "harry":
            # hard-coded in place of train.std(), which is slow to compute
            spread = 0.69336046033925791
            train = train / spread
            valid = valid / spread
            test = test / spread
        elif name == "rita":
            peak = train.max()
            train = train / peak
            valid = valid / peak
            test = test / peak
        else:
            # avicenna and sylvester: standardize with the train statistics
            center = train.mean()
            spread = train.std()
            train = (train - center) / spread
            valid = (valid - center) / spread
            test = (test - center) / spread

    if not transfer:
        return train, valid, test
    labels = load_filetensor(subset_path('transfer'))
    return train, valid, test, labels

def load_sparse_dataset(name, normalize=True, transfer=False):
    """ Load the train, valid and test subsets of dataset `name` and
        return them in sparse format.

    Every subset is read with `load_sparse` from
    ``<config.data_root()>/UTLC/sparse/<name>_<subset>.npy``.

    :param name: one of 'harry', 'terry', 'ule'
    :param normalize: If True, the three subsets are cast to
                      theano.config.floatX and divided by a per-dataset
                      constant (ule: 255, harry: a hard-coded constant,
                      terry: 300) before being returned
    :param transfer: If True also return the transfer label
                     (read from ``<name>_transfer.npy``)
    :return: ``(train, valid, test)``, or
             ``(train, valid, test, transfer labels)`` when `transfer`
             is true
    """
    assert name in ['harry','terry','ule']

    def subset_path(subset):
        # <data_root>/UTLC/sparse/<name>_<subset>.npy
        return os.path.join(config.data_root(), 'UTLC', 'sparse',
                            name + '_' + subset + '.npy')

    train_path = subset_path('train')
    valid_path = subset_path('valid')
    test_path = subset_path('test')
    train = load_sparse(train_path)
    valid = load_sparse(valid_path)
    test = load_sparse(test_path)

    if normalize:
        # The datasets only differ by the constant they are divided by.
        divisors = {
            "ule": 255,
            # hard-coded in place of train.std(), which is slow to compute
            "harry": 0.69336046033925791,
            "terry": 300,
        }
        if name not in divisors:
            raise Exception("This dataset don't have its normalization defined")
        divisor = divisors[name]

        train = train.astype(theano.config.floatX)
        valid = valid.astype(theano.config.floatX)
        test = test.astype(theano.config.floatX)
        train = train / divisor
        valid = valid / divisor
        test = test / divisor

    if not transfer:
        return train, valid, test
    labels = load_sparse(subset_path('transfer'))
    return train, valid, test, labels
    
def load_filetensor(fname):
    """ Read the filetensor file `fname` and return its content.

    When `fname` does not exist, ``fname + '.gz'`` is opened with gzip
    instead.

    :param fname: path of the file, without the '.gz' suffix
    :return: the value returned by ``ft.read`` for this file
    :raises AssertionError: if neither `fname` nor ``fname + '.gz'`` exists
    """
    if os.path.exists(fname):
        path, opener = fname, open
    else:
        path, opener = fname + '.gz', gzip.open
        assert os.path.exists(path), (
            "neither %r nor %r exists" % (fname, path))

    # The file content is binary: open it in binary mode so the bytes are
    # never translated by the platform (text mode is not safe everywhere).
    f = opener(path, 'rb')
    try:
        return ft.read(f)
    finally:
        f.close()

def load_sparse(fname):
    """ Unpickle the object stored in `fname` and return it.

    When `fname` does not exist, ``fname + '.gz'`` is opened with gzip
    instead.

    NOTE: unpickling can execute arbitrary code, so this must only be used
    on trusted data files.

    :param fname: path of the pickle file, without the '.gz' suffix
    :return: the unpickled object
    :raises AssertionError: if neither `fname` nor ``fname + '.gz'`` exists
    """
    if os.path.exists(fname):
        path, opener = fname, open
    else:
        path, opener = fname + '.gz', gzip.open
        assert os.path.exists(path), (
            "neither %r nor %r exists" % (fname, path))

    # A pickle is binary data: open it in binary mode so the bytes are
    # never translated by the platform (text mode is not safe everywhere).
    f = opener(path, 'rb')
    try:
        return cPickle.load(f)
    finally:
        f.close()

if __name__ == '__main__':
    # Smoke test and usage example: loads every dataset with both loaders,
    # prints a few statistics and checks the types and shapes returned.
    # It needs the UTLC data files to be available under config.data_root().
    #
    # Every print below receives ONE pre-built string, so the statement
    # parses and prints the same space-separated text whether `print` is a
    # statement (Python 2) or a function.
    import numpy
    import scipy.sparse

    # Test loading of transfer data
    train, valid, test, transfer = load_ndarray_dataset("ule", normalize=True, transfer=True)
    # one transfer label row per train example
    assert train.shape[0]==transfer.shape[0]

    for name in ['avicenna','harry','rita','sylvester','ule']:
        train, valid, test = load_ndarray_dataset(name, normalize=True)
        print(name + " dtype, max, min, mean, std")
        print(" ".join(str(v) for v in (
            train.dtype, train.max(), train.min(), train.mean(), train.std())))
        assert isinstance(train, numpy.ndarray)
        assert isinstance(valid, numpy.ndarray)
        assert isinstance(test, numpy.ndarray)
        # the three subsets must have the same number of columns
        assert train.shape[1]==test.shape[1]==valid.shape[1]

    # Test loading of transfer data
    train, valid, test, transfer = load_sparse_dataset("ule", normalize=True, transfer=True)
    assert train.shape[0]==transfer.shape[0]

    for name in ['harry','terry','ule']:
        train, valid, test = load_sparse_dataset(name, normalize=True)
        nb_elem = numpy.prod(train.shape)
        # train.data only holds the stored entries: clamp with 0 so that
        # the min/max also account for the entries that are not stored.
        mi = train.data.min()
        ma = train.data.max()
        mi = min(0, mi)
        ma = max(0, ma)
        # mean over ALL the elements, not only the stored ones
        su = train.data.sum()
        mean = float(su)/nb_elem
        # plain concatenation on purpose: the label contains a '%'
        print(name + " dtype, max, min, mean, nb non-zero, nb element, %sparse")
        print(" ".join(str(v) for v in (
            train.dtype, ma, mi, mean, train.nnz, nb_elem,
            (nb_elem-float(train.nnz))/nb_elem)))
        print(name + " max, min, mean, std (all stats on non-zero element)")
        print(" ".join(str(v) for v in (
            train.data.max(), train.data.min(), train.data.mean(),
            train.data.std())))
        assert scipy.sparse.issparse(train)
        assert scipy.sparse.issparse(valid)
        assert scipy.sparse.issparse(test)
        assert train.shape[1]==test.shape[1]==valid.shape[1]