Mercurial > pylearn
view pylearn/datasets/utlc.py @ 1411:68fdb895f53f
Allow loading of the ULE labels.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Wed, 02 Feb 2011 12:39:35 -0500 |
parents | e7844692e6e2 |
children | 4988f8ea0836 |
line wrap: on
line source
""" user should use the load _ndarray_dataset or load_sparse_dataset function See the file ${PYLEARN_DATA_ROOT}/UTCL/README for detail on the datasets. See the end of this file for an example. """ import cPickle import gzip import os import numpy import theano import pylearn.io.filetensor as ft import config def load_ndarray_dataset(name, normalize=True, transfer=False): """ Load the train,valid,test data for the dataset `name` and return it in ndarray format. :param normalize: If True, we normalize the train dataset before returning it :param transfer: If True also return the transfer label """ assert name in ['avicenna','harry','rita','sylvester','ule'] trname,vname,tename = [os.path.join(config.data_root(), 'UTLC','filetensor', name+'_'+subset+'.ft') for subset in ['train','valid','test']] train = load_filetensor(trname) valid = load_filetensor(vname) test = load_filetensor(tename) if normalize: if name == "ule": train = numpy.asarray(train, theano.config.floatX) / 255 valid = numpy.asarray(valid, theano.config.floatX) / 255 test = numpy.asarray(test, theano.config.floatX) / 255 elif name in ["avicenna", "sylvester"]: train = numpy.asarray(train, theano.config.floatX) valid = numpy.asarray(valid, theano.config.floatX) test = numpy.asarray(test, theano.config.floatX) mean = train.mean() std = train.std() train -= mean valid -= mean test -= mean train /= std valid /= std test /= std elif name == "harry": #force float32 as otherwise too big to keep in memory completly train = numpy.asarray(train, "float32") valid = numpy.asarray(valid, "float32") test = numpy.asarray(test, "float32") std = 0.69336046033925791#train.std()slow to compute train /= std valid /= std test /= std elif name == "rita": #force float32 as otherwise too big to keep in memory completly train = numpy.asarray(train, "float32") valid = numpy.asarray(valid, "float32") test = numpy.asarray(test, "float32") max = train.max() train /= max valid /= max test /= max else: raise Exception("This dataset 
don't have its normalization defined") if transfer: transfer = load_filetensor(os.path.join(config.data_root(),"UTLC","filetensor",name+"_transfer.ft")) return train, valid, test, transfer else: return train, valid, test def load_sparse_dataset(name, normalize=True, transfer=False): """ Load the train,valid,test data for the dataset `name` and return it in sparse format. :param normalize: If True, we normalize the train dataset before returning it :param transfer: If True also return the transfer label """ assert name in ['harry','terry','ule'] trname,vname,tename = [os.path.join(config.data_root(), 'UTLC','sparse', name+'_'+subset+'.npy') for subset in ['train','valid','test']] train = load_sparse(trname) valid = load_sparse(vname) test = load_sparse(tename) if normalize: if name == "ule": train = train.astype(theano.config.floatX) / 255 valid = valid.astype(theano.config.floatX) / 255 test = test.astype(theano.config.floatX) / 255 elif name == "harry": train = train.astype(theano.config.floatX) valid = valid.astype(theano.config.floatX) test = test.astype(theano.config.floatX) std = 0.69336046033925791#train.std()slow to compute train = (train) / std valid = (valid) / std test = (test) / std elif name == "terry": train = train.astype(theano.config.floatX) valid = valid.astype(theano.config.floatX) test = test.astype(theano.config.floatX) train = (train) / 300 valid = (valid) / 300 test = (test) / 300 else: raise Exception("This dataset don't have its normalization defined") if transfer: transfer = load_sparse(os.path.join(config.data_root(),"UTLC","sparse",name+"_transfer.npy")) return train, valid, test, transfer else: return train, valid, test def load_ndarray_label(name): """ Load the train,valid,test data for the dataset `name` and return it in ndarray format. This is only available for the toy dataset ule. 
""" assert name in ['ule'] trname,vname,tename = [os.path.join(config.data_root(), 'UTLC','filetensor', name+'_'+subset+'.ft') for subset in ['trainl','validl','testl']] trainl = load_filetensor(trname) validl = load_filetensor(vname) testl = load_filetensor(tename) return trainl, validl, testl def load_filetensor(fname): f = None try: if not os.path.exists(fname): fname = fname+'.gz' assert os.path.exists(fname) f = gzip.open(fname) else: f = open(fname) d = ft.read(f) finally: if f: f.close() return d def load_sparse(fname): f = None try: if not os.path.exists(fname): fname = fname+'.gz' assert os.path.exists(fname) f = gzip.open(fname) else: f = open(fname) d = cPickle.load(f) finally: if f: f.close() return d if __name__ == '__main__': import numpy import scipy.sparse # Test loading of transfer data train, valid, test, transfer = load_ndarray_dataset("ule", normalize=True, transfer=True) assert train.shape[0]==transfer.shape[0] for name in ['avicenna','harry','rita','sylvester','ule']: train, valid, test = load_ndarray_dataset(name, normalize=True) print name,"dtype, max, min, mean, std" print train.dtype, train.max(), train.min(), train.mean(), train.std() assert isinstance(train, numpy.ndarray) assert isinstance(valid, numpy.ndarray) assert isinstance(test, numpy.ndarray) assert train.shape[1]==test.shape[1]==valid.shape[1] # Test loading of transfer data train, valid, test, transfer = load_sparse_dataset("ule", normalize=True, transfer=True) assert train.shape[0]==transfer.shape[0] for name in ['harry','terry','ule']: train, valid, test = load_sparse_dataset(name, normalize=True) nb_elem = numpy.prod(train.shape) mi = train.data.min() ma = train.data.max() mi = min(0, mi) ma = max(0, ma) su = train.data.sum() mean = float(su)/nb_elem print name,"dtype, max, min, mean, nb non-zero, nb element, %sparse" print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem-float(train.nnz))/nb_elem print name,"max, min, mean, std (all stats on non-zero element)" print 
train.data.max(), train.data.min(), train.data.mean(), train.data.std() assert scipy.sparse.issparse(train) assert scipy.sparse.issparse(valid) assert scipy.sparse.issparse(test) assert train.shape[1]==test.shape[1]==valid.shape[1]