Mercurial > pylearn
view pylearn/datasets/utlc.py @ 1406:6003f733a994
added the normalization of the last UTLC dataset
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Tue, 25 Jan 2011 04:16:33 -0500 |
parents | 89017617ab36 |
children | 2993b2a5c1af |
line wrap: on
line source
""" user should use the load _ndarray_dataset or load_sparse_dataset function See the file ${PYLEARN_DATA_ROOT}/UTCL/README for detail on the datasets. See the end of this file for an example. """ import cPickle import gzip import os import numpy import theano import pylearn.io.filetensor as ft import config def load_ndarray_dataset(name, normalize=True): assert name in ['avicenna','harry','rita','sylvester','ule'] trname,vname,tename = [os.path.join(config.data_root(), 'UTLC','filetensor', name+'_'+subset+'.ft') for subset in ['train','valid','test']] train = load_filetensor(trname) valid = load_filetensor(vname) test = load_filetensor(tename) if normalize: if name == "ule": train = numpy.asarray(train, theano.config.floatX) / 255 valid = numpy.asarray(valid, theano.config.floatX) / 255 test = numpy.asarray(test, theano.config.floatX) / 255 elif name in ["avicenna", "sylvester"]: train = numpy.asarray(train, theano.config.floatX) valid = numpy.asarray(valid, theano.config.floatX) test = numpy.asarray(test, theano.config.floatX) mean = train.mean() std = train.std() train = (train - mean) / std valid = (valid - mean) / std test = (test - mean) / std elif name == "harry": #force float32 as otherwise too big to keep in memory completly train = numpy.asarray(train, "float32") valid = numpy.asarray(valid, "float32") test = numpy.asarray(test, "float32") std = 0.69336046033925791#train.std()slow to compute train = (train) / std valid = (valid) / std test = (test) / std elif name == "rita": #force float32 as otherwise too big to keep in memory completly train = numpy.asarray(train, "float32") valid = numpy.asarray(valid, "float32") test = numpy.asarray(test, "float32") max = train.max() train = (train) / max valid = (valid) / max test = (test) / max else: raise Exception("This dataset don't have its normalization defined") return train, valid, test def load_sparse_dataset(name, normalize=True): assert name in ['harry','terry','ule'] trname,vname,tename = [os.path.join(config.data_root(), 'UTLC','sparse', name+'_'+subset+'.npy') for subset in ['train','valid','test']] train = load_sparse(trname) valid = load_sparse(vname) test = load_sparse(tename) if normalize: if name == "ule": train = train.astype(theano.config.floatX) / 255 valid = valid.astype(theano.config.floatX) / 255 test = test.astype(theano.config.floatX) / 255 elif name == "harry": train = train.astype(theano.config.floatX) valid = valid.astype(theano.config.floatX) test = test.astype(theano.config.floatX) std = 0.69336046033925791#train.std()slow to compute train = (train) / std valid = (valid) / std test = (test) / std elif name == "terry": train = train.astype(theano.config.floatX) valid = valid.astype(theano.config.floatX) test = test.astype(theano.config.floatX) train = (train) / 300 valid = (valid) / 300 test = (test) / 300 else: raise Exception("This dataset don't have its normalization defined") return train, valid, test def load_filetensor(fname): f = None try: if not os.path.exists(fname): fname = fname+'.gz' assert os.path.exists(fname) f = gzip.open(fname) else: f = open(fname) d = ft.read(f) finally: if f: f.close() return d def load_sparse(fname): f = None try: if not os.path.exists(fname): fname = fname+'.gz' assert os.path.exists(fname) f = gzip.open(fname) else: f = open(fname) d = cPickle.load(f) finally: if f: f.close() return d if __name__ == '__main__': import numpy import scipy.sparse for name in ['avicenna','harry','rita','sylvester','ule']: train, valid, test = load_ndarray_dataset(name, normalize=True) print name,"dtype, max, min, mean, std" print train.dtype, train.max(), train.min(), train.mean(), train.std() assert isinstance(train, numpy.ndarray) assert isinstance(valid, numpy.ndarray) assert isinstance(test, numpy.ndarray) assert train.shape[1]==test.shape[1]==valid.shape[1] for name in ['harry','terry','ule']: train, valid, test = load_sparse_dataset(name, normalize=True) nb_elem = numpy.prod(train.shape) mi = train.data.min() ma = train.data.max() mi = min(0, mi) ma = max(0, ma) su = train.data.sum() mean = float(su)/nb_elem print name,"dtype, max, min, mean, nb non-zero, nb element, %sparse" print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem-float(train.nnz))/nb_elem print name,"max, min, mean, std (all stats on non-zero element)" print train.data.max(), train.data.min(), train.data.mean(), train.data.std() assert scipy.sparse.issparse(train) assert scipy.sparse.issparse(valid) assert scipy.sparse.issparse(test) assert train.shape[1]==test.shape[1]==valid.shape[1]