Mercurial > pylearn
view pylearn/datasets/utlc.py @ 1404:89017617ab36
normalize 5 of the UTLC datasets.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Mon, 24 Jan 2011 13:18:43 -0500 |
parents | b14f3d6f5cd4 |
children | 6003f733a994 |
line wrap: on
line source
"""Loaders for the UTLC datasets.

Users should call the load_ndarray_dataset or load_sparse_dataset function.

See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.
See the end of this file for an example.
"""
import cPickle
import gzip
import os

import numpy
import theano

import pylearn.io.filetensor as ft
import config

# Hard-coded stand-in for train.std() of the "harry" training set; the
# original author noted that computing it on the fly is slow.
_HARRY_STD = 0.69336046033925791

# Every dataset is stored as three files, one per subset, in this order.
_SUBSETS = ['train', 'valid', 'test']


def _dataset_paths(subdir, name, ext):
    """Return the [train, valid, test] file paths of dataset `name`.

    The files live in <data_root>/UTLC/<subdir>/ and are named
    <name>_<subset><ext>.
    """
    return [os.path.join(config.data_root(), 'UTLC', subdir,
                         name + '_' + subset + ext)
            for subset in _SUBSETS]


def _open_maybe_gzipped(fname):
    """Open `fname` for binary reading, falling back to `fname`.gz.

    Raises AssertionError when neither the plain nor the gzipped file
    exists.
    """
    if os.path.exists(fname):
        # The payload is binary (filetensor / pickle): never use text mode.
        return open(fname, 'rb')
    gz_name = fname + '.gz'
    assert os.path.exists(gz_name), \
        "neither %s nor %s exists" % (fname, gz_name)
    return gzip.open(gz_name)


def load_ndarray_dataset(name, normalize=True):
    """Load the dense version of a UTLC dataset.

    :param name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'.
    :param normalize: when True, cast and rescale the three subsets with
        the per-dataset rule below; when False, return the data exactly
        as read from disk.
    :returns: the tuple (train, valid, test).

    Normalization rules (statistics always come from the train subset):
      - ule: cast to floatX and divide by 255.
      - avicenna, sylvester: cast to floatX, subtract the global train
        mean and divide by the global train std.
      - harry: cast to float32 and divide by a precomputed std.
      - rita: cast to float32 and divide by the train maximum.
    """
    assert name in ['avicenna', 'harry', 'rita', 'sylvester', 'ule'], \
        "unknown ndarray dataset: %r" % (name,)
    trname, vname, tename = _dataset_paths('filetensor', name, '.ft')
    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)
    if not normalize:
        return train, valid, test

    # NOTE: the arrays are deliberately converted one at a time (instead
    # of building a list of converted copies) so each original can be
    # released as soon as it is rebound; some datasets are large.
    if name == "ule":
        train = numpy.asarray(train, theano.config.floatX) / 255
        valid = numpy.asarray(valid, theano.config.floatX) / 255
        test = numpy.asarray(test, theano.config.floatX) / 255
    elif name in ["avicenna", "sylvester"]:
        train = numpy.asarray(train, theano.config.floatX)
        valid = numpy.asarray(valid, theano.config.floatX)
        test = numpy.asarray(test, theano.config.floatX)
        mean = train.mean()
        std = train.std()
        train = (train - mean) / std
        valid = (valid - mean) / std
        test = (test - mean) / std
    elif name == "harry":
        # Force float32, otherwise too big to keep completely in memory.
        train = numpy.asarray(train, "float32")
        valid = numpy.asarray(valid, "float32")
        test = numpy.asarray(test, "float32")
        train = train / _HARRY_STD
        valid = valid / _HARRY_STD
        test = test / _HARRY_STD
    elif name == "rita":
        # Force float32, otherwise too big to keep completely in memory.
        train = numpy.asarray(train, "float32")
        valid = numpy.asarray(valid, "float32")
        test = numpy.asarray(test, "float32")
        max_value = train.max()
        train = train / max_value
        valid = valid / max_value
        test = test / max_value
    else:
        # Only reachable when assertions are disabled (python -O).
        raise Exception("This dataset don't have its normalization defined")
    return train, valid, test


def load_sparse_dataset(name, normalize=True):
    """Load the sparse version of a UTLC dataset.

    :param name: one of 'harry', 'terry', 'ule'.
    :param normalize: when True, cast to floatX and rescale ('ule' is
        divided by 255, 'harry' by a precomputed std); when False, return
        the data exactly as read from disk.
    :returns: the tuple (train, valid, test).
    :raises Exception: when normalize is True and the dataset has no
        normalization defined. This is currently the case for 'terry',
        so it must be loaded with normalize=False.
    """
    assert name in ['harry', 'terry', 'ule'], \
        "unknown sparse dataset: %r" % (name,)
    trname, vname, tename = _dataset_paths('sparse', name, '.npy')
    train = load_sparse(trname)
    valid = load_sparse(vname)
    test = load_sparse(tename)
    if not normalize:
        return train, valid, test

    if name == "ule":
        train = train.astype(theano.config.floatX) / 255
        valid = valid.astype(theano.config.floatX) / 255
        test = test.astype(theano.config.floatX) / 255
    elif name == "harry":
        train = train.astype(theano.config.floatX)
        valid = valid.astype(theano.config.floatX)
        test = test.astype(theano.config.floatX)
        train = train / _HARRY_STD
        valid = valid / _HARRY_STD
        test = test / _HARRY_STD
    # TODO: define the normalization of "terry". An earlier, never
    # enabled draft cast to floatX and divided every subset by
    # max(train.data.max(), 0).
    else:
        raise Exception("This dataset don't have its normalization defined")
    return train, valid, test


def load_filetensor(fname):
    """Read the filetensor stored in `fname` (or in `fname`.gz).

    :returns: whatever pylearn.io.filetensor.read returns for the file
        (presumably a numpy ndarray -- the self-test below asserts so).
    :raises AssertionError: when neither file exists.
    """
    f = _open_maybe_gzipped(fname)
    try:
        return ft.read(f)
    finally:
        f.close()


def load_sparse(fname):
    """Unpickle the object stored in `fname` (or in `fname`.gz).

    :returns: the unpickled object (the self-test below expects a
        scipy.sparse matrix).
    :raises AssertionError: when neither file exists.
    """
    f = _open_maybe_gzipped(fname)
    try:
        # SECURITY NOTE: unpickling can execute arbitrary code; only load
        # dataset files that come from a trusted location.
        return cPickle.load(f)
    finally:
        f.close()


if __name__ == '__main__':
    import scipy.sparse

    # The single-argument print(...) form below produces the same output
    # as the print statement under Python 2 and also parses on Python 3.
    for name in ['avicenna', 'harry', 'rita', 'sylvester', 'ule']:
        train, valid, test = load_ndarray_dataset(name, normalize=True)
        print("%s dtype, max, min, mean, std" % name)
        print(" ".join(str(v) for v in (train.dtype, train.max(),
                                        train.min(), train.mean(),
                                        train.std())))
        assert isinstance(train, numpy.ndarray)
        assert isinstance(valid, numpy.ndarray)
        assert isinstance(test, numpy.ndarray)
        assert train.shape[1] == test.shape[1] == valid.shape[1]

    # "terry" is left out: it has no normalization defined, so loading it
    # with the default normalize=True raises.
    for name in ['harry', 'ule']:
        train, valid, test = load_sparse_dataset(name)
        nb_elem = numpy.prod(train.shape)
        # .data only holds the explicitly stored entries; the implicit
        # zeros must be taken into account for the min and the max.
        mi = min(0, train.data.min())
        ma = max(0, train.data.max())
        su = train.data.sum()
        mean = float(su) / nb_elem
        print("%s dtype, max, min, mean, nb non-zero, nb element, %%sparse"
              % name)
        print(" ".join(str(v) for v in (train.dtype, ma, mi, mean,
                                        train.nnz, nb_elem,
                                        (nb_elem - float(train.nnz))
                                        / nb_elem)))
        assert scipy.sparse.issparse(train)
        assert scipy.sparse.issparse(valid)
        assert scipy.sparse.issparse(test)
        assert train.shape[1] == test.shape[1] == valid.shape[1]