# HG changeset patch
# User Frederic Bastien
# Date 1297186413 18000
# Node ID 4988f8ea0836668a437960658bb6ecfafae8a529
# Parent  25985fb3bb4fb855bab9ce162dfad0d77321cf87
In utlc.py, implement a parameter that returns a Theano variable representing the normalized data. Useful for on-the-fly normalization.

diff -r 25985fb3bb4f -r 4988f8ea0836 pylearn/datasets/utlc.py
--- a/pylearn/datasets/utlc.py	Tue Feb 08 11:14:37 2011 -0500
+++ b/pylearn/datasets/utlc.py	Tue Feb 08 12:33:33 2011 -0500
@@ -16,57 +16,73 @@
 import pylearn.io.filetensor as ft
 import config
 
-def load_ndarray_dataset(name, normalize=True, transfer=False):
+def load_ndarray_dataset(name, normalize=True, transfer=False, normalize_on_the_fly=False):
     """ Load the train,valid,test data for the dataset `name`
         and return it in ndarray format.
 
     :param normalize: If True, we normalize the train dataset
                       before returning it
     :param transfer: If True also return the transfer label
+    :param normalize_on_the_fly: If True, we return a Theano variable that gives
+                                 the normalized values as output. If the user only
+                                 takes a subtensor of that variable, Theano's
+                                 optimizations should ensure that only that subtensor
+                                 is computed in normalized form. We store the
+                                 original data in a shared variable in its original dtype.
+
+                                 This is useful to keep the original data in its
+                                 original dtype in memory, to save memory. Especially
+                                 useful to be able to use rita and harry with 1G per job.
     """
+    assert not (normalize and normalize_on_the_fly), "Can't normalize in two ways at the same time!"
+
     assert name in ['avicenna','harry','rita','sylvester','ule']
-    trname,vname,tename = [os.path.join(config.data_root(),
-                                        'UTLC','filetensor',
-                                        name+'_'+subset+'.ft')
+    common = os.path.join('UTLC','filetensor',name+'_')
+    trname,vname,tename = [config.get_filepath_in_roots(common+subset+'.ft.gz',
+                                                        common+subset+'.ft')
                            for subset in ['train','valid','test']]
+
    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)
-    if normalize:
-        if name == "ule":
-            train = numpy.asarray(train, theano.config.floatX) / 255
-            valid = numpy.asarray(valid, theano.config.floatX) / 255
-            test = numpy.asarray(test, theano.config.floatX) / 255
-        elif name in ["avicenna", "sylvester"]:
+
+    if normalize or normalize_on_the_fly:
+        if normalize_on_the_fly:
+            train = theano.shared(train, borrow=True, name=name+"_train")
+            valid = theano.shared(valid, borrow=True, name=name+"_valid")
+            test = theano.shared(test, borrow=True, name=name+"_test")
+        else:
             train = numpy.asarray(train, theano.config.floatX)
             valid = numpy.asarray(valid, theano.config.floatX)
             test = numpy.asarray(test, theano.config.floatX)
-            mean = train.mean()
-            std = train.std()
-            train -= mean
-            valid -= mean
-            test -= mean
-            train /= std
-            valid /= std
-            test /= std
+
+        if name == "ule":
+            train /= 255
+            valid /= 255
+            test /= 255
+        elif name in ["avicenna", "sylvester"]:
+            if name == "avicenna":
+                train_mean = 514.62154022835455
+                train_std = 6.829096494224145
+            else:
+                train_mean = 403.81889927027686
+                train_std = 96.43841050784053
+            train -= train_mean
+            valid -= train_mean
+            test -= train_mean
+            train /= train_std
+            valid /= train_std
+            test /= train_std
         elif name == "harry":
-            #force float32 as otherwise too big to keep in memory completly
-            train = numpy.asarray(train, "float32")
-            valid = numpy.asarray(valid, "float32")
-            test = numpy.asarray(test, "float32")
             std = 0.69336046033925791#train.std()slow to compute
             train /= std
             valid /= std
             test /= std
         elif name == "rita":
-            #force float32 as otherwise too big to keep in memory completly
-            train = numpy.asarray(train, "float32")
-            valid = numpy.asarray(valid, "float32")
-            test = numpy.asarray(test, "float32")
-            max = train.max()
-            train /= max
-            valid /= max
-            test /= max
+            v = numpy.asarray(230, dtype=theano.config.floatX)
+            train /= v
+            valid /= v
+            test /= v
         else:
             raise Exception("This dataset don't have its normalization defined")
     if transfer:
@@ -142,6 +158,8 @@
         fname = fname+'.gz'
         assert os.path.exists(fname)
         f = gzip.open(fname)
+    elif fname.endswith('.gz'):
+        f = gzip.open(fname)
     else:
         f = open(fname)
     d = ft.read(f)