changeset 1426:4988f8ea0836

in utlc.py, implement a parameter that returns a Theano variable representing the normalized data. Useful for on-the-fly normalization.
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 08 Feb 2011 12:33:33 -0500
parents 25985fb3bb4f
children a36d3a406c59
files pylearn/datasets/utlc.py
diffstat 1 files changed, 48 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/utlc.py	Tue Feb 08 11:14:37 2011 -0500
+++ b/pylearn/datasets/utlc.py	Tue Feb 08 12:33:33 2011 -0500
@@ -16,57 +16,73 @@
 import pylearn.io.filetensor as ft
 import config
 
-def load_ndarray_dataset(name, normalize=True, transfer=False):
+def load_ndarray_dataset(name, normalize=True, transfer=False, normalize_on_the_fly=False):
     """ Load the train,valid,test data for the dataset `name`
         and return it in ndarray format.
     
     :param normalize: If True, we normalize the train dataset
                       before returning it
     :param transfer: If True also return the transfer label
+    :param normalize_on_the_fly: If True, we return a Theano variable that will give
+                                 as output the normalized value. If the user only
+                                 takes a subtensor of that variable, Theano optimization
+                                 should ensure that only the subtensor portion that is
+                                 computed is kept in memory, in normalized form. We store
+                                 the original data in shared memory in its original dtype.
+
+                                 This is useful to keep the original data in its original
+                                 dtype in memory, to save memory. Especially useful to
+                                 be able to use rita and harry with 1G per job.
     """
+    assert not (normalize and normalize_on_the_fly), "Can't normalize in 2 way at the same time!"
+    
     assert name in ['avicenna','harry','rita','sylvester','ule']
-    trname,vname,tename = [os.path.join(config.data_root(),
-                                        'UTLC','filetensor',
-                                        name+'_'+subset+'.ft') 
+    common = os.path.join('UTLC','filetensor',name+'_')
+    trname,vname,tename = [config.get_filepath_in_roots(common+subset+'.ft.gz',
+                                                        common+subset+'.ft')
                            for subset in ['train','valid','test']]
+
     train = load_filetensor(trname)
     valid = load_filetensor(vname)
     test = load_filetensor(tename)
-    if normalize:
-        if name == "ule":
-            train = numpy.asarray(train, theano.config.floatX) / 255
-            valid = numpy.asarray(valid, theano.config.floatX) / 255
-            test = numpy.asarray(test, theano.config.floatX) / 255
-        elif name in ["avicenna", "sylvester"]:
+
+    if normalize or normalize_on_the_fly:
+        if normalize_on_the_fly:
+            train = theano.shared(train, borrow=True, name=name+"_train")
+            valid = theano.shared(valid, borrow=True, name=name+"_valid")
+            test = theano.shared(test, borrow=True, name=name+"_test")
+        else:
             train = numpy.asarray(train, theano.config.floatX)
             valid = numpy.asarray(valid, theano.config.floatX)
             test = numpy.asarray(test, theano.config.floatX)
-            mean = train.mean()
-            std = train.std()
-            train -= mean
-            valid -= mean
-            test -= mean
-            train /= std
-            valid /= std
-            test /= std
+
+        if name == "ule":
+            train /= 255
+            valid /= 255
+            test /= 255
+        elif name in ["avicenna", "sylvester"]:
+            if name == "avicenna":
+                train_mean = 514.62154022835455
+                train_std = 6.829096494224145
+            else:
+                train_mean = 403.81889927027686
+                train_std = 96.43841050784053
+            train -= train_mean
+            valid -= train_mean
+            test -= train_mean
+            train /= train_std
+            valid /= train_std
+            test /= train_std
         elif name == "harry":
-            #force float32 as otherwise too big to keep in memory completly
-            train = numpy.asarray(train, "float32")
-            valid = numpy.asarray(valid, "float32")
-            test = numpy.asarray(test, "float32")
             std = 0.69336046033925791#train.std()slow to compute
             train /= std
             valid /= std
             test /= std  
         elif name == "rita":
-            #force float32 as otherwise too big to keep in memory completly
-            train = numpy.asarray(train, "float32")
-            valid = numpy.asarray(valid, "float32")
-            test = numpy.asarray(test, "float32")
-            max = train.max()
-            train /= max
-            valid /= max
-            test /= max  
+            v = numpy.asarray(230, dtype=theano.config.floatX)
+            train /= v
+            valid /= v
+            test /= v
         else:
             raise Exception("This dataset don't have its normalization defined")
     if transfer:
@@ -142,6 +158,8 @@
             fname = fname+'.gz'
             assert os.path.exists(fname)
             f = gzip.open(fname)
+        elif fname.endswith('.gz'):
+            f = gzip.open(fname)
         else:
             f = open(fname)
         d = ft.read(f)