changeset 1404:89017617ab36

normalize 5 of the UTLC datasets.
author Frederic Bastien <nouiz@nouiz.org>
date Mon, 24 Jan 2011 13:18:43 -0500
parents 6ade5b39b773
children f9e4d71aa353 6003f733a994
files pylearn/datasets/utlc.py
diffstat 1 files changed, 81 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/utlc.py	Fri Jan 21 20:40:57 2011 -0500
+++ b/pylearn/datasets/utlc.py	Mon Jan 24 13:18:43 2011 -0500
@@ -1,17 +1,22 @@
 """ 
 Users should use the load_ndarray_dataset or load_sparse_dataset functions.
-See the file PYLEARN_DB_PATH/UTCL/README for detail on the datasets.
-See the end of this file for an example on how to load the file.
+
+See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.
+
+See the end of this file for an example.
 """
 
 import cPickle
 import gzip
 import os
 
+import numpy
+import theano
+
 import pylearn.io.filetensor as ft
 import config
 
-def load_ndarray_dataset(name):
+def load_ndarray_dataset(name, normalize=True):
     assert name in ['avicenna','harry','rita','sylvester','ule']
     trname,vname,tename = [os.path.join(config.data_root(),
                                         'UTLC','filetensor',
@@ -20,9 +25,43 @@
     train = load_filetensor(trname)
     valid = load_filetensor(vname)
     test = load_filetensor(tename)
+    if normalize:
+        # Each dataset gets its own normalization scheme.
+        if name == "ule":
+            # Cast to floatX and rescale the raw values to [0, 1].
+            train = numpy.asarray(train, theano.config.floatX) / 255
+            valid = numpy.asarray(valid, theano.config.floatX) / 255
+            test = numpy.asarray(test, theano.config.floatX) / 255
+        elif name in ["avicenna", "sylvester"]:
+            # Standardize with the mean and std of the training set.
+            train = numpy.asarray(train, theano.config.floatX)
+            valid = numpy.asarray(valid, theano.config.floatX)
+            test = numpy.asarray(test, theano.config.floatX)
+            mean = train.mean()
+            std = train.std()
+            train = (train - mean) / std
+            valid = (valid - mean) / std
+            test = (test - mean) / std
+        elif name == "harry":
+            # Force float32, as the data is otherwise too big to keep
+            # completely in memory.
+            train = numpy.asarray(train, "float32")
+            valid = numpy.asarray(valid, "float32")
+            test = numpy.asarray(test, "float32")
+            # Precomputed train.std(); computing it at load time is slow.
+            std = 0.69336046033925791
+            train = train / std
+            valid = valid / std
+            test = test / std
+        elif name == "rita":
+            # Force float32, as the data is otherwise too big to keep
+            # completely in memory.
+            train = numpy.asarray(train, "float32")
+            valid = numpy.asarray(valid, "float32")
+            test = numpy.asarray(test, "float32")
+            # Rescale by the training set maximum; avoid shadowing the
+            # builtin max.
+            train_max = train.max()
+            train = train / train_max
+            valid = valid / train_max
+            test = test / train_max
+        else:
+            raise ValueError("This dataset does not have its"
+                             " normalization defined")
     return train, valid, test
 
-def load_sparse_dataset(name):
+def load_sparse_dataset(name, normalize=True):
     assert name in ['harry','terry','ule']
     trname,vname,tename = [os.path.join(config.data_root(),
                                         'UTLC','sparse',
@@ -31,6 +70,30 @@
     train = load_sparse(trname)
     valid = load_sparse(vname)
     test = load_sparse(tename)
+    if normalize:
+        if name == "ule":
+            # Cast to floatX and rescale the raw values to [0, 1].
+            train = train.astype(theano.config.floatX) / 255
+            valid = valid.astype(theano.config.floatX) / 255
+            test = test.astype(theano.config.floatX) / 255
+        elif name == "harry":
+            train = train.astype(theano.config.floatX)
+            valid = valid.astype(theano.config.floatX)
+            test = test.astype(theano.config.floatX)
+            # Precomputed train.std(); computing it at load time is slow.
+            std = 0.69336046033925791
+            train = train / std
+            valid = valid / std
+            test = test / std
+        # TODO: normalization for "terry" is not implemented yet; a
+        # rescaling by max(train.data.max(), 0) was sketched but left
+        # disabled.
+        else:
+            raise ValueError("This dataset does not have its"
+                             " normalization defined")
     return train, valid, test
     
 def load_filetensor(fname):
@@ -68,18 +131,28 @@
     import numpy
     import scipy.sparse
     for name in ['avicenna','harry','rita','sylvester','ule']:
-        train, valid, test = load_ndarray_dataset(name)
+        train, valid, test = load_ndarray_dataset(name, normalize=True)
+        print name,"dtype, max, min, mean, std"
+        print train.dtype, train.max(), train.min(), train.mean(), train.std()
         assert isinstance(train, numpy.ndarray)
         assert isinstance(valid, numpy.ndarray)
         assert isinstance(test, numpy.ndarray)
-        import pdb;pdb.set_trace()
         assert train.shape[1]==test.shape[1]==valid.shape[1]
 
-    for name in ['harry','terry','ule']:
+    for name in ['harry','ule']:
         train, valid, test = load_sparse_dataset(name)
+        nb_elem = numpy.prod(train.shape)
+        # The implicit entries of a sparse matrix are zeros, so include
+        # 0 in the min/max of the stored values.
+        mi = min(0, train.data.min())
+        ma = max(0, train.data.max())
+        mean = float(train.data.sum()) / nb_elem
+        print name, "dtype, max, min, mean, nb non-zero, nb element, %sparse"
+        print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem - float(train.nnz)) / nb_elem
+
         assert scipy.sparse.issparse(train)
         assert scipy.sparse.issparse(valid)
         assert scipy.sparse.issparse(test)
-        import pdb;pdb.set_trace()
         assert train.shape[1]==test.shape[1]==valid.shape[1]
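
For reference, here is a minimal usage sketch of the new normalize flag
(Python 2, matching the code above). It assumes pylearn is importable and
that config.data_root() points at a directory containing the UTLC data in
the layout this module expects:

    from pylearn.datasets.utlc import load_ndarray_dataset, load_sparse_dataset

    # Dense dataset: "ule" is cast to floatX and rescaled to [0, 1].
    train, valid, test = load_ndarray_dataset('ule', normalize=True)
    print train.dtype, train.min(), train.max()

    # Sparse dataset: "harry" is divided by a precomputed training std.
    train, valid, test = load_sparse_dataset('harry', normalize=True)
    print train.dtype, train.shape, train.nnz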