annotate pylearn/datasets/utlc.py @ 1406:6003f733a994

added the normalization of the last UTLC dataset
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 25 Jan 2011 04:16:33 -0500
parents 89017617ab36
children 2993b2a5c1af
rev   line source
1402
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
1 """
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
2 Users should call the load_ndarray_dataset or load_sparse_dataset function.
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
3
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
4 See the file ${PYLEARN_DATA_ROOT}/UTLC/README for details on the datasets.
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
5
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
6 See the end of this file for an example.
1402
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
7 """
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
8
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
9 import cPickle
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
10 import gzip
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
11 import os
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
12
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
13 import numpy
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
14 import theano
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
15
1402
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
16 import pylearn.io.filetensor as ft
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
17 import config
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
18
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
def load_ndarray_dataset(name, normalize=True):
    """Load the train, valid and test splits of a dense UTLC dataset.

    Each split is read with `load_filetensor` from
    ``<config.data_root()>/UTLC/filetensor/<name>_<subset>.ft``
    (`load_filetensor` falls back to the ``.gz`` file when needed).

    :param name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'.
    :param normalize: if true, cast the splits to a floating point dtype
        and rescale them with the per-dataset rule listed below; if
        false, return the splits exactly as loaded.
    :return: the tuple ``(train, valid, test)``.
    :raises AssertionError: if `name` is not one of the names above.

    Normalization rules (statistics always come from the train split):

    - ule: cast to ``theano.config.floatX`` and divide by 255.
    - avicenna, sylvester: cast to ``theano.config.floatX``, subtract
      the train mean and divide by the train std (both are scalars
      computed over the whole train array).
    - harry: cast to float32 and divide by a hard-coded std.
    - rita: cast to float32 and divide by the train maximum.
    """
    assert name in ['avicenna','harry','rita','sylvester','ule']
    data_dir = os.path.join(config.data_root(), 'UTLC', 'filetensor')
    trname, vname, tename = [os.path.join(data_dir, name+'_'+subset+'.ft')
                             for subset in ['train','valid','test']]
    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)

    if not normalize:
        return train, valid, test

    # NOTE: the splits are deliberately converted and rescaled one
    # statement at a time instead of being collected in a list first, so
    # that no more arrays than in the original code are alive at once
    # (harry and rita are large, see the float32 comments below).
    if name == "ule":
        train = numpy.asarray(train, theano.config.floatX) / 255
        valid = numpy.asarray(valid, theano.config.floatX) / 255
        test = numpy.asarray(test, theano.config.floatX) / 255
    elif name in ["avicenna", "sylvester"]:
        train = numpy.asarray(train, theano.config.floatX)
        valid = numpy.asarray(valid, theano.config.floatX)
        test = numpy.asarray(test, theano.config.floatX)
        # Scalar statistics of the train split, reused for valid/test.
        mean = train.mean()
        std = train.std()
        train = (train - mean) / std
        valid = (valid - mean) / std
        test = (test - mean) / std
    elif name == "harry":
        # Force float32, otherwise too big to keep completely in memory.
        train = numpy.asarray(train, "float32")
        valid = numpy.asarray(valid, "float32")
        test = numpy.asarray(test, "float32")
        # Hard-coded because train.std() is slow to compute.
        std = 0.69336046033925791
        train = train / std
        valid = valid / std
        test = test / std
    elif name == "rita":
        # Force float32, otherwise too big to keep completely in memory.
        train = numpy.asarray(train, "float32")
        valid = numpy.asarray(valid, "float32")
        test = numpy.asarray(test, "float32")
        # Named train_max so the builtin max() is not shadowed.
        train_max = train.max()
        train = train / train_max
        valid = valid / train_max
        test = test / train_max
    else:
        # Only reachable when the assert above is disabled (python -O).
        raise Exception("This dataset don't have its normalization defined")
    return train, valid, test
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
63
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
def load_sparse_dataset(name, normalize=True):
    """Load the train, valid and test splits of a sparse UTLC dataset.

    Each split is read with `load_sparse` from
    ``<config.data_root()>/UTLC/sparse/<name>_<subset>.npy``.

    :param name: one of 'harry', 'terry', 'ule'.
    :param normalize: if true, cast every split to
        ``theano.config.floatX`` and divide it by the dataset's constant
        (ule: 255, harry: a hard-coded std, terry: 300); if false,
        return the splits exactly as loaded.
    :return: the tuple ``(train, valid, test)``.
    :raises AssertionError: if `name` is not one of the names above.
    """
    assert name in ['harry','terry','ule']
    sparse_dir = os.path.join(config.data_root(), 'UTLC', 'sparse')
    paths = [os.path.join(sparse_dir, name+'_'+subset+'.npy')
             for subset in ['train','valid','test']]
    train = load_sparse(paths[0])
    valid = load_sparse(paths[1])
    test = load_sparse(paths[2])

    if not normalize:
        return train, valid, test

    # Every dataset is normalized the same way: cast, then divide by a
    # dataset-specific scalar.
    divisors = {
        "ule": 255,
        # Hard-coded because train.std() is slow to compute.
        "harry": 0.69336046033925791,
        "terry": 300,
    }
    if name not in divisors:
        # Only reachable when the assert above is disabled (python -O).
        raise Exception("This dataset don't have its normalization defined")

    divisor = divisors[name]
    float_dtype = theano.config.floatX
    train = train.astype(float_dtype) / divisor
    valid = valid.astype(float_dtype) / divisor
    test = test.astype(float_dtype) / divisor
    return train, valid, test
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
96
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
def load_filetensor(fname):
    """Read the content of one filetensor file.

    If `fname` does not exist, ``fname + '.gz'`` is opened through gzip
    instead.

    :param fname: path of the file, without any '.gz' suffix.
    :return: the value returned by `ft.read` for the opened file.
    :raises AssertionError: if neither `fname` nor ``fname + '.gz'``
        exists.
    """
    f = None
    try:
        if not os.path.exists(fname):
            fname = fname+'.gz'
            assert os.path.exists(fname)
            f = gzip.open(fname)
        else:
            # Open in binary mode so the reader gets the same raw bytes
            # as on the gzip path; text mode may translate line endings
            # on some platforms.
            f = open(fname, 'rb')
        d = ft.read(f)
    finally:
        # f is still None when the existence check or the open failed.
        if f is not None:
            f.close()

    return d
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
112
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
def load_sparse(fname):
    """Unpickle the object stored in one sparse dataset file.

    If `fname` does not exist, ``fname + '.gz'`` is opened through gzip
    instead.

    :param fname: path of the pickle file, without any '.gz' suffix.
    :return: the object stored in the file.
    :raises AssertionError: if neither `fname` nor ``fname + '.gz'``
        exists.
    """
    f = None
    try:
        if not os.path.exists(fname):
            fname = fname+'.gz'
            assert os.path.exists(fname)
            f = gzip.open(fname)
        else:
            # Pickle streams are binary data: open in binary mode so the
            # bytes are read untouched, as on the gzip path.
            f = open(fname, 'rb')
        # NOTE(security): unpickling can execute arbitrary code; only use
        # this on trusted dataset files.
        d = cPickle.load(f)
    finally:
        # f is still None when the existence check or the open failed.
        if f is not None:
            f.close()
    return d
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
127
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
128 if __name__ == '__main__':
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
129 import numpy
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
130 import scipy.sparse
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
131 for name in ['avicenna','harry','rita','sylvester','ule']:
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
132 train, valid, test = load_ndarray_dataset(name, normalize=True)
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
133 print name,"dtype, max, min, mean, std"
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
134 print train.dtype, train.max(), train.min(), train.mean(), train.std()
1402
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
135 assert isinstance(train, numpy.ndarray)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
136 assert isinstance(valid, numpy.ndarray)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
137 assert isinstance(test, numpy.ndarray)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
138 assert train.shape[1]==test.shape[1]==valid.shape[1]
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
139
1406
6003f733a994 added the normalization of the last UTLC dataset
Frederic Bastien <nouiz@nouiz.org>
parents: 1404
diff changeset
140 for name in ['harry','terry','ule']:
6003f733a994 added the normalization of the last UTLC dataset
Frederic Bastien <nouiz@nouiz.org>
parents: 1404
diff changeset
141 train, valid, test = load_sparse_dataset(name, normalize=True)
1404
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
142 nb_elem = numpy.prod(train.shape)
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
143 mi = train.data.min()
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
144 ma = train.data.max()
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
145 mi = min(0, mi)
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
146 ma = max(0, ma)
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
147 su = train.data.sum()
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
148 mean = float(su)/nb_elem
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
149 print name,"dtype, max, min, mean, nb non-zero, nb element, %sparse"
89017617ab36 normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents: 1402
diff changeset
150 print train.dtype, ma, mi, mean, train.nnz, nb_elem, (nb_elem-float(train.nnz))/nb_elem
1406
6003f733a994 added the normalization of the last UTLC dataset
Frederic Bastien <nouiz@nouiz.org>
parents: 1404
diff changeset
151 print name,"max, min, mean, std (all stats on non-zero element)"
6003f733a994 added the normalization of the last UTLC dataset
Frederic Bastien <nouiz@nouiz.org>
parents: 1404
diff changeset
152 print train.data.max(), train.data.min(), train.data.mean(), train.data.std()
1402
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
153 assert scipy.sparse.issparse(train)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
154 assert scipy.sparse.issparse(valid)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
155 assert scipy.sparse.issparse(test)
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
156 assert train.shape[1]==test.shape[1]==valid.shape[1]
b14f3d6f5cd4 first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff changeset
157