Mercurial > pylearn
annotate pylearn/datasets/utlc.py @ 1406:6003f733a994
added the normalization of the last UTLC dataset
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Tue, 25 Jan 2011 04:16:33 -0500 |
parents | 89017617ab36 |
children | 2993b2a5c1af |
rev | line source |
---|---|
1402
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
1 """ |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
2 user should use the load_ndarray_dataset or load_sparse_dataset function |
1404
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
3 |
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
4 See the file ${PYLEARN_DATA_ROOT}/UTLC/README for detail on the datasets. |
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
5 |
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
6 See the end of this file for an example. |
1402
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
7 """ |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
8 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
9 import cPickle |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
10 import gzip |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
11 import os |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
12 |
1404
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
13 import numpy |
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
14 import theano |
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
15 |
1402
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
16 import pylearn.io.filetensor as ft |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
17 import config |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
18 |
1404
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
def load_ndarray_dataset(name, normalize=True):
    """Load the train, valid and test sets of a dense UTLC dataset.

    Each set is read with load_filetensor from
    <config.data_root()>/UTLC/filetensor/<name>_<subset>.ft
    (load_filetensor falls back to the same path with a '.gz' suffix).

    name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'.
    normalize: when true, the three sets are converted to a float
        ndarray and rescaled with a per-dataset rule:
          - 'ule': cast to theano.config.floatX and divide by 255.
          - 'avicenna', 'sylvester': cast to theano.config.floatX, then
            subtract the mean and divide by the std, both computed over
            the whole training set.
          - 'harry': cast to float32 and divide by a hard-coded constant
            (the precomputed std of the training set, per the original
            comment).
          - 'rita': cast to float32 and divide by the training set max.
        When false, the sets are returned exactly as loaded.

    Returns the tuple (train, valid, test).

    Raises AssertionError for an unknown name, and Exception when
    normalization is requested for a dataset without a rule.
    """
    assert name in ['avicenna','harry','rita','sylvester','ule']

    # Build the three file paths first, then load them in a fixed order.
    paths = []
    for subset in ['train', 'valid', 'test']:
        filename = name + '_' + subset + '.ft'
        paths.append(os.path.join(config.data_root(),
                                  'UTLC', 'filetensor', filename))
    train = load_filetensor(paths[0])
    valid = load_filetensor(paths[1])
    test = load_filetensor(paths[2])

    if not normalize:
        return train, valid, test

    # The sets are rebound one at a time (rather than through a list
    # comprehension) so each source array can be released as soon as its
    # converted copy exists; some of these datasets are large.
    if name == "ule":
        float_x = theano.config.floatX
        train = numpy.asarray(train, float_x) / 255
        valid = numpy.asarray(valid, float_x) / 255
        test = numpy.asarray(test, float_x) / 255
    elif name in ("avicenna", "sylvester"):
        float_x = theano.config.floatX
        train = numpy.asarray(train, float_x)
        # Statistics come from the training set only and are reused for
        # the valid and test sets.
        center = train.mean()
        spread = train.std()
        train = (train - center) / spread
        valid = (numpy.asarray(valid, float_x) - center) / spread
        test = (numpy.asarray(test, float_x) - center) / spread
    elif name == "harry":
        # Force float32, as otherwise the data is too big to keep
        # completely in memory.
        train_std = 0.69336046033925791  # train.std() is slow to compute
        train = numpy.asarray(train, "float32") / train_std
        valid = numpy.asarray(valid, "float32") / train_std
        test = numpy.asarray(test, "float32") / train_std
    elif name == "rita":
        # Force float32, as otherwise the data is too big to keep
        # completely in memory.
        train = numpy.asarray(train, "float32")
        train_max = train.max()
        train = train / train_max
        valid = numpy.asarray(valid, "float32") / train_max
        test = numpy.asarray(test, "float32") / train_max
    else:
        raise Exception("This dataset don't have its normalization defined")
    return train, valid, test
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
63 |
1404
89017617ab36
normalize 5 of the UTLC datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
1402
diff
changeset
|
def load_sparse_dataset(name, normalize=True):
    """Load the train, valid and test sets of a sparse UTLC dataset.

    Each set is read with load_sparse from
    <config.data_root()>/UTLC/sparse/<name>_<subset>.npy
    (load_sparse falls back to the same path with a '.gz' suffix).

    name: one of 'harry', 'terry', 'ule'.
    normalize: when true, every set is cast to theano.config.floatX with
        its astype method and divided by a per-dataset constant: 255 for
        'ule', 300 for 'terry', and a hard-coded value for 'harry' (the
        precomputed std of the training set, per the original comment).
        When false, the sets are returned exactly as loaded.

    Returns the tuple (train, valid, test).

    Raises AssertionError for an unknown name, and Exception when
    normalization is requested for a dataset without a rule.
    """
    assert name in ['harry','terry','ule']

    # Build the three file paths first, then load them in a fixed order.
    paths = []
    for subset in ['train', 'valid', 'test']:
        filename = name + '_' + subset + '.npy'
        paths.append(os.path.join(config.data_root(),
                                  'UTLC', 'sparse', filename))
    train = load_sparse(paths[0])
    valid = load_sparse(paths[1])
    test = load_sparse(paths[2])

    if not normalize:
        return train, valid, test

    # Constant each dataset is divided by after the cast.
    divisors = {
        "ule": 255,
        "harry": 0.69336046033925791,  # train.std() is slow to compute
        "terry": 300,
    }
    if name not in divisors:
        raise Exception("This dataset don't have its normalization defined")

    divisor = divisors[name]
    float_x = theano.config.floatX
    train = train.astype(float_x) / divisor
    valid = valid.astype(float_x) / divisor
    test = test.astype(float_x) / divisor
    return train, valid, test
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
96 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_filetensor(fname):
    """Read one filetensor file and return what ft.read produces for it.

    If `fname` does not exist, `fname + '.gz'` is opened with gzip
    instead.

    fname: path of the filetensor file, without any '.gz' suffix.

    Returns the value returned by pylearn.io.filetensor.read.

    Raises AssertionError when neither `fname` nor `fname + '.gz'`
    exists.
    """
    f = None
    try:
        if not os.path.exists(fname):
            fname = fname + '.gz'
            assert os.path.exists(fname), (
                "filetensor file not found (also tried without '.gz'): %s"
                % fname)
            f = gzip.open(fname, 'rb')
        else:
            # Filetensor files hold binary data: text mode would translate
            # line endings on Windows and decode bytes on Python 3.
            f = open(fname, 'rb')
        d = ft.read(f)
    finally:
        # f is still None when the existence check or the open failed.
        if f is not None:
            f.close()

    return d
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
112 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_sparse(fname):
    """Unpickle and return the object stored in one dataset file.

    If `fname` does not exist, `fname + '.gz'` is opened with gzip
    instead. The file content is read with cPickle.load, whatever the
    file extension is.

    fname: path of the pickled file, without any '.gz' suffix.

    Returns the unpickled object.

    Raises AssertionError when neither `fname` nor `fname + '.gz'`
    exists.
    """
    f = None
    try:
        if not os.path.exists(fname):
            fname = fname + '.gz'
            assert os.path.exists(fname), (
                "sparse dataset file not found (also tried without '.gz'): %s"
                % fname)
            f = gzip.open(fname, 'rb')
        else:
            # Pickles are binary data: text mode would translate line
            # endings on Windows and decode bytes on Python 3.
            f = open(fname, 'rb')
        # NOTE: unpickling can execute arbitrary code; only load dataset
        # files that come from a trusted source.
        d = cPickle.load(f)
    finally:
        # f is still None when the existence check or the open failed.
        if f is not None:
            f.close()
    return d
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
127 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
if __name__ == '__main__':
    # Example / smoke test: load every dataset with normalization, print
    # summary statistics of its training set and check the returned types.
    import numpy
    import scipy.sparse

    def _show(*values):
        # Print the values space-separated on one line, exactly as a
        # Python 2 "print a, b, c" statement would.
        print(" ".join([str(value) for value in values]))

    for name in ['avicenna','harry','rita','sylvester','ule']:
        train, valid, test = load_ndarray_dataset(name, normalize=True)
        _show(name, "dtype, max, min, mean, std")
        _show(train.dtype, train.max(), train.min(), train.mean(),
              train.std())
        for subset in (train, valid, test):
            assert isinstance(subset, numpy.ndarray)
        # All three sets must have the same number of columns.
        assert train.shape[1]==test.shape[1]==valid.shape[1]

    for name in ['harry','terry','ule']:
        train, valid, test = load_sparse_dataset(name, normalize=True)
        nb_elem = numpy.prod(train.shape)
        # train.data only holds the stored entries, so 0 is folded into
        # the min/max to account for the implicit zeros.
        mi = min(0, train.data.min())
        ma = max(0, train.data.max())
        mean = float(train.data.sum())/nb_elem
        _show(name,
              "dtype, max, min, mean, nb non-zero, nb element, %sparse")
        _show(train.dtype, ma, mi, mean, train.nnz, nb_elem,
              (nb_elem-float(train.nnz))/nb_elem)
        _show(name, "max, min, mean, std (all stats on non-zero element)")
        _show(train.data.max(), train.data.min(), train.data.mean(),
              train.data.std())
        for subset in (train, valid, test):
            assert scipy.sparse.issparse(subset)
        # All three sets must have the same number of columns.
        assert train.shape[1]==test.shape[1]==valid.shape[1]
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
157 |