Mercurial > pylearn
annotate pylearn/datasets/utlc.py @ 1402:b14f3d6f5cd4
first version of a script to load the utlc datasets.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Fri, 21 Jan 2011 17:05:46 -0500 |
parents | |
children | 89017617ab36 |
rev | line source |
---|---|
1402
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
1 """ |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
2 Users should use the load_ndarray_dataset or load_sparse_dataset function. |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
3 See the file PYLEARN_DB_PATH/UTLC/README for details on the datasets. |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
4 See the end of this file for an example on how to load the file. |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
5 """ |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
6 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
7 import cPickle |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
8 import gzip |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
9 import os |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
10 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
11 import pylearn.io.filetensor as ft |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
12 import config |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
13 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_ndarray_dataset(name):
    """Load the train/valid/test subsets of a dense UTLC dataset.

    Each subset is read with load_filetensor from
    ``<config.data_root()>/UTLC/filetensor/<name>_<subset>.ft``.

    :param name: dataset name; one of 'avicenna', 'harry', 'rita',
        'sylvester' or 'ule'.
    :returns: a ``(train, valid, test)`` tuple holding what load_filetensor
        returned for each subset file.
    :raises ValueError: if `name` is not one of the names listed above.
    """
    known_names = ('avicenna', 'harry', 'rita', 'sylvester', 'ule')
    # Explicit check instead of `assert`: asserts are skipped under
    # `python -O`, which would turn a bad name into a confusing
    # "file not found" failure further down.
    if name not in known_names:
        raise ValueError("unknown ndarray dataset %r; expected one of %s"
                         % (name, ', '.join(known_names)))

    base_dir = os.path.join(config.data_root(), 'UTLC', 'filetensor')
    train, valid, test = [
        load_filetensor(os.path.join(base_dir, name + '_' + subset + '.ft'))
        for subset in ('train', 'valid', 'test')
    ]
    return train, valid, test
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
24 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_sparse_dataset(name):
    """Load the train/valid/test subsets of a sparse UTLC dataset.

    Each subset is read with load_sparse from
    ``<config.data_root()>/UTLC/sparse/<name>_<subset>.npy``.

    :param name: dataset name; one of 'harry', 'terry' or 'ule'.
    :returns: a ``(train, valid, test)`` tuple holding what load_sparse
        returned for each subset file.
    :raises ValueError: if `name` is not one of the names listed above.
    """
    known_names = ('harry', 'terry', 'ule')
    # Explicit check instead of `assert`: asserts are skipped under
    # `python -O`, which would turn a bad name into a confusing
    # "file not found" failure further down.
    if name not in known_names:
        raise ValueError("unknown sparse dataset %r; expected one of %s"
                         % (name, ', '.join(known_names)))

    base_dir = os.path.join(config.data_root(), 'UTLC', 'sparse')
    train, valid, test = [
        load_sparse(os.path.join(base_dir, name + '_' + subset + '.npy'))
        for subset in ('train', 'valid', 'test')
    ]
    return train, valid, test
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
35 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_filetensor(fname):
    """Read one filetensor file, falling back to its gzip-compressed copy.

    When `fname` does not exist, ``fname + '.gz'`` is opened through gzip
    instead. The opened file is always closed before returning.

    :param fname: path of the filetensor file, without any ``.gz`` suffix.
    :returns: whatever ``ft.read`` returns for the opened file.
    :raises IOError: if neither `fname` nor ``fname + '.gz'`` exists.
    """
    if os.path.exists(fname):
        # Binary mode: the file content is handed to ft.read unmodified
        # (text mode may translate newlines on some platforms).
        f = open(fname, 'rb')
    else:
        gz_name = fname + '.gz'
        # Explicit error instead of `assert`, which is skipped under -O.
        if not os.path.exists(gz_name):
            raise IOError("neither %r nor %r exists" % (fname, gz_name))
        f = gzip.open(gz_name, 'rb')

    try:
        return ft.read(f)
    finally:
        f.close()
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
51 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
def load_sparse(fname):
    """Unpickle one sparse-dataset file, falling back to its gzip copy.

    When `fname` does not exist, ``fname + '.gz'`` is opened through gzip
    instead. The opened file is always closed before returning.

    :param fname: path of the pickled file, without any ``.gz`` suffix.
    :returns: the object stored in the pickle.
    :raises IOError: if neither `fname` nor ``fname + '.gz'`` exists.
    """
    # Local import so the function works with either pickle module name.
    try:
        import cPickle as pickle_module
    except ImportError:
        import pickle as pickle_module

    if os.path.exists(fname):
        # Pickle data is binary; text mode can corrupt it on platforms
        # that translate newlines.
        f = open(fname, 'rb')
    else:
        gz_name = fname + '.gz'
        # Explicit error instead of `assert`, which is skipped under -O.
        if not os.path.exists(gz_name):
            raise IOError("neither %r nor %r exists" % (fname, gz_name))
        f = gzip.open(gz_name, 'rb')

    try:
        # NOTE(review): unpickling can execute arbitrary code; only use
        # this on trusted dataset files.
        return pickle_module.load(f)
    finally:
        f.close()
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
66 |
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
if __name__ == '__main__':
    # Usage example / self-check: load every known dataset and verify that
    # the three subsets have the expected container type and the same
    # number of columns.
    import numpy
    import scipy.sparse

    for name in ['avicenna', 'harry', 'rita', 'sylvester', 'ule']:
        train, valid, test = load_ndarray_dataset(name)
        for subset in (train, valid, test):
            assert isinstance(subset, numpy.ndarray)
        assert train.shape[1] == test.shape[1] == valid.shape[1]

    for name in ['harry', 'terry', 'ule']:
        train, valid, test = load_sparse_dataset(name)
        for subset in (train, valid, test):
            assert scipy.sparse.issparse(subset)
        assert train.shape[1] == test.shape[1] == valid.shape[1]
b14f3d6f5cd4
first version of a script to load the utlc datasets.
Frederic Bastien <nouiz@nouiz.org>
parents:
diff
changeset
|
85 |