changeset 1402:b14f3d6f5cd4

first version of a script to load the utlc datasets.
author Frederic Bastien <nouiz@nouiz.org>
date Fri, 21 Jan 2011 17:05:46 -0500
parents e06c0ff46d2a
children 6ade5b39b773
files pylearn/datasets/utlc.py
diffstat 1 files changed, 85 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/utlc.py	Fri Jan 21 17:05:46 2011 -0500
@@ -0,0 +1,85 @@
+""" 
+Users should use the load_ndarray_dataset or load_sparse_dataset function.
+See the file PYLEARN_DB_PATH/UTLC/README for details on the datasets.
+See the end of this file for an example on how to load the file.
+"""
+
+import cPickle
+import gzip
+import os
+
+import pylearn.io.filetensor as ft
+import config
+
+def load_ndarray_dataset(name):
+    """Load the train, valid and test subsets of a dense UTLC dataset.
+
+    `name` must be one of 'avicenna', 'harry', 'rita', 'sylvester' or
+    'ule'.  Each subset is read with load_filetensor() from
+    <data_root>/UTLC/filetensor/<name>_<subset>.ft.
+
+    Returns the tuple (train, valid, test).
+    """
+    assert name in ['avicenna','harry','rita','sylvester','ule']
+
+    def subset_path(subset):
+        # Location of one subset file of this dataset.
+        basename = name+'_'+subset+'.ft'
+        return os.path.join(config.data_root(), 'UTLC', 'filetensor',
+                            basename)
+
+    # Build all three paths first, then load them in train/valid/test order.
+    paths = [subset_path(subset) for subset in ('train', 'valid', 'test')]
+    train, valid, test = [load_filetensor(path) for path in paths]
+    return train, valid, test
+
+def load_sparse_dataset(name):
+    """Load the train, valid and test subsets of a sparse UTLC dataset.
+
+    `name` must be one of 'harry', 'terry' or 'ule'.  Each subset is read
+    with load_sparse() from <data_root>/UTLC/sparse/<name>_<subset>.npy
+    (despite the .npy extension, load_sparse() unpickles the file).
+
+    Returns the tuple (train, valid, test).
+    """
+    assert name in ['harry','terry','ule']
+
+    def subset_path(subset):
+        # Location of one subset file of this dataset.
+        basename = name+'_'+subset+'.npy'
+        return os.path.join(config.data_root(), 'UTLC', 'sparse',
+                            basename)
+
+    # Build all three paths first, then load them in train/valid/test order.
+    paths = [subset_path(subset) for subset in ('train', 'valid', 'test')]
+    train, valid, test = [load_sparse(path) for path in paths]
+    return train, valid, test
+    
+def load_filetensor(fname):
+    """Read one filetensor file and return what ft.read() produces.
+
+    If `fname` does not exist, `fname + '.gz'` is opened with gzip
+    instead.
+
+    Raises AssertionError when neither file exists (when assertions are
+    disabled, the failure comes from gzip.open() instead).
+    """
+    f = None
+    try:
+        if os.path.exists(fname):
+            # gzip.open() below yields raw bytes, so open the plain file in
+            # binary mode too; text mode may translate newlines on some
+            # platforms and corrupt the data.
+            f = open(fname, 'rb')
+        else:
+            gz_name = fname+'.gz'
+            assert os.path.exists(gz_name), (
+                'neither %s nor %s exists' % (fname, gz_name))
+            f = gzip.open(gz_name)
+        d = ft.read(f)
+    finally:
+        # f stays None when the file could not be opened.
+        if f is not None:
+            f.close()
+
+    return d
+
+def load_sparse(fname):
+    """Unpickle one dataset file and return the stored object.
+
+    If `fname` does not exist, `fname + '.gz'` is opened with gzip
+    instead.
+
+    Raises AssertionError when neither file exists (when assertions are
+    disabled, the failure comes from gzip.open() instead).
+
+    NOTE(review): unpickling executes code embedded in the file; only use
+    this on trusted dataset files.
+    """
+    f = None
+    try:
+        if os.path.exists(fname):
+            # Pickle data is a byte stream and gzip.open() below yields raw
+            # bytes, so open the plain file in binary mode too; text mode
+            # may translate newlines on some platforms.
+            f = open(fname, 'rb')
+        else:
+            gz_name = fname+'.gz'
+            assert os.path.exists(gz_name), (
+                'neither %s nor %s exists' % (fname, gz_name))
+            f = gzip.open(gz_name)
+        d = cPickle.load(f)
+    finally:
+        # f stays None when the file could not be opened.
+        if f is not None:
+            f.close()
+    return d
+
+if __name__ == '__main__':
+    # Usage example and smoke test: load every known dataset and check the
+    # container type and that the three subsets have the same number of
+    # columns.  Runs unattended (no debugger breakpoints).
+    import numpy
+    import scipy.sparse
+
+    for name in ['avicenna','harry','rita','sylvester','ule']:
+        train, valid, test = load_ndarray_dataset(name)
+        for subset in (train, valid, test):
+            assert isinstance(subset, numpy.ndarray)
+        assert train.shape[1]==test.shape[1]==valid.shape[1]
+
+    for name in ['harry','terry','ule']:
+        train, valid, test = load_sparse_dataset(name)
+        for subset in (train, valid, test):
+            assert scipy.sparse.issparse(subset)
+        assert train.shape[1]==test.shape[1]==valid.shape[1]
+