Mercurial > pylearn
changeset 1402:b14f3d6f5cd4
First version of a script to load the UTLC datasets.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Fri, 21 Jan 2011 17:05:46 -0500 |
parents | e06c0ff46d2a |
children | 6ade5b39b773 |
files | pylearn/datasets/utlc.py |
diffstat | 1 files changed, 85 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""
Users should use the load_ndarray_dataset or load_sparse_dataset function.
See the file PYLEARN_DB_PATH/UTLC/README for details on the datasets.
See the end of this file for an example on how to load the files.
"""

import cPickle
import gzip
import os

import pylearn.io.filetensor as ft
import config


def _open_dataset_file(fname):
    """Open *fname*, falling back to a gzip-compressed ``fname + '.gz'``.

    Returns an open file-like object (plain or gzip); the caller is
    responsible for closing it.

    :raises ValueError: if neither ``fname`` nor ``fname + '.gz'`` exists.
    """
    if os.path.exists(fname):
        return open(fname)
    gz_fname = fname + '.gz'
    if not os.path.exists(gz_fname):
        # Raise instead of assert: asserts are stripped under `python -O`.
        raise ValueError("Cannot find dataset file %s (or %s)"
                         % (fname, gz_fname))
    return gzip.open(gz_fname)


def load_filetensor(fname):
    """Read and return the contents of a filetensor file *fname*.

    Transparently falls back to ``fname + '.gz'`` when the plain file is
    absent.  The file is always closed, even if reading fails.
    """
    f = _open_dataset_file(fname)
    try:
        return ft.read(f)
    finally:
        f.close()


def load_sparse(fname):
    """Unpickle and return the sparse matrix stored in *fname*.

    Transparently falls back to ``fname + '.gz'`` when the plain file is
    absent.  The file is always closed, even if reading fails.
    """
    f = _open_dataset_file(fname)
    try:
        return cPickle.load(f)
    finally:
        f.close()


def load_ndarray_dataset(name):
    """Return the ``(train, valid, test)`` ndarrays of UTLC dataset *name*.

    :param name: one of 'avicenna', 'harry', 'rita', 'sylvester', 'ule'.

    The splits are read from
    ``<data_root>/UTLC/filetensor/<name>_{train,valid,test}.ft``.
    """
    assert name in ['avicenna', 'harry', 'rita', 'sylvester', 'ule']
    trname, vname, tename = [os.path.join(config.data_root(),
                                          'UTLC', 'filetensor',
                                          name + '_' + subset + '.ft')
                             for subset in ['train', 'valid', 'test']]
    train = load_filetensor(trname)
    valid = load_filetensor(vname)
    test = load_filetensor(tename)
    return train, valid, test


def load_sparse_dataset(name):
    """Return the ``(train, valid, test)`` sparse matrices of UTLC dataset *name*.

    :param name: one of 'harry', 'terry', 'ule'.

    The splits are read from
    ``<data_root>/UTLC/sparse/<name>_{train,valid,test}.npy``.
    """
    assert name in ['harry', 'terry', 'ule']
    trname, vname, tename = [os.path.join(config.data_root(),
                                          'UTLC', 'sparse',
                                          name + '_' + subset + '.npy')
                             for subset in ['train', 'valid', 'test']]
    train = load_sparse(trname)
    valid = load_sparse(vname)
    test = load_sparse(tename)
    return train, valid, test


if __name__ == '__main__':
    # Smoke test: load every dataset and check basic invariants.
    # (Removed leftover `import pdb;pdb.set_trace()` debugger calls that
    # halted the script at an interactive prompt on every run.)
    import numpy
    import scipy.sparse
    for name in ['avicenna', 'harry', 'rita', 'sylvester', 'ule']:
        train, valid, test = load_ndarray_dataset(name)
        assert isinstance(train, numpy.ndarray)
        assert isinstance(valid, numpy.ndarray)
        assert isinstance(test, numpy.ndarray)
        # All splits of a dataset must share the same number of features.
        assert train.shape[1] == test.shape[1] == valid.shape[1]

    for name in ['harry', 'terry', 'ule']:
        train, valid, test = load_sparse_dataset(name)
        assert scipy.sparse.issparse(train)
        assert scipy.sparse.issparse(valid)
        assert scipy.sparse.issparse(test)
        assert train.shape[1] == test.shape[1] == valid.shape[1]