# HG changeset patch
# User Frederic Bastien
# Date 1295647546 18000
# Node ID b14f3d6f5cd44e9cda24fa01637af48c18319cff
# Parent e06c0ff46d2a6feedd1b9e6b226ab067fa6617b5
First version of a script to load the UTLC datasets.

diff -r e06c0ff46d2a -r b14f3d6f5cd4 pylearn/datasets/utlc.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/utlc.py	Fri Jan 21 17:05:46 2011 -0500
@@ -0,0 +1,85 @@
+"""
+Users should use the load_ndarray_dataset or load_sparse_dataset functions.
+See the file PYLEARN_DB_PATH/UTLC/README for details on the datasets.
+See the end of this file for an example of how to load them.
+"""
+
+import cPickle
+import gzip
+import os
+
+import pylearn.io.filetensor as ft
+import config
+
+def load_ndarray_dataset(name):
+    assert name in ['avicenna','harry','rita','sylvester','ule']
+    trname,vname,tename = [os.path.join(config.data_root(),
+                                        'UTLC','filetensor',
+                                        name+'_'+subset+'.ft')
+                           for subset in ['train','valid','test']]
+    train = load_filetensor(trname)
+    valid = load_filetensor(vname)
+    test = load_filetensor(tename)
+    return train, valid, test
+
+def load_sparse_dataset(name):
+    assert name in ['harry','terry','ule']
+    trname,vname,tename = [os.path.join(config.data_root(),
+                                        'UTLC','sparse',
+                                        name+'_'+subset+'.npy')
+                           for subset in ['train','valid','test']]
+    train = load_sparse(trname)
+    valid = load_sparse(vname)
+    test = load_sparse(tename)
+    return train, valid, test
+
+def load_filetensor(fname):
+    f = None
+    try:
+        # Fall back to a gzip-compressed copy when the plain file is absent.
+        if not os.path.exists(fname):
+            fname = fname+'.gz'
+            assert os.path.exists(fname)
+            f = gzip.open(fname)
+        else:
+            f = open(fname)
+        d = ft.read(f)
+    finally:
+        if f:
+            f.close()
+
+    return d
+
+def load_sparse(fname):
+    f = None
+    try:
+        # Fall back to a gzip-compressed copy when the plain file is absent.
+        if not os.path.exists(fname):
+            fname = fname+'.gz'
+            assert os.path.exists(fname)
+            f = gzip.open(fname)
+        else:
+            f = open(fname)
+        d = cPickle.load(f)
+    finally:
+        if f:
+            f.close()
+    return d
+
+if __name__ == '__main__':
+    import numpy
+    import scipy.sparse
+    for name in ['avicenna','harry','rita','sylvester','ule']:
+        train, valid, test = load_ndarray_dataset(name)
+        assert isinstance(train, numpy.ndarray)
+        assert isinstance(valid, numpy.ndarray)
+        assert isinstance(test, numpy.ndarray)
+        assert train.shape[1]==test.shape[1]==valid.shape[1]
+
+    for name in ['harry','terry','ule']:
+        train, valid, test = load_sparse_dataset(name)
+        assert scipy.sparse.issparse(train)
+        assert scipy.sparse.issparse(valid)
+        assert scipy.sparse.issparse(test)
+        assert train.shape[1]==test.shape[1]==valid.shape[1]
+
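
For reference, a minimal usage sketch of the new module (assuming the UTLC
data files are installed under config.data_root() as the module docstring
describes; 'ule' is one of the names accepted by both loaders):

    import numpy
    import scipy.sparse

    import pylearn.datasets.utlc as utlc

    # Dense datasets come back as numpy ndarrays.
    train, valid, test = utlc.load_ndarray_dataset('ule')
    assert isinstance(train, numpy.ndarray)
    print train.shape, valid.shape, test.shape

    # Sparse datasets come back as scipy.sparse matrices.
    train, valid, test = utlc.load_sparse_dataset('ule')
    assert scipy.sparse.issparse(train)
    print train.shape, valid.shape, test.shape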