# HG changeset patch # User James Bergstra # Date 1232835895 18000 # Node ID af14b1f32882ba0a0f6635812145ab741a9e7b21 # Parent 83397981a118c59add2a1ee333432f3d3d029bef revised tzanetakis, added data centering diff -r 83397981a118 -r af14b1f32882 pylearn/datasets/tzanetakis.py --- a/pylearn/datasets/tzanetakis.py Wed Jan 21 16:02:07 2009 -0500 +++ b/pylearn/datasets/tzanetakis.py Sat Jan 24 17:24:55 2009 -0500 @@ -11,48 +11,17 @@ from .config import data_root from .dataset import dataset_factory, Dataset -def head(n=10, path=None): - """Load the first MNIST examples. - - Returns two matrices: x, y. x has N rows of 784 columns. Each row of x represents the - 28x28 grey-scale pixels in raster order. y is a vector of N integers. Each element y[i] - is the label of the i'th row of x. - - """ - path = os.path.join(data_root(), 'mnist','mnist_with_header.amat') if path is None else path - - dat = AMat(path=path, head=n) - - try: - assert dat.input.shape[0] == n - assert dat.target.shape[0] == n - except Exception , e: - raise Exception("failed to read MNIST data", (dat, e)) - - return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0]) - -def all(path=None): - return head(n=None, path=path) - -def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None): - all_x, all_targ = head(ntrain+nvalid+ntest, path=path) - - rval = Dataset() - - rval.train = Dataset.Obj(x=all_x[0:ntrain], - y=all_targ[0:ntrain]) - rval.valid = Dataset.Obj(x=all_x[ntrain:ntrain+nvalid], - y=all_targ[ntrain:ntrain+nvalid]) - rval.test = Dataset.Obj(x=all_x[ntrain+nvalid:ntrain+nvalid+ntest], - y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest]) - - rval.n_classes = 10 - rval.img_shape = (28,28) +def centre_data(x, inplace=False): + rval = x if inplace else x.copy() + #zero-mean + rval -= numpy.mean(rval, axis=0) + #unit-variance + rval *= 1.0 / (1.0e-6 + numpy.std(rval, axis=0)) return rval - def mfcc16(segments_per_song = 1, include_covariance = True, random_split = 0, - ntrain = 700, nvalid = 100, ntest = 200): + ntrain = 700, nvalid = 100, ntest = 200, + normalize=True): if segments_per_song != 1: raise NotImplementedError() @@ -75,11 +44,14 @@ #construct a dataset to return rval = Dataset() - rval.train = Dataset.Obj(x=all_input[0:ntrain], + def prepx(x): + return centre_data(x, inplace=True) if normalize else x + + rval.train = Dataset.Obj(x=prepx(all_input[0:ntrain]), y=all_targ[0:ntrain]) - rval.valid = Dataset.Obj(x=all_input[ntrain:ntrain+nvalid], + rval.valid = Dataset.Obj(x=prepx(all_input[ntrain:ntrain+nvalid]), y=all_targ[ntrain:ntrain+nvalid]) - rval.test = Dataset.Obj(x=all_input[ntrain+nvalid:ntrain+nvalid+ntest], + rval.test = Dataset.Obj(x=prepx(all_input[ntrain+nvalid:ntrain+nvalid+ntest]), y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest]) rval.n_classes = 10 @@ -87,16 +59,3 @@ return rval - - -def mnist_factory(variant="", ntrain=None, nvalid=None, ntest=None): - if variant=="": - return train_valid_test() - elif variant=="1k": - return train_valid_test(ntrain=1000, nvalid=200, ntest=200) - elif variant=="10k": - return train_valid_test(ntrain=10000, nvalid=2000, ntest=2000) - elif variant=="custom": - return train_valid_test(ntrain=ntrain, nvalid=nvalid, ntest=ntest) - else: - raise Exception('Unknown MNIST variant', variant)