# HG changeset patch # User Arnaud Bergeron # Date 1267307416 18000 # Node ID 76bc047df5ee6be9a03cc4eac814bd37be49a951 # Parent defd388aba0cab1e681abee3e225c0a62a5e179d Add dtype conversion and rescaling to the read path. diff -r defd388aba0c -r 76bc047df5ee datasets/defs.py --- a/datasets/defs.py Sat Feb 27 16:07:09 2010 -0500 +++ b/datasets/defs.py Sat Feb 27 16:50:16 2010 -0500 @@ -1,6 +1,7 @@ __all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all'] from ftfile import FTDataSet +import theano NIST_PATH = '/data/lisa/data/nist/by_class/' DATA_PATH = '/data/lisa/data/ift6266h10/' @@ -8,24 +9,26 @@ nist_digits = FTDataSet(train_data = [NIST_PATH+'digits/digits_train_data.ft'], train_lbl = [NIST_PATH+'digits/digits_train_labels.ft'], test_data = [NIST_PATH+'digits/digits_test_data.ft'], - test_lbl = [NIST_PATH+'digits/digits_test_labels.ft']) + test_lbl = [NIST_PATH+'digits/digits_test_labels.ft'], + indtype=theano.config.floatX, inscale=255.) nist_lower = FTDataSet(train_data = [NIST_PATH+'lower/lower_train_data.ft'], train_lbl = [NIST_PATH+'lower/lower_train_labels.ft'], test_data = [NIST_PATH+'lower/lower_test_data.ft'], - test_lbl = [NIST_PATH+'lower/lower_test_labels.ft']) + test_lbl = [NIST_PATH+'lower/lower_test_labels.ft'], + indtype=theano.config.floatX, inscale=255.) nist_upper = FTDataSet(train_data = [NIST_PATH+'upper/upper_train_data.ft'], train_lbl = [NIST_PATH+'upper/upper_train_labels.ft'], test_data = [NIST_PATH+'upper/upper_test_data.ft'], - test_lbl = [NIST_PATH+'upper/upper_test_labels.ft']) - - + test_lbl = [NIST_PATH+'upper/upper_test_labels.ft'], + indtype=theano.config.floatX, inscale=255.) nist_all = FTDataSet(train_data = [DATA_PATH+'train_data.ft'], train_lbl = [DATA_PATH+'train_labels.ft'], test_data = [DATA_PATH+'test_data.ft'], test_lbl = [DATA_PATH+'test_labels.ft'], valid_data = [DATA_PATH+'valid_data.ft'], - valid_lbl = [DATA_PATH+'valid_labels.ft']) + valid_lbl = [DATA_PATH+'valid_labels.ft'], + indtype=theano.config.floatX, inscale=255.) ocr = FTDataSet(train_data = [DATA_PATH+'ocr_train_data.ft'], train_lbl = [DATA_PATH+'ocr_train_labels.ft'], diff -r defd388aba0c -r 76bc047df5ee datasets/dsetiter.py --- a/datasets/dsetiter.py Sat Feb 27 16:07:09 2010 -0500 +++ b/datasets/dsetiter.py Sat Feb 27 16:50:16 2010 -0500 @@ -1,4 +1,4 @@ -import numpy, theano +import numpy class DummyFile(object): def __init__(self, size): @@ -123,7 +123,7 @@ tmpbuf = self.curfile.read(self.bufsize - len(buf)) buf = numpy.row_stack((buf, tmpbuf)) - self.buffer = numpy.asarray(buf, dtype=theano.config.floatX) + self.buffer = buf self.curpos = 0 def __next__(self): diff -r defd388aba0c -r 76bc047df5ee datasets/ftfile.py --- a/datasets/ftfile.py Sat Feb 27 16:07:09 2010 -0500 +++ b/datasets/ftfile.py Sat Feb 27 16:50:16 2010 -0500 @@ -5,7 +5,7 @@ from itertools import izip, imap class FTFile(object): - def __init__(self, fname): + def __init__(self, fname, scale=1, dtype=None): r""" Tests: >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') @@ -13,6 +13,8 @@ self.file = open(fname, 'rb') self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False) self.size = self.dim[0] + self.scale = scale + self.dtype = dtype def skip(self, num): r""" @@ -79,10 +81,15 @@ num = self.size self.dim[0] = num self.size -= num - return numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim) + res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim) + if self.dtype is not None: + res = res.astype(self.dtype) + if self.scale != 1: + res /= self.scale + return res class FTSource(object): - def __init__(self, file, skip=0, size=None): + def __init__(self, file, skip=0, size=None, dtype=None, scale=1): r""" Create a data source from a possible subset of a .ft file. @@ -96,7 +103,12 @@ negative truncates to filesize - size (also after skipping). - + `dtype` (dtype, optional) -- convert the data to this + dtype after reading. + `scale` (number, optional) -- scale (that is divide) the + data by this number (after + dtype conversion, if any). + Tests: >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) @@ -106,6 +118,8 @@ self.file = file self.skip = skip self.size = size + self.dtype = dtype + self.scale = scale def open(self): r""" @@ -127,7 +141,7 @@ >>> s.open().size 58636 """ - f = FTFile(self.file) + f = FTFile(self.file, scale=self.scale, dtype=self.dtype) if self.skip != 0: f.skip(self.skip) if self.size is not None and self.size < f.size: @@ -141,9 +155,12 @@ r""" This is a list of FTSources. """ - def __init__(self, datafiles, labelfiles, skip=0, size=None): - self.inputs = [FTSource(f, skip, size) for f in datafiles] - self.outputs = [FTSource(f, skip, size) for f in labelfiles] + def __init__(self, datafiles, labelfiles, skip=0, size=None, + inscale=1, indtype=None, outscale=1, outdtype=None): + self.inputs = [FTSource(f, skip, size, scale=inscale, dtype=indtype) + for f in datafiles] + self.outputs = [FTSource(f, skip, size, scale=outscale, dtype=outdtype) + for f in labelfiles] def open_inputs(self): return [f.open() for f in self.inputs] @@ -153,7 +170,7 @@ class FTDataSet(DataSet): - def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None): + def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None, indtype=None, outdtype=None, inscale=1, outscale=1): r""" Defines a DataSet from a bunch of files. @@ -165,6 +182,9 @@ can differ from train. `valid_data`, `valid_labels` -- same thing again for validation. (optional) + `indtype`, `outdtype`, -- see FTSource.__init__() + `inscale`, `outscale` (optional) + If `valid_data` and `valid_labels` are not supplied then a sample approximately equal in size to the test set is taken from the train