# HG changeset patch # User Arnaud Bergeron # Date 1267290068 18000 # Node ID 954185d6002a9d86ff41b1940f6ec25f65fb70c8 # Parent 4d3d3627df3e8064e570d2a950812815288f3d38 Take the validation set at the end of the training set files rather than at the beginning. diff -r 4d3d3627df3e -r 954185d6002a datasets/ftfile.py --- a/datasets/ftfile.py Fri Feb 26 15:25:44 2010 -0500 +++ b/datasets/ftfile.py Sat Feb 27 12:01:08 2010 -0500 @@ -17,6 +17,8 @@ r""" Skips `num` items in the file. + If `num` is negative, skips size - abs(num) items. + Tests: >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') >>> f.size @@ -30,7 +32,21 @@ 4020 >>> f.size 57646 + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') + >>> f.size + 58646 + >>> f.file.tell() + 20 + >>> f.skip(-1000) + >>> f.file.tell() + 230604 + >>> f.size + 1000 """ + if num < 0: + num += self.size + if num < 0: + raise ValueError('Skipping past the start of the file') if num >= self.size: self.size = 0 else: @@ -71,8 +87,14 @@ Parameters: `file` (string) -- the filename - `skip` (int, optional) -- amount of examples to skip from the start of the file - `size` (int, optional) -- truncates number of examples read (after skipping) + `skip` (int, optional) -- amount of examples to skip from + the start of the file. If + negative, skips filesize - abs(skip). + `size` (int, optional) -- truncates number of examples + read (after skipping). If + negative, truncates to + filesize - abs(size) + (also after skipping). 
Tests: >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') @@ -100,12 +122,18 @@ >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) >>> s.open().size 1 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) + >>> s.open().size + 58636 """ f = FTFile(self.file) if self.skip != 0: f.skip(self.skip) if self.size is not None and self.size < f.size: - f.size = self.size + if self.size < 0: + f.size += self.size + else: + f.size = self.size return f class FTData(object): @@ -144,8 +172,8 @@ if valid_data is None: total_valid_size = sum(FTFile(td).size for td in test_data) valid_size = total_valid_size/len(train_data) - self._train = FTData(train_data, train_lbl, skip=valid_size) - self._valid = FTData(train_data, train_lbl, size=valid_size) + self._train = FTData(train_data, train_lbl, size=-valid_size) + self._valid = FTData(train_data, train_lbl, skip=-valid_size) else: self._train = FTData(train_data, train_lbl) self._valid = FTData(valid_data, valid_lbl)