changeset 173:954185d6002a

Take the validation set at the end of the training set files rather than at the beginning.
author Arnaud Bergeron <abergeron@gmail.com>
date Sat, 27 Feb 2010 12:01:08 -0500
parents 4d3d3627df3e
children ff26436d42d6
files datasets/ftfile.py
diffstat 1 files changed, 33 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/datasets/ftfile.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/datasets/ftfile.py	Sat Feb 27 12:01:08 2010 -0500
@@ -17,6 +17,8 @@
         r"""
         Skips `num` items in the file.
 
+        If `num` is negative, skips `size + num` items, leaving only the last `-num` items.
+
         Tests:
             >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
             >>> f.size
@@ -30,7 +32,21 @@
             4020
             >>> f.size
             57646
+            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
+            >>> f.size
+            58646
+            >>> f.file.tell()
+            20
+            >>> f.skip(-1000)
+            >>> f.file.tell()
+            230604
+            >>> f.size
+            1000
         """
+        if num < 0:
+            num += self.size
+        if num < 0:
+            raise ValueError('Skipping past the start of the file')
         if num >= self.size:
             self.size = 0
         else:
@@ -71,8 +87,14 @@
 
         Parameters:
             `file` (string) -- the filename
-            `skip` (int, optional) -- amount of examples to skip from the start of the file
-            `size` (int, optional) -- truncates number of examples read (after skipping)
+            `skip` (int, optional) -- amount of examples to skip from
+                                      the start of the file.  If
+                                      negative, skips filesize + skip.
+            `size` (int, optional) -- truncates number of examples
+                                      read (after skipping).  If
+                                      negative, drops the last
+                                      -size remaining examples
+                                      (also after skipping).
         
         Tests:
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
@@ -100,12 +122,18 @@
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1)
            >>> s.open().size
            1
+           >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10)
+           >>> s.open().size
+           58636
         """
         f = FTFile(self.file)
         if self.skip != 0:
             f.skip(self.skip)
         if self.size is not None and self.size < f.size:
-            f.size = self.size
+            if self.size < 0:
+                f.size += self.size
+            else:
+                f.size = self.size
         return f
 
 class FTData(object):
@@ -144,8 +172,8 @@
         if valid_data is None:
             total_valid_size = sum(FTFile(td).size for td in test_data)
             valid_size = total_valid_size/len(train_data)
-            self._train = FTData(train_data, train_lbl, skip=valid_size)
-            self._valid = FTData(train_data, train_lbl, size=valid_size)
+            self._train = FTData(train_data, train_lbl, size=-valid_size)
+            self._valid = FTData(train_data, train_lbl, skip=-valid_size)
         else:
             self._train = FTData(train_data, train_lbl)
             self._valid = FTData(valid_data, valid_lbl)