comparison datasets/ftfile.py @ 173:954185d6002a

Take the validation set at the end of the training set files rather than at the beginning.
author Arnaud Bergeron <abergeron@gmail.com>
date Sat, 27 Feb 2010 12:01:08 -0500
parents 4b28d7382dbf
children be714ac9bcbd
comparison
equal deleted inserted replaced
172:4d3d3627df3e 173:954185d6002a
15 15
16 def skip(self, num): 16 def skip(self, num):
17 r""" 17 r"""
18 Skips `num` items in the file. 18 Skips `num` items in the file.
19 19
20 If `num` is negative, skips size + num items (leaving only the last abs(num) items).
21
20 Tests: 22 Tests:
21 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') 23 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
22 >>> f.size 24 >>> f.size
23 58646 25 58646
24 >>> f.elsize 26 >>> f.elsize
28 >>> f.skip(1000) 30 >>> f.skip(1000)
29 >>> f.file.tell() 31 >>> f.file.tell()
30 4020 32 4020
31 >>> f.size 33 >>> f.size
32 57646 34 57646
35 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
36 >>> f.size
37 58646
38 >>> f.file.tell()
39 20
40 >>> f.skip(-1000)
41 >>> f.file.tell()
42 230604
43 >>> f.size
44 1000
33 """ 45 """
46 if num < 0:
47 num += self.size
48 if num < 0:
49 raise ValueError('Skipping past the start of the file')
34 if num >= self.size: 50 if num >= self.size:
35 self.size = 0 51 self.size = 0
36 else: 52 else:
37 self.size -= num 53 self.size -= num
38 f_start = self.file.tell() 54 f_start = self.file.tell()
69 r""" 85 r"""
70 Create a data source from a possible subset of a .ft file. 86 Create a data source from a possible subset of a .ft file.
71 87
72 Parameters: 88 Parameters:
73 `file` (string) -- the filename 89 `file` (string) -- the filename
74 `skip` (int, optional) -- amount of examples to skip from the start of the file 90 `skip` (int, optional) -- amount of examples to skip from
75 `size` (int, optional) -- truncates number of examples read (after skipping) 91 the start of the file. If
92 negative, skips filesize - abs(skip).
93 `size` (int, optional) -- truncates number of examples
94 read (after skipping). If
95 negative, truncates to
96 filesize - abs(size)
97 (also after skipping).
76 98
77 Tests: 99 Tests:
78 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') 100 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
79 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) 101 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000)
80 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) 102 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10)
98 >>> s.open().size 120 >>> s.open().size
99 1000 121 1000
100 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) 122 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1)
101 >>> s.open().size 123 >>> s.open().size
102 1 124 1
125 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10)
126 >>> s.open().size
127 58636
103 """ 128 """
104 f = FTFile(self.file) 129 f = FTFile(self.file)
105 if self.skip != 0: 130 if self.skip != 0:
106 f.skip(self.skip) 131 f.skip(self.skip)
107 if self.size is not None and self.size < f.size: 132 if self.size is not None and self.size < f.size:
108 f.size = self.size 133 if self.size < 0:
134 f.size += self.size
135 else:
136 f.size = self.size
109 return f 137 return f
110 138
111 class FTData(object): 139 class FTData(object):
112 r""" 140 r"""
113 This is a list of FTSources. 141 This is a list of FTSources.
142 set. 170 set.
143 """ 171 """
144 if valid_data is None: 172 if valid_data is None:
145 total_valid_size = sum(FTFile(td).size for td in test_data) 173 total_valid_size = sum(FTFile(td).size for td in test_data)
146 valid_size = total_valid_size/len(train_data) 174 valid_size = total_valid_size/len(train_data)
147 self._train = FTData(train_data, train_lbl, skip=valid_size) 175 self._train = FTData(train_data, train_lbl, size=-valid_size)
148 self._valid = FTData(train_data, train_lbl, size=valid_size) 176 self._valid = FTData(train_data, train_lbl, skip=-valid_size)
149 else: 177 else:
150 self._train = FTData(train_data, train_lbl) 178 self._train = FTData(train_data, train_lbl)
151 self._valid = FTData(valid_data, valid_lbl) 179 self._valid = FTData(valid_data, valid_lbl)
152 self._test = FTData(test_data, test_lbl) 180 self._test = FTData(test_data, test_lbl)
153 181