Mercurial > ift6266
comparison datasets/ftfile.py @ 173:954185d6002a
Take the validation set at the end of the training set files rather than at the beginning.
author | Arnaud Bergeron <abergeron@gmail.com> |
---|---|
date | Sat, 27 Feb 2010 12:01:08 -0500 |
parents | 4b28d7382dbf |
children | be714ac9bcbd |
comparison
equal
deleted
inserted
replaced
172:4d3d3627df3e | 173:954185d6002a |
---|---|
15 | 15 |
16 def skip(self, num): | 16 def skip(self, num): |
17 r""" | 17 r""" |
18 Skips `num` items in the file. | 18 Skips `num` items in the file. |
19 | 19 |
20 If `num` is negative, skips size - abs(num) items (leaving the last abs(num)). | |
21 | |
20 Tests: | 22 Tests: |
21 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') | 23 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') |
22 >>> f.size | 24 >>> f.size |
23 58646 | 25 58646 |
24 >>> f.elsize | 26 >>> f.elsize |
28 >>> f.skip(1000) | 30 >>> f.skip(1000) |
29 >>> f.file.tell() | 31 >>> f.file.tell() |
30 4020 | 32 4020 |
31 >>> f.size | 33 >>> f.size |
32 57646 | 34 57646 |
35 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') | |
36 >>> f.size | |
37 58646 | |
38 >>> f.file.tell() | |
39 20 | |
40 >>> f.skip(-1000) | |
41 >>> f.file.tell() | |
42 230604 | |
43 >>> f.size | |
44 1000 | |
33 """ | 45 """ |
46 if num < 0: | |
47 num += self.size | |
48 if num < 0: | |
49 raise ValueError('Skipping past the start of the file') | |
34 if num >= self.size: | 50 if num >= self.size: |
35 self.size = 0 | 51 self.size = 0 |
36 else: | 52 else: |
37 self.size -= num | 53 self.size -= num |
38 f_start = self.file.tell() | 54 f_start = self.file.tell() |
69 r""" | 85 r""" |
70 Create a data source from a possible subset of a .ft file. | 86 Create a data source from a possible subset of a .ft file. |
71 | 87 |
72 Parameters: | 88 Parameters: |
73 `file` (string) -- the filename | 89 `file` (string) -- the filename |
74 `skip` (int, optional) -- amount of examples to skip from the start of the file | 90 `skip` (int, optional) -- amount of examples to skip from |
75 `size` (int, optional) -- truncates number of examples read (after skipping) | 91 the start of the file. If |
92 negative, skips filesize - abs(skip). | |
93 `size` (int, optional) -- truncates number of examples | |
94 read (after skipping). If | |
95 negative, truncates to | |
96 filesize - abs(size) | |
97 (also after skipping). | |
76 | 98 |
77 Tests: | 99 Tests: |
78 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') | 100 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
79 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) | 101 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) |
80 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) | 102 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) |
98 >>> s.open().size | 120 >>> s.open().size |
99 1000 | 121 1000 |
100 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) | 122 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) |
101 >>> s.open().size | 123 >>> s.open().size |
102 1 | 124 1 |
125 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) | |
126 >>> s.open().size | |
127 58636 | |
103 """ | 128 """ |
104 f = FTFile(self.file) | 129 f = FTFile(self.file) |
105 if self.skip != 0: | 130 if self.skip != 0: |
106 f.skip(self.skip) | 131 f.skip(self.skip) |
107 if self.size is not None and self.size < f.size: | 132 if self.size is not None and self.size < f.size: |
108 f.size = self.size | 133 if self.size < 0: |
134 f.size += self.size | |
135 else: | |
136 f.size = self.size | |
109 return f | 137 return f |
110 | 138 |
111 class FTData(object): | 139 class FTData(object): |
112 r""" | 140 r""" |
113 This is a list of FTSources. | 141 This is a list of FTSources. |
142 set. | 170 set. |
143 """ | 171 """ |
144 if valid_data is None: | 172 if valid_data is None: |
145 total_valid_size = sum(FTFile(td).size for td in test_data) | 173 total_valid_size = sum(FTFile(td).size for td in test_data) |
146 valid_size = total_valid_size/len(train_data) | 174 valid_size = total_valid_size/len(train_data) |
147 self._train = FTData(train_data, train_lbl, skip=valid_size) | 175 self._train = FTData(train_data, train_lbl, size=-valid_size) |
148 self._valid = FTData(train_data, train_lbl, size=valid_size) | 176 self._valid = FTData(train_data, train_lbl, skip=-valid_size) |
149 else: | 177 else: |
150 self._train = FTData(train_data, train_lbl) | 178 self._train = FTData(train_data, train_lbl) |
151 self._valid = FTData(valid_data, valid_lbl) | 179 self._valid = FTData(valid_data, valid_lbl) |
152 self._test = FTData(test_data, test_lbl) | 180 self._test = FTData(test_data, test_lbl) |
153 | 181 |