annotate datasets/ftfile.py @ 173:954185d6002a

Take the validation set at the end of the training set files rather than at the beginning.
author Arnaud Bergeron <abergeron@gmail.com>
date Sat, 27 Feb 2010 12:01:08 -0500
parents 4b28d7382dbf
children be714ac9bcbd
rev   line source
163
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
1 from pylearn.io.filetensor import _read_header, _prod
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
2 import numpy
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
3 from dataset import DataSet
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
4 from dsetiter import DataIterator
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
5
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
class FTFile(object):
    r"""
    Sequential reader over the items stored in a .ft file.

    Attributes:
      `file` -- the underlying file object, opened in binary mode
      `magic_t`, `elsize`, `dim` -- element type, element size in
                 bytes and dimensions, as returned by `_read_header`
      `size` -- number of items (entries along the first dimension)
                 that can still be skipped or read

    The object can be used as a context manager; the file is closed
    when the `with` block exits.
    """
    def __init__(self, fname):
        r"""
        Opens `fname` and reads its header.

        Tests:
            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
        """
        self.file = open(fname, 'rb')
        try:
            self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False)
        except Exception:
            # Do not leak the handle when the header cannot be parsed.
            self.file.close()
            raise
        self.size = self.dim[0]

    def close(self):
        r"""
        Closes the underlying file.
        """
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the `with` block.
        return False

    def skip(self, num):
        r"""
        Skips `num` items in the file.

        If `num` is negative, skips `size + num` items instead, so
        that exactly `-num` items remain to be read.  Skipping more
        items than remain just leaves the file with nothing to read.

        Raises ValueError if a negative `num` reaches back further
        than the number of remaining items.

        Tests:
            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
            >>> f.size
            58646
            >>> f.elsize
            4
            >>> f.file.tell()
            20
            >>> f.skip(1000)
            >>> f.file.tell()
            4020
            >>> f.size
            57646
            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
            >>> f.size
            58646
            >>> f.file.tell()
            20
            >>> f.skip(-1000)
            >>> f.file.tell()
            230604
            >>> f.size
            1000
        """
        if num < 0:
            num += self.size
            if num < 0:
                raise ValueError('Skipping past the start of the file')
        # Clamp so the file position never moves beyond the items this
        # object still exposes, mirroring the truncation done by read().
        if num > self.size:
            num = self.size
        self.size -= num
        # Work with plain ints: `dim` (and therefore `size`/`num`) may
        # hold fixed-width numpy integers -- TODO confirm -- and the byte
        # offset of a large file must not wrap around.
        item_bytes = int(self.elsize) * int(_prod(self.dim[1:]))
        f_start = self.file.tell()
        self.file.seek(f_start + item_bytes * int(num))

    def read(self, num):
        r"""
        Reads `num` items from the file and returns the result as a
        numpy array whose first dimension is the number of items
        actually read.  Last read is truncated.

        Raises ValueError if `num` is negative.

        Tests:
            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
            >>> f.read(1)
            array([6], dtype=int32)
            >>> f.read(10)
            array([7, 4, 7, 5, 6, 4, 8, 0, 9, 6], dtype=int32)
            >>> f.skip(58630)
            >>> f.read(10)
            array([9, 2, 4, 2, 8], dtype=int32)
            >>> f.read(10)
            array([], dtype=int32)
            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
            >>> f.read(1)
            array([[0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
        """
        if num < 0:
            # A negative count would grow `size` and produce a bogus
            # element count for numpy.fromfile.
            raise ValueError('Cannot read a negative number of items')
        if num > self.size:
            num = self.size
        # Build the shape locally rather than overwriting self.dim[0],
        # so `dim` keeps describing the file header.
        shape = (int(num),) + tuple(int(d) for d in self.dim[1:])
        self.size -= num
        return numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(shape)).reshape(shape)
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
82
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
class FTSource(object):
    def __init__(self, file, skip=0, size=None):
        r"""
        Create a data source from a possible subset of a .ft file.

        Parameters:
          `file` (string) -- the filename
          `skip` (int, optional) -- amount of examples to skip from
                                    the start of the file. If
                                    negative, skips everything but
                                    the last `-skip` examples.
          `size` (int, optional) -- truncates number of examples
                                    read (after skipping). If
                                    negative, drops the last `-size`
                                    examples instead (also after
                                    skipping).

        Tests:
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000)
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10)
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120)
        """
        self.file = file
        self.skip = skip
        self.size = size

    @staticmethod
    def _truncated_size(available, requested):
        r"""
        Returns how many of `available` examples stay readable once
        the `requested` size is applied.

        `None`, or a value that is not smaller than `available`,
        leaves `available` unchanged.  A negative value removes that
        many examples, never going below zero.

        Tests:
            >>> FTSource._truncated_size(100, None)
            100
            >>> FTSource._truncated_size(100, 5)
            5
            >>> FTSource._truncated_size(100, -10)
            90
            >>> FTSource._truncated_size(100, -200)
            0
        """
        if requested is None or requested >= available:
            return available
        if requested < 0:
            # Asking to drop more than is available must yield an empty
            # source, not a negative size that would corrupt later reads.
            return max(0, available + requested)
        return requested

    def open(self):
        r"""
        Returns an FTFile that corresponds to this dataset.

        Tests:
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
            >>> f = s.open()
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1)
            >>> len(s.open().read(2))
            1
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646)
            >>> s.open().size
            1000
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1)
            >>> s.open().size
            1
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10)
            >>> s.open().size
            58636
        """
        f = FTFile(self.file)
        if self.skip != 0:
            f.skip(self.skip)
        # The size limit applies to what is left after skipping.
        f.size = self._truncated_size(f.size, self.size)
        return f
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
138
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
class FTData(object):
    r"""
    Two parallel lists of FTSources: `inputs`, built from the data
    files, and `outputs`, built from the label files.  The same
    `skip` and `size` are applied to every source.
    """
    def __init__(self, datafiles, labelfiles, skip=0, size=None):
        def make_sources(filenames):
            # One FTSource per file, all sharing the same subset bounds.
            return [FTSource(name, skip, size) for name in filenames]

        self.inputs = make_sources(datafiles)
        self.outputs = make_sources(labelfiles)

    def open_inputs(self):
        r"""
        Opens every input source and returns the results as a list.
        """
        opened = []
        for source in self.inputs:
            opened.append(source.open())
        return opened

    def open_outputs(self):
        r"""
        Opens every output source and returns the results as a list.
        """
        opened = []
        for source in self.outputs:
            opened.append(source.open())
        return opened
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
152
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
153
4b28d7382dbf Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff changeset
class FTDataSet(DataSet):
    def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None):
        r"""
        Defines a DataSet from a bunch of files.

        Parameters:
          `train_data` -- list of train data files
          `train_lbl` -- list of train label files (same length as `train_data`)
          `test_data`, `test_lbl` -- same thing as train, but for
                                     test.  The number of files
                                     can differ from train.
          `valid_data`, `valid_lbl` -- same thing again for validation.
                                       (optional)

        If `valid_data` is not supplied then a sample approximately
        equal in size to the test set is taken from the end of the
        train set files: each train file gives up its last
        `total test size // number of train files` examples.
        """
        if valid_data is None:
            total_valid_size = sum(self._count_items(td) for td in test_data)
            # Floor division: the result is used as an item count and
            # offset, so it must stay an integer on every Python version.
            valid_size = total_valid_size // len(train_data)
            if valid_size > 0:
                self._train = FTData(train_data, train_lbl, size=-valid_size)
                self._valid = FTData(train_data, train_lbl, skip=-valid_size)
            else:
                # -0 == 0 would be read as "size 0" / "skip nothing",
                # i.e. an empty train set and a validation set made of
                # the whole train set.  Spell out the intended split.
                self._train = FTData(train_data, train_lbl)
                self._valid = FTData(train_data, train_lbl, size=0)
        else:
            self._train = FTData(train_data, train_lbl)
            self._valid = FTData(valid_data, valid_lbl)
        self._test = FTData(test_data, test_lbl)

    @staticmethod
    def _count_items(fname):
        r"""
        Returns the number of items in the .ft file `fname`, closing
        the file once its header has been read.
        """
        f = FTFile(fname)
        try:
            return f.size
        finally:
            f.file.close()

    def _return_it(self, batchsize, bufsize, ftdata):
        r"""
        Pairs a DataIterator over the opened inputs of `ftdata` with
        one over its opened outputs.
        """
        # NOTE(review): under Python 2 `zip` builds the full list of
        # pairs up front; presumably a lazy pairing is wanted here --
        # confirm how the DataSet base class consumes this value.
        return zip(DataIterator(ftdata.open_inputs(), batchsize, bufsize),
                   DataIterator(ftdata.open_outputs(), batchsize, bufsize))