Mercurial > ift6266
annotate datasets/ftfile.py @ 168:5e0e5f1860ec
Pipeline code shuffle
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Fri, 26 Feb 2010 14:23:47 -0500 |
parents | 4b28d7382dbf |
children | 954185d6002a |
rev | line source |
---|---|
163
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
1 from pylearn.io.filetensor import _read_header, _prod |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
2 import numpy |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
3 from dataset import DataSet |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
4 from dsetiter import DataIterator |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
5 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
6 class FTFile(object): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
7 def __init__(self, fname): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
8 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
9 Tests: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
10 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
11 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
12 self.file = open(fname, 'rb') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
13 self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
14 self.size = self.dim[0] |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
15 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
16 def skip(self, num): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
17 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
18 Skips `num` items in the file. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
19 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
20 Tests: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
21 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
22 >>> f.size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
23 58646 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
24 >>> f.elsize |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
25 4 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
26 >>> f.file.tell() |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
27 20 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
28 >>> f.skip(1000) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
29 >>> f.file.tell() |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
30 4020 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
31 >>> f.size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
32 57646 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
33 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
34 if num >= self.size: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
35 self.size = 0 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
36 else: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
37 self.size -= num |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
38 f_start = self.file.tell() |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
39 self.file.seek(f_start + (self.elsize * _prod(self.dim[1:]) * num)) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
40 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
41 def read(self, num): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
42 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
43 Reads `num` elements from the file and return the result as a |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
44 numpy matrix. Last read is truncated. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
45 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
46 Tests: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
47 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
48 >>> f.read(1) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
49 array([6], dtype=int32) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
50 >>> f.read(10) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
51 array([7, 4, 7, 5, 6, 4, 8, 0, 9, 6], dtype=int32) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
52 >>> f.skip(58630) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
53 >>> f.read(10) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
54 array([9, 2, 4, 2, 8], dtype=int32) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
55 >>> f.read(10) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
56 array([], dtype=int32) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
57 >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
58 >>> f.read(1) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
59 array([[0, 0, 0, ..., 0, 0, 0]], dtype=uint8) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
60 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
61 if num > self.size: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
62 num = self.size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
63 self.dim[0] = num |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
64 self.size -= num |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
65 return numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
66 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
67 class FTSource(object): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
68 def __init__(self, file, skip=0, size=None): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
69 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
70 Create a data source from a possible subset of a .ft file. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
71 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
72 Parameters: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
73 `file` (string) -- the filename |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
74 `skip` (int, optional) -- amount of examples to skip from the start of the file |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
75 `size` (int, optional) -- truncates number of examples read (after skipping) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
76 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
77 Tests: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
78 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
79 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
80 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
81 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
82 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
83 self.file = file |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
84 self.skip = skip |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
85 self.size = size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
86 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
87 def open(self): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
88 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
89 Returns an FTFile that corresponds to this dataset. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
90 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
91 Tests: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
92 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
93 >>> f = s.open() |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
94 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
95 >>> len(s.open().read(2)) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
96 1 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
97 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
98 >>> s.open().size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
99 1000 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
100 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
101 >>> s.open().size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
102 1 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
103 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
104 f = FTFile(self.file) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
105 if self.skip != 0: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
106 f.skip(self.skip) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
107 if self.size is not None and self.size < f.size: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
108 f.size = self.size |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
109 return f |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
110 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
111 class FTData(object): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
112 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
113 This is a list of FTSources. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
114 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
115 def __init__(self, datafiles, labelfiles, skip=0, size=None): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
116 self.inputs = [FTSource(f, skip, size) for f in datafiles] |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
117 self.outputs = [FTSource(f, skip, size) for f in labelfiles] |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
118 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
119 def open_inputs(self): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
120 return [f.open() for f in self.inputs] |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
121 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
122 def open_outputs(self): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
123 return [f.open() for f in self.outputs] |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
124 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
125 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
126 class FTDataSet(DataSet): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
127 def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
128 r""" |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
129 Defines a DataSet from a bunch of files. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
130 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
131 Parameters: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
132 `train_data` -- list of train data files |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
133 `train_label` -- list of train label files (same length as `train_data`) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
134 `test_data`, `test_labels` -- same thing as train, but for |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
135 test. The number of files |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
136 can differ from train. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
137 `valid_data`, `valid_labels` -- same thing again for validation. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
138 (optional) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
139 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
140 If `valid_data` and `valid_labels` are not supplied then a sample |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
141 approximately equal in size to the test set is taken from the train |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
142 set. |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
143 """ |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
144 if valid_data is None: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
145 total_valid_size = sum(FTFile(td).size for td in test_data) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
146 valid_size = total_valid_size/len(train_data) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
147 self._train = FTData(train_data, train_lbl, skip=valid_size) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
148 self._valid = FTData(train_data, train_lbl, size=valid_size) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
149 else: |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
150 self._train = FTData(train_data, train_lbl) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
151 self._valid = FTData(valid_data, valid_lbl) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
152 self._test = FTData(test_data, test_lbl) |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
153 |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
154 def _return_it(self, batchsize, bufsize, ftdata): |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
155 return zip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), |
4b28d7382dbf
Add inital implementation of datasets.
Arnaud Bergeron <abergeron@gmail.com>
parents:
diff
changeset
|
156 DataIterator(ftdata.open_outputs(), batchsize, bufsize)) |