annotate dataset.py @ 531:90a76a8238e8

Added function length()
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 18 Nov 2008 00:32:39 -0500
parents fb62f0e4bcfe
children
rev   line source
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
1
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
2 from lookup_list import LookupList as Example
356
18702ceb2096 Added more functions
Joseph Turian <turian@iro.umontreal.ca>
parents: 354
diff changeset
3 from common.misc import unique_elements_list_intersection
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
4 from string import join
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
5 from sys import maxint
171
895b4b60f5e8 bugfix. Otherwise the example was writed over and not a new one was returned
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 167
diff changeset
6 import numpy, copy
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
7
166
ee11ed427ba8 Created exceptions.py
Joseph Turian <turian@gmail.com>
parents: 163
diff changeset
8 from exceptions import *
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
9
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
10 class AttributesHolder(object):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
11 def __init__(self): pass
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
12
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
13 def attributeNames(self):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
14 raise AbstractFunction()
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
15
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
16 def setAttributes(self,attribute_names,attribute_values,make_copies=False):
134
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
17 """
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
18 Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1.
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
19 """
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
20 if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ):
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
21 attribute_values = [attribute_values]
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
22 if make_copies:
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
23 for name,value in zip(attribute_names,attribute_values):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
24 self.__setattr__(name,copy.deepcopy(value))
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
25 else:
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
26 for name,value in zip(attribute_names,attribute_values):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
27 self.__setattr__(name,value)
193
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
28
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
29 def getAttributes(self,attribute_names=None, return_copy=False):
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
30 """
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
31 Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes.
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
32 """
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
33 if attribute_names is None:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
34 attribute_names = self.attributeNames()
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
35 if return_copy:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
36 return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
37 else:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
38 return [self.__getattribute__(name) for name in attribute_names]
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
39
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
40 class DataSet(AttributesHolder):
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
41 """A virtual base class for datasets.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
42
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
43 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
44 with learning algorithms (for training and testing them): rows/records are called examples, and
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
45 columns/attributes are called fields. The field value for a particular example can be an arbitrary
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
46 python object, which depends on the particular dataset.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
47
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
48 We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
49 should return sys.maxint).
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
50
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
51 A DataSet is a generator of iterators; these iterators can run through the
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
52 examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
53 or known length, so this class can be used to interface to a 'stream' which
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
54 feeds on-line learning (however, as noted below, some operations are not
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
55 feasible or not recommended on streams).
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
56
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
57 To iterate over examples, there are several possibilities:
90
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
58 - for example in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
59 - for val1,val2,... in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
60 - for example in dataset(field1, field2,field3, ...):
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
61 - for val1,val2,val3 in dataset(field1, field2,field3):
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
62 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
63 - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
64 Each of these is documented below. All of these iterators are expected
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
65 to provide, in addition to the usual 'next()' method, a 'next_index()' method
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
66 which returns a non-negative integer pointing to the position of the next
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
67 example that will be returned by 'next()' (or of the first example in the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
68 next minibatch returned). This is important because these iterators
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
69 can wrap around the dataset in order to do multiple passes through it,
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
70 in possibly irregular ways if the minibatch size is not a divisor of the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
71 dataset length.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
72
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
73 To iterate over fields, one can do
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
74 - for field in dataset.fields():
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
75 for field_value in field: # iterate over the values associated to that field for all the dataset examples
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
76 - for field in dataset(field1,field2,...).fields() to select a subset of fields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
77 - for field in dataset.fields(field1,field2,...) to select a subset of fields
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
78 and each of these fields is iterable over the examples:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
79 - for field_examples in dataset.fields():
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
80 for example_value in field_examples:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
81 ...
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
82 but when the dataset is a stream (unbounded length), it is not recommended to do
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
83 such things because the underlying dataset may refuse to access the different fields in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
84 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
132
f6505ec32dc3 Updated documentation slightly
Joseph Turian <turian@gmail.com>
parents: 128
diff changeset
85 The result of fields() is a L{DataSetFields} object, which iterates over fields,
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
86 and whose elements are iterable over examples. A DataSetFields object can
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
87 be turned back into a DataSet with its examples() method::
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
88 dataset2 = dataset1.fields().examples()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
89 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
90
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
91 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
92
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
93 Note: The content of a field can be of any type. Field values can also be 'missing'
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
94 (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
95 fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value.
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
96 What about non-numeric values? None.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
97
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
98 Dataset elements can be indexed and sub-datasets (with a subset
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
99 of examples) can be extracted. These operations are not supported
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
100 by default in the case of streams.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
101
317
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
102 - dataset[:n] returns an Example with the first n examples.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
103
317
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
104 - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
105
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
106 - dataset[i] returns an Example.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
107
378
835830e52b42 fixing merge
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 356
diff changeset
108 - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in.
317
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
109
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
110 A similar command gives you a DataSet instead of Examples :
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
111
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
112 - dataset.subset[:n] returns a DataSet with the first n examples.
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
113
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
114 - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s.
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
115
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
116 - dataset.subset[i] returns a DataSet.
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
117
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
118 - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in.
14081904d8f3 doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 316
diff changeset
119
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
120
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
121 - dataset.<property> returns the value of a property associated with
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
122 the name <property>. The following properties should be supported:
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
123 - 'description': a textual description or name for the dataset
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
124 - 'fieldtypes': a list of types (one per field)
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
125 A DataSet may have other attributes that it makes visible to other objects. These are
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
126 used to store information that is not example-wise but global to the dataset.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
127 The list of names of these attributes is given by attributeNames.
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
128
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
129 Datasets can be concatenated either vertically (increasing the length) or
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
130 horizontally (augmenting the set of fields), if they are compatible, using
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
131 the following operations (with the same basic semantics as numpy.hstack
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
132 and numpy.vstack):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
133
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
134 - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
135
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
136 creates a new dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
137 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
138
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
139 - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
140
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
141 creates a new dataset that concatenates the examples from the argument datasets
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
142 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
143 works if they all have the same fields.
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
144
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
145 According to the same logic, and viewing a DataSetFields object associated to
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
146 a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
147 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
148 examples.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
149
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
150 A dataset can hold arbitrary key-value pairs that may be used to access meta-data
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
151 or other properties of the dataset or associated with the dataset or the result
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
152 of a computation stored in a dataset. These can be accessed through the [key] syntax
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
153 when key is a string (or more specifically, neither an integer, a slice, nor a list).
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
154
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
155 A DataSet sub-class should always redefine the following methods:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
156 - __len__ if it is not a stream
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
157 - fieldNames
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
158 - minibatches_nowrap (called by DataSet.minibatches())
269
fdce496c3b56 deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 268
diff changeset
159 For efficiency of implementation, a sub-class might also want to redefine
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
160 - valuesHStack
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
161 - valuesVStack
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
162 - hasFields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
163 - __getitem__ may not be feasible with some streams
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
164 - __iter__
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
165 A sub-class should also append attributes to self._attribute_names
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
166 (the default value returned by attributeNames()).
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
167 By convention, attributes not in attributeNames() should have a name
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
168 starting with an underscore.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
169 @todo enforce/test that convention!
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
170 """
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
171
83
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 82
diff changeset
172 numpy_vstack = lambda fieldname,values: numpy.vstack(values)
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 82
diff changeset
173 numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
77
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
174
292
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
175 def __init__(self, description=None, fieldnames=None, fieldtypes=None):
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
176 """
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
177 @type fieldnames: list of strings
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
178 @type fieldtypes: list of python types, same length as fieldnames
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
179 @type description: string
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
180 @param description: description/name for this dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
181 """
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
182 def default_desc():
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
183 return type(self).__name__ \
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
184 + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
185
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
186 #self.fieldnames = fieldnames
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
187
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
188 self.fieldtypes = fieldtypes if fieldtypes is not None \
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
189 else [None]*1 #len(fieldnames)
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
190
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
191 self.description = default_desc() if description is None \
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
192 else description
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
193 self._attribute_names = ["description"]
292
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
194
321
f03ae06fadc8 NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 320
diff changeset
195
292
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
196 attributeNames = property(lambda self: copy.copy(self._attribute_names))
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
197
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
198 def __contains__(self, fieldname):
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
199 return (fieldname in self.fieldNames()) \
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
200 or (fieldname in self.attributeNames())
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
201
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
202 def __iter__(self):
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
203 """Supports the syntax "for i in dataset: ..."
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
204
292
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
205 Using this syntax, "i" will be an Example instance (or equivalent) with
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
206 all the fields of DataSet self. Every field of "i" will give access to
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
207 a field of a single example. Fields should be accessible via
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
208 i["fieldname"] or i[3] (in the order defined by the elements of the
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
209 Example returned by this iterator), but the derived class is free
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
210 to accept any type of identifier, and add extra functionality to the iterator.
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
211
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
212 The default implementation calls the minibatches iterator and extracts the first example of each field.
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
213 """
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
214 return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
215
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
216 def __len__(self):
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
217 """
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
218 len(dataset) returns the number of examples in the dataset.
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
219 By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
220 Sub-classes which implement finite-length datasets should redefine this method.
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
221 Some methods only make sense for finite-length datasets.
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
222 """
422
32c5f87bc54e Added __len__ to HStackedDataSet and replaced default len() by sys.maxint instead of None
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 378
diff changeset
223 from sys import maxint
32c5f87bc54e Added __len__ to HStackedDataSet and replaced default len() by sys.maxint instead of None
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 378
diff changeset
224 return maxint
292
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 290
diff changeset
225
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
226
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
227 class MinibatchToSingleExampleIterator(object):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
228 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
229 Converts the result of minibatch iterator with minibatch_size==1 into
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
230 single-example values in the result. Therefore the result of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
231 iterating on the dataset itself gives a sequence of single examples
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
232 (whereas the result of iterating over minibatches gives in each
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
233 Example field an iterable object over the individual examples in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
234 the minibatch).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
235 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
236 def __init__(self, minibatch_iterator):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
237 self.minibatch_iterator = minibatch_iterator
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
238 self.minibatch = None
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
239 def __iter__(self): #makes for loop work
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
240 return self
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
241 def next(self):
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
242 size1_minibatch = self.minibatch_iterator.next()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
243 if not self.minibatch:
329
9ce791fb2cbf little hack in MiniBatchToSingleExampleIterator, there was a problem which I think was not a bug, we were receiving [array(3)] and everything was crashing. Hack is kinda slow
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 322
diff changeset
244 names = size1_minibatch.keys()
9ce791fb2cbf little hack in MiniBatchToSingleExampleIterator, there was a problem which I think was not a bug, we were receiving [array(3)] and everything was crashing. Hack is kinda slow
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 322
diff changeset
245 # next lines are a hack, but there was a problem when we were getting [array(327)] for instance
332
dada08a6adb8 redone my previous hack in MinibatchToSingleExampleIterator, tests should work again
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 331
diff changeset
246 try:
329
9ce791fb2cbf little hack in MiniBatchToSingleExampleIterator, there was a problem which I think was not a bug, we were receiving [array(3)] and everything was crashing. Hack is kinda slow
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 322
diff changeset
247 values = [value[0] for value in size1_minibatch.values()]
332
dada08a6adb8 redone my previous hack in MinibatchToSingleExampleIterator, tests should work again
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 331
diff changeset
248 except :
329
9ce791fb2cbf little hack in MiniBatchToSingleExampleIterator, there was a problem which I think was not a bug, we were receiving [array(3)] and everything was crashing. Hack is kinda slow
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 322
diff changeset
249 values = [value for value in size1_minibatch.values()]
9ce791fb2cbf little hack in MiniBatchToSingleExampleIterator, there was a problem which I think was not a bug, we were receiving [array(3)] and everything was crashing. Hack is kinda slow
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 322
diff changeset
250 self.minibatch = Example(names,values)
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
251 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
252 self.minibatch._values = [value[0] for value in size1_minibatch.values()]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
253 return self.minibatch
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
254
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
255 def next_index(self):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
256 return self.minibatch_iterator.next_index()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
257
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
258 class MinibatchWrapAroundIterator(object):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
259 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
260 An iterator for minibatches that handles the case where we need to wrap around the
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
261 dataset because n_batches*minibatch_size > len(dataset). It is constructed from
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
262 a dataset that provides a minibatch iterator that does not need to handle that problem.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
263 This class is a utility for dataset subclass writers, so that they do not have to handle
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
264 this issue multiple times, nor check that fieldnames are valid, nor handle the
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
265 empty fieldnames (meaning 'use all the fields').
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
266 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
267 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
268 self.dataset=dataset
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
269 self.fieldnames=fieldnames
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
270 self.minibatch_size=minibatch_size
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
271 self.n_batches=n_batches
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
272 self.n_batches_done=0
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
273 self.next_row=offset
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
274 self.L=len(dataset)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
275 self.offset=offset % self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
276 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
277 if n_batches is not None:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
278 ds_nbatches = min(n_batches,ds_nbatches)
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
279 if fieldnames:
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
280 assert dataset.hasFields(*fieldnames)
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
281 else:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
282 self.fieldnames=dataset.fieldNames()
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
283 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
284
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
285 def __iter__(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
286 return self
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
287
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
288 def next_index(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
289 return self.next_row
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
290
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
291 def next(self):
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
292 if self.n_batches and self.n_batches_done==self.n_batches:
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
293 raise StopIteration
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
294 elif not self.n_batches and self.next_row ==self.L:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
295 raise StopIteration
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
296 upper = self.next_row+self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
297 if upper <=self.L:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
298 minibatch = self.iterator.next()
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
299 else:
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
300 if not self.n_batches:
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
301 upper=min(upper, self.L)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
302 # if there is not a fixed number of batches, we continue to the end of the dataset.
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
303 # this can create a minibatch that is smaller than the minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
304 assert (self.L-self.next_row)<=self.minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
305 minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
306 else:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
307 # we must concatenate (vstack) the bottom and top parts of our minibatch
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
308 # first get the beginning of our minibatch (top of dataset)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
309 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
310 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
311 minibatch = Example(self.fieldnames,
268
3f1cd8897fda reverting dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 266
diff changeset
312 [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
313 for name in self.fieldnames])
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
314 self.next_row=upper
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
315 self.n_batches_done+=1
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
316 if upper >= self.L and self.n_batches:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
317 self.next_row -= self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
318 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
319 if self.n_batches is not None:
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
320 ds_nbatches = min(self.n_batches,ds_nbatches)
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
321 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
322 ds_nbatches,self.next_row)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
323 return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
324 self.dataset.valuesHStack),
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
325 minibatch.keys())
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
326
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
327
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
328 minibatches_fieldnames = None
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
329 minibatches_minibatch_size = 1
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
330 minibatches_n_batches = None
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
331 def minibatches(self,
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
332 fieldnames = minibatches_fieldnames,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
333 minibatch_size = minibatches_minibatch_size,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
334 n_batches = minibatches_n_batches,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
335 offset = 0):
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
336 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
337 Return an iterator that supports three forms of syntax:
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
338
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
339 for i in dataset.minibatches(None,**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
340
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
341 for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
342
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
343 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
344
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
345 Using the first two syntaxes, "i" will be an indexable object, such as a list,
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
346 tuple, or Example instance. In both cases, i[k] is a list-like container
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
347 of a batch of current examples. In the second case, i[0] is
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
348 a list-like container of the f1 field of a batch of current examples, i[1] is
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
349 a list-like container of the f2 field, etc.
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
350
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
351 Using the first syntax, all the fields will be returned in "i".
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
352 Using the third syntax, i1, i2, i3 will be list-like containers of the
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
353 f1, f2, and f3 fields of a batch of examples on each loop iteration.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
354
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
355 The minibatches iterator is expected to return upon each call to next()
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
356 a DataSetFields object, which is an Example (indexed by the field names) whose
80
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
357 elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
358 a sub-dataset that can be used to iterate over the individual examples
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
359 in the minibatch. Hence a minibatch can be converted back to a regular
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
360 dataset or its fields can be looked at individually (and possibly iterated over).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
361
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
362 PARAMETERS
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
363 - fieldnames (list of any type, default None):
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
364 The loop variables i1, i2, i3 (in the example above) should contain the
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
365 f1, f2, and f3 fields of the current batch of examples. If None, the
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
366 derived class can choose a default, e.g. all fields.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
367
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
368 - minibatch_size (integer, default 1)
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
369 On every iteration, the variables i1, i2, i3 will have
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
370 exactly minibatch_size elements. e.g. len(i1) == minibatch_size
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
371
331
52aa031e1fe3 IMPORTANT: minibatches now returns minibatch_nowrap with a minimum of assert before. Should implement the good behavior, e.g. returning only complete batches and let the user figure out what he wants.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 330
diff changeset
372 @DEPRECATED n_batches : not used anywhere
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
373 - n_batches (integer, default None)
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
374 The iterator will loop exactly this many times, and then stop. If None,
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
375 the derived class can choose a default. If (-1), then the returned
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
376 iterator should support looping indefinitely.
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
377
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
378 - offset (integer, default 0)
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
379 The iterator will start at example 'offset' in the dataset, rather than the default.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
380
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
381 Note: A list-like container is something like a tuple, list, numpy.ndarray or
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
382 any other object that supports integer indexing and slicing.
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
383
331
52aa031e1fe3 IMPORTANT: minibatches now returns minibatch_nowrap with a minimum of assert before. Should implement the good behavior, e.g. returning only complete batches and let the user figure out what he wants.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 330
diff changeset
384 @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
385 batches only, raise StopIteration.
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
386 @ATTENTION: minibatches returns a LookupList, we can't iterate over examples on it.
331
52aa031e1fe3 IMPORTANT: minibatches now returns minibatch_nowrap with a minimum of assert before. Should implement the good behavior, e.g. returning only complete batches and let the user figure out what he wants.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 330
diff changeset
387
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
388 """
453
ce6b4fd3ab29 Fixed typo in help
delallea@valhalla.apstat.com
parents: 428
diff changeset
389 #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset)
331
52aa031e1fe3 IMPORTANT: minibatches now returns minibatch_nowrap with a minimum of assert before. Should implement the good behavior, e.g. returning only complete batches and let the user figure out what he wants.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 330
diff changeset
390 assert offset >= 0
52aa031e1fe3 IMPORTANT: minibatches now returns minibatch_nowrap with a minimum of assert before. Should implement the good behavior, e.g. returning only complete batches and let the user figure out what he wants.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 330
diff changeset
391 assert offset < len(self)
334
a0f150a33b0f debug in an assert of minibatches
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 332
diff changeset
392 assert offset + minibatch_size -1 < len(self)
337
5e38ed2b3a75 debugging when fieldnames = None, now -> fieldnames = fieldNames()
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 335
diff changeset
393 if fieldnames == None :
5e38ed2b3a75 debugging when fieldnames = None, now -> fieldnames = fieldNames()
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 335
diff changeset
394 fieldnames = self.fieldNames()
335
89d88807e958 sorry for all the debugging, this push should be the good one (and damn you *self*)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 334
diff changeset
395 return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
396
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    Abstract factory for the minibatches iterator; sub-classes must define it.

    This base implementation only raises AbstractFunction.

    The arguments are the ones received by DataSet.minibatches() (see its
    docstring for their meaning):
    - fieldnames: names of the fields that each minibatch should contain.
    - minibatch_size: number of examples in each minibatch.
    - n_batches: number of minibatches to produce (flagged @DEPRECATED there).
    - offset: index of the example at which the iteration starts.

    Original design contract, kept for reference: the returned iterator does
    not need to worry about wrapping around multiple times across the dataset,
    about the termination condition, or about implementing a next_index()
    method, as all of this was meant to be provided by
    MinibatchWrapAroundIterator when DataSet.minibatches() is called. The
    next() method of the returned iterator can assert that its next row will
    always be within [0,len(dataset)).

    NOTE(review): DataSet.minibatches() currently returns the result of this
    method directly (its MinibatchWrapAroundIterator call is commented out),
    and DataSet.__getitem__ calls this method directly too. Implementations
    are therefore presumably expected to return complete batches only and to
    raise StopIteration themselves -- confirm against the sub-classes.
    """
    raise AbstractFunction()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
410
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
def is_unbounded(self):
    """
    Tell whether this dataset is unbounded (e.g. a stream).

    A dataset is considered unbounded when len(self) is equal to sys.maxint.
    """
    n_examples = len(self)
    return n_examples == maxint
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
416
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
def hasFields(self, *fieldnames):
    """
    Tell whether the given field name (or field names, if multiple arguments
    are given) is recognized by the DataSet, i.e. can be used as a field name
    in one of the iterators.

    The answer is True when unique_elements_list_intersection(fieldnames,
    self.fieldNames()) returns a non-empty result.

    NOTE(review): with several names this presumably means that a single
    recognized name is enough to get True -- confirm whether "all the names
    are recognized" was the intended meaning.

    The default implementation may be inefficient (O(# fields in dataset)), as
    it calls the fieldNames() method. Many datasets may store their field names
    in a dictionary, which would allow more efficiency.
    """
    known_names = self.fieldNames()
    shared_names = unique_elements_list_intersection(fieldnames, known_names)
    return len(shared_names) > 0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
427
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def fieldNames(self):
    """
    Return the list of field names that are supported by the iterators,
    and for which hasFields(fieldname) would return True.

    Abstract: this base implementation only raises AbstractFunction, so
    sub-classes must override it. The default hasFields() and __getitem__()
    implementations of DataSet call this method.
    """
    raise AbstractFunction()
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
434
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def __call__(self, *fieldnames):
    """
    Return a dataset that sees only the fields whose names are specified.

    The result is a FieldsSubsetDataSet built on this dataset and on the list
    of the given names. See cached_fields_subset() for the variant that goes
    through self.fields(*fieldnames).examples() instead.

    An AssertionError is raised when self.hasFields(*fieldnames) is false.
    """
    assert self.hasFields(*fieldnames)
    return FieldsSubsetDataSet(self, list(fieldnames))
d580b3a369a4 dataset__call__() returns a FieldsSubsetDataSet, so still a subset of fields, but not cached any more. I added the function dataset.cached_fields_subset(self,*fieldnames) that returns the old version, cached, in case someone needs it. Current behaviour passes the tests.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 353
diff changeset
443
d580b3a369a4 dataset__call__() returns a FieldsSubsetDataSet, so still a subset of fields, but not cached any more. I added the function dataset.cached_fields_subset(self,*fieldnames) that returns the old version, cached, in case someone needs it. Current behaviour passes the tests.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 353
diff changeset
def cached_fields_subset(self, *fieldnames):
    """
    Return the examples() view of self.fields(*fieldnames).

    Behaviour is supposed to be the same as __call__(*fieldnames), but the
    dataset returned is cached.
    @see : dataset.__call__

    An AssertionError is raised when self.hasFields(*fieldnames) is false.
    """
    assert self.hasFields(*fieldnames)
    selected_fields = self.fields(*fieldnames)
    return selected_fields.examples()
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
451
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def fields(self,*fieldnames):
    """
    Return a DataSetFields object associated with this dataset.

    The given names are forwarded to DataSetFields as a single tuple, along
    with the dataset itself. Unlike __call__() and cached_fields_subset(),
    no hasFields() check is performed here.
    """
    return DataSetFields(self,fieldnames)
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
457
269
fdce496c3b56 deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 268
diff changeset
def getitem_key(self, fieldname):
    """
    Look up `fieldname` either as a field or as a property of the dataset.

    A not-so-well thought-out place to put code that used to be in getitem
    (removed from there as per discussion June 4. --JSB).

    - When self.hasFields(fieldname) is true, a single minibatch of len(self)
      examples restricted to that field is requested from self.minibatches(),
      starting at offset 0, and its first element is returned.
    - Otherwise fieldname must be a key of self.__dict__ and the matching value
      is returned; an AssertionError is raised for a non-existing property.
    """
    if not self.hasFields(fieldname):
        # not a field: we are trying to access a property of the dataset
        properties = self.__dict__
        assert fieldname in properties # else it is a non-existing property
        return properties[fieldname]
    whole_dataset = self.minibatches(fieldnames=[fieldname],
                                     minibatch_size=len(self),
                                     n_batches=1,
                                     offset=0)
    return whole_dataset.next()[0]
fdce496c3b56 deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 268
diff changeset
471
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
472 def __getitem__(self,i):
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
473 """
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
474 @rtype: Example
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
475 @returns: single or multiple examples
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
476
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
477 @type i: integer or slice or <iterable> of integers
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
478 @param i:
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
479 dataset[i] returns the (i+1)-th example of the dataset.
309
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
480 dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
481 dataset[i:j:s] returns a LookupList with examples i,i+2,i+4...,j-2.
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
482 dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
483
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
484 @note:
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
485 Some stream datasets may be unable to implement random access, i.e.
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
486 arbitrary slicing/indexing because they can only iterate through
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
487 examples one or a minibatch at a time and do not actually store or keep
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
488 past (or future) examples.
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
489
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
490 The default implementation of getitem uses the minibatches iterator
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
491 to obtain one example, one slice, or a list of examples. It may not
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
492 always be the most efficient way to obtain the result, especially if
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
493 the data are actually stored in a memory array.
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
494 """
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
495
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
496 if type(i) is int:
309
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
497 assert i >= 0 # TBM: see if someone complains and want negative i
313
009ce84e9f52 behaviour is now the same as a list in pylearn, so if len(ds) = 10, ds[10] raise an IndexError, same thing for ds[[1,10]], and ds[0:14:1] returns 10 elements
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 310
diff changeset
498 if i >= len(self) :
009ce84e9f52 behaviour is now the same as a list in pylearn, so if len(ds) = 10, ds[10] raise an IndexError, same thing for ds[[1,10]], and ds[0:14:1] returns 10 elements
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 310
diff changeset
499 raise IndexError
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
500 i_batch = self.minibatches_nowrap(self.fieldNames(),
293
4bfdda107a17 still merging
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 292
diff changeset
501 minibatch_size=1, n_batches=1, offset=i)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
502 return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
503
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
504 #if i is a contiguous slice
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
505 if type(i) is slice and (i.step in (None, 1)):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
506 offset = 0 if i.start is None else i.start
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
507 upper_bound = len(self) if i.stop is None else i.stop
313
009ce84e9f52 behaviour is now the same as a list in pylearn, so if len(ds) = 10, ds[10] raise an IndexError, same thing for ds[[1,10]], and ds[0:14:1] returns 10 elements
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 310
diff changeset
508 upper_bound = min(len(self) , upper_bound)
309
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
509 #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
510 # minibatch_size=upper_bound - offset,
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
511 # n_batches=1,
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
512 # offset=offset).next())
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
513 # now returns a LookupList
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
514 return self.minibatches_nowrap(self.fieldNames(),
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
515 minibatch_size=upper_bound - offset,
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
516 n_batches=1,
309
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
517 offset=offset).next()
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
518
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
519 # if slice has a step param, convert it to list and handle it with the
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
520 # list code
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
521 if type(i) is slice:
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
522 offset = 0 if i.start is None else i.start
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
523 upper_bound = len(self) if i.stop is None else i.stop
314
105b54ac8260 bug fixed concerning the slicing, now ds[0:len(ds) + 1000 : 2] is accepted, same a python list
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 313
diff changeset
524 upper_bound = min(len(self) , upper_bound)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
525 i = list(range(offset, upper_bound, i.step))
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
526
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
527 # handle tuples, arrays, lists
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
528 if hasattr(i, '__getitem__'):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
529 for idx in i:
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
530 #dis-allow nested slices
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
531 if not isinstance(idx, int):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
532 raise TypeError(idx)
313
009ce84e9f52 behaviour is now the same as a list in pylearn, so if len(ds) = 10, ds[10] raise an IndexError, same thing for ds[[1,10]], and ds[0:14:1] returns 10 elements
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 310
diff changeset
533 if idx >= len(self) :
009ce84e9f52 behaviour is now the same as a list in pylearn, so if len(ds) = 10, ds[10] raise an IndexError, same thing for ds[[1,10]], and ds[0:14:1] returns 10 elements
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 310
diff changeset
534 raise IndexError
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
535 # fetch each requested example as a one-example minibatch via self.minibatches_nowrap
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
536 examples = [self.minibatches_nowrap(self.fieldNames(),
293
4bfdda107a17 still merging
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 292
diff changeset
537 minibatch_size=1, n_batches=1, offset=ii).next()
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
538 for ii in i]
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
539 # re-index the fields in each example by field instead of by example
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
540 field_values = [[] for blah in self.fieldNames()]
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
541 for e in examples:
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
542 for f,v in zip(field_values, e):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
543 f.append(v)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
544 # build them into a LookupList (a.k.a. Example)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
545 zz = zip(self.fieldNames(),field_values)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
546 vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
547 example = Example(self.fieldNames(), vst)
309
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
548 #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
549 # now returns a LookupList
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
550 return example
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
551
923de30457f0 get item now returns LookupLists
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 296
diff changeset
552 # unsupported index type: i matched none of the cases handled above
269
fdce496c3b56 deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 268
diff changeset
553 raise TypeError(i, type(i))
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
554
310
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
555
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
556 """
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
557 Enables the call dataset.subset[a:b:c] that will return a DataSet
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
558 around the examples returned by __getitem__(slice(a,b,c))
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
559
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
560 @SEE DataSet.__getsubset(self)
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
561 """
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
562 subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet")
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
563
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
564
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
565 def __getsubset(self) :
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
566 """
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
567 Enables the call data.subset[a:b:c], returns a DataSet.
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
568 Default implementation is a simple wrapper around __getitem__() using MinibatchDataSet.
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
569
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
570 @RETURN DataSet
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
571 @SEE DataSet.subset = property(lambda s : s.__getsubset())
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
572 """
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
573 _self = self
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
574 class GetSliceReturnsDataSet(object) :
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
575 def __getitem__(self,slice) :
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
576 return MinibatchDataSet(_self.__getitem__(slice))
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
577 return GetSliceReturnsDataSet()
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
578
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
579
ebccfd05ccd5 dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 309
diff changeset
580
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
581 def valuesHStack(self,fieldnames,fieldvalues):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
582 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
583 Return a value that corresponds to concatenating (horizontally) several field values.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
584 This can be useful to merge some fields. The implementation of this operation is likely
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
585 to involve a copy of the original values. When the values are numpy arrays, the
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
586 result should be numpy.hstack(values). If it makes sense, this operation should
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
587 work as well when each value corresponds to multiple examples in a minibatch
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
588 e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
589 then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
590 The default is to use numpy.hstack for numpy.ndarray values, and a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
591 pointing to the original values for other data types.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
592 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
593 all_numpy=True
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
594 for value in fieldvalues:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
595 if not type(value) is numpy.ndarray:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
596 all_numpy=False
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
597 if all_numpy:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
598 return numpy.hstack(fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
599 # the default implementation of horizontal stacking is to put values in a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
600 return fieldvalues
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
601
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
602 def valuesVStack(self,fieldname,values):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
603 """
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
604 @param fieldname: the name of the field from which the values were taken
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
605 @type fieldname: any type
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
606
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
607 @param values: bits near the beginning or end of the dataset
452
739612d316a4 Typo fix in help
delallea@valhalla.apstat.com
parents: 428
diff changeset
608 @type values: list of minibatches (returned by minibatches_nowrap)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
609
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
610 @return: the concatenation (stacking) of the values
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
611 @rtype: something suitable as a minibatch field
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
612 """
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
613 rval = []
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
614 for v in values:
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
615 rval.extend(v)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
616 return rval
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
617
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
618 def __or__(self,other):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
619 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
620 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
621 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
622 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
623 return HStackedDataSet([self,other])
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
624
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
625 def __and__(self,other):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
626 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
627 dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
628 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
629 works if they all have the same fields.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
630 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
631 return VStackedDataSet([self,other])
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
632
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
633 def hstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
634 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
635 hstack([dataset1,dataset2,...]) returns dataset1 | dataset2 | ...
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
636 which is a dataset whose fields list is the concatenation of the fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
637 of the individual datasets.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
638 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
639 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
640 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
641 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
642 return HStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
643
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
644 def vstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
645 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
646 vstack([dataset1,dataset2,...]) returns dataset1 & dataset2 & ...
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
647 which is a dataset which iterates first over the examples of dataset1, then
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
648 over those of dataset2, etc.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
649 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
650 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
651 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
652 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
653 return VStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
654
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
655 class FieldsSubsetDataSet(DataSet):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
656 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
657 A sub-class of L{DataSet} that selects a subset of the fields.
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
658 """
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
659 def __init__(self,src,fieldnames):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
660 self.src=src
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
661 self.fieldnames=fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
662 assert src.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
663 self.valuesHStack = src.valuesHStack
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
664 self.valuesVStack = src.valuesVStack
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
665
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
666 def __len__(self): return len(self.src)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
667
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
668 def fieldNames(self):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
669 return self.fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
670
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
671 def __iter__(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
672 class FieldsSubsetIterator(object):
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
673 def __init__(self,ds):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
674 self.ds=ds
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
675 self.src_iter=ds.src.__iter__()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
676 self.example=None
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
677 def __iter__(self): return self
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
678 def next(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
679 complete_example = self.src_iter.next()
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
680 if self.example:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
681 self.example._values=[complete_example[field]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
682 for field in self.ds.fieldnames]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
683 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
684 self.example=Example(self.ds.fieldnames,
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
685 [complete_example[field] for field in self.ds.fieldnames])
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
686 return self.example
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
687 return FieldsSubsetIterator(self)
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
688
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
689 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
690 assert self.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
691 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
692 def dontuse__getitem__(self,i):
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
693 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
694
328
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
695 class RenamedFieldsDataSet(DataSet):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
696 """
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
697 A sub-class of L{DataSet} that selects and renames a subset of the fields.
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
698 """
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
699 def __init__(self,src,src_fieldnames,new_fieldnames):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
700 self.src=src
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
701 self.src_fieldnames=src_fieldnames
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
702 self.new_fieldnames=new_fieldnames
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
703 assert src.hasFields(*src_fieldnames)
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
704 assert len(src_fieldnames)==len(new_fieldnames)
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
705 self.valuesHStack = src.valuesHStack
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
706 self.valuesVStack = src.valuesVStack
351
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
707 self.lookup_fields = Example(new_fieldnames,src_fieldnames)
328
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
708
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
709 def __len__(self): return len(self.src)
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
710
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
711 def fieldNames(self):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
712 return self.new_fieldnames
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
713
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
714 def __iter__(self):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
715 class FieldsSubsetIterator(object):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
716 def __init__(self,ds):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
717 self.ds=ds
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
718 self.src_iter=ds.src.__iter__()
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
719 self.example=None
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
720 def __iter__(self): return self
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
721 def next(self):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
722 complete_example = self.src_iter.next()
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
723 if self.example:
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
724 self.example._values=[complete_example[field]
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
725 for field in self.ds.src_fieldnames]
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
726 else:
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
727 self.example=Example(self.ds.new_fieldnames,
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
728 [complete_example[field]
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
729 for field in self.ds.src_fieldnames])
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
730 return self.example
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
731 return FieldsSubsetIterator(self)
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
732
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
733 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
734 assert self.hasFields(*fieldnames)
351
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
735 cursor = Example(fieldnames,[0]*len(fieldnames))
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
736 for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset):
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
737 cursor._values=batch._values
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
738 yield cursor
7545207466d4 debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 344
diff changeset
739
328
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
def __getitem__(self, i):
    """
    Return self.src[i] relabelled with this dataset's field names.

    The values are read from the source item in the order given by
    self.src_fieldnames and paired with self.new_fieldnames.
    (Presumably the two lists are parallel, i.e. new_fieldnames[k] is the
    new name of src_fieldnames[k] -- to be confirmed against the
    constructor, which is not visible here.)
    """
    new_names = self.new_fieldnames
    source_item = self.src[i]
    renamed_values = []
    for source_name in self.src_fieldnames:
        renamed_values.append(source_item[source_name])
    return Example(new_names, renamed_values)
7d2f7b8fe213 bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 332
diff changeset
746
328
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
747
09140ba68e17 Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 322
diff changeset
748
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
class DataSetFields(Example):
    """
    Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
    DataSetFields iterates over fields (like columns of a matrix), and can be understood
    as a transpose of the associated dataset.

    To iterate over fields, one can do
     * for fields in dataset.fields()
     * for fields in dataset(field1,field2,...).fields() to select a subset of fields
     * for fields in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
     * for field_examples in dataset.fields():
          for example_value in field_examples:
             ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a DataSetFields object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method:
      dataset2 = dataset1.fields().examples()
    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    DataSetFields can be concatenated vertically or horizontally. To be consistent with
    the syntax used for DataSets, the | concatenates the fields and the & concatenates
    the examples.
    """
    def __init__(self, dataset, fieldnames):
        """
        Build the fields view of `dataset`, restricted to `fieldnames`.

        An empty `fieldnames` selects every field of the dataset. If
        `fieldnames` differs from dataset.fieldNames(), the dataset is first
        wrapped in a FieldsSubsetDataSet, and that wrapper is what examples()
        returns afterwards.
        """
        original_dataset = dataset
        if not fieldnames:
            fieldnames = dataset.fieldNames()
        elif not list(fieldnames) == list(dataset.fieldNames()):
            # we must cast to list, otherwise ('x','y')!=['x','y']
            dataset = FieldsSubsetDataSet(dataset, fieldnames)
        assert dataset.hasFields(*fieldnames)
        self.dataset = dataset

        if isinstance(dataset, MinibatchDataSet):
            # The fields are already stored column-wise: use them directly.
            Example.__init__(self, fieldnames, list(dataset._fields))
        elif isinstance(original_dataset, MinibatchDataSet):
            # A subset of the fields of a MinibatchDataSet: pick the requested
            # columns from the original dataset rather than iterating over
            # the FieldsSubsetDataSet wrapper.
            Example.__init__(self, fieldnames,
                             [original_dataset._fields[field]
                              for field in fieldnames])
        else:
            # General case: ask for one minibatch that spans the whole dataset.
            minibatch_iterator = dataset.minibatches(fieldnames,
                                                     minibatch_size=len(dataset),
                                                     n_batches=1)
            minibatch = minibatch_iterator.next()
            Example.__init__(self, fieldnames, minibatch)

    def examples(self):
        """Return the dataset of which this object is the fields view."""
        return self.dataset

    def __or__(self, other):
        """
        fields1 | fields2 is a DataSetFields whose list of fields is the concatenation
        of the fields of DataSetFields fields1 and fields2.

        This follows the DataSet convention, where | concatenates the fields.
        """
        return (self.examples() | other.examples()).fields()

    def __and__(self, other):
        """
        fields1 & fields2 is a DataSetFields whose list of examples is the concatenation
        of the list of examples of DataSetFields fields1 and fields2.

        This follows the DataSet convention, where & concatenates the examples.
        """
        return (self.examples() & other.examples()).fields()
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
815
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
816
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class MinibatchDataSet(DataSet):
    """
    Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset.
    Each element of the lookup-list should be an iterable and sliceable, all of the same length.
    """
    def __init__(self, fields_lookuplist, values_vstack=DataSet().valuesVStack,
                 values_hstack=DataSet().valuesHStack):
        """
        The user can (and generally should) also provide values_vstack(fieldname,fieldvalues)
        and a values_hstack(fieldnames,fieldvalues) functions behaving with the same
        semantics as the DataSet methods of the same name (but without the self argument).

        An AssertionError is raised if fields_lookuplist is empty or if its
        fields do not all have the same length.
        """
        self._fields = fields_lookuplist
        assert len(fields_lookuplist) > 0
        # The length of the first field is the number of examples of the dataset.
        self.length = len(fields_lookuplist[0])
        for field in fields_lookuplist[1:]:
            # The diagnostic is carried by the exception itself instead of
            # being printed on stdout; it is only built when the check fails.
            assert self.length == len(field), (
                "all fields must have the same length: "
                "self.length = %s, len(field) = %s, "
                "self._fields.keys() = %s, field = %s, fields_lookuplist = %s"
                % (self.length, len(field), self._fields.keys(),
                   field, fields_lookuplist))
        self.valuesVStack = values_vstack
        self.valuesHStack = values_hstack

    def __len__(self):
        """Return the number of examples (the common length of the fields)."""
        return self.length

    def dontuse__getitem__(self, i):
        """
        Disabled item access (the name keeps it from being used as __getitem__).

         * slice or list: a DataSetFields over the selected examples
         * int: the i-th example, as an Example
         * field name: the corresponding field
         * anything else: must be the name of an instance attribute, whose value is returned
        """
        if type(i) in (slice, list):
            return DataSetFields(MinibatchDataSet(
                Example(self._fields.keys(), [field[i] for field in self._fields])),
                                 self.fieldNames())
        if type(i) is int:
            return Example(self._fields.keys(), [field[i] for field in self._fields])
        if self.hasFields(i):
            return self._fields[i]
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def fieldNames(self):
        """Return the names of the fields, as given by the keys of the lookup-list."""
        return self._fields.keys()

    def hasFields(self, *fieldnames):
        """Return True if every one of the given names is a field of this dataset."""
        for fieldname in fieldnames:
            if fieldname not in self._fields.keys():
                return False
        return True

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        """
        Return an iterator over consecutive minibatches of minibatch_size
        examples, starting at example `offset` and restricted to `fieldnames`
        (all the fields if fieldnames is None).

        Each minibatch is a new Example holding, for every requested field,
        the slice [start:start+minibatch_size] of that field. The iteration
        stops as soon as a complete minibatch cannot be extracted any more
        (no wrapping around the end of the dataset).

        n_batches is accepted but is not used by the iterator defined here.

        NotImplementedError is raised if the very first minibatch would
        already go past the end of the dataset.
        """
        #@TODO bug somewhere here, fieldnames doesnt seem to be well handled
        class Iterator(object):
            def __init__(self, ds, fieldnames):
                # tbm: added two next lines to handle fieldnames
                if fieldnames is None: fieldnames = ds._fields.keys()
                self.fieldnames = fieldnames

                self.ds = ds
                self.next_example = offset
                assert minibatch_size >= 0
                if offset + minibatch_size > ds.length:
                    raise NotImplementedError()
            def __iter__(self):
                return self
            def next(self):
                upper = self.next_example + minibatch_size
                if upper > len(self.ds):
                    # Not enough examples left for a full minibatch.
                    raise StopIteration()
                # tbm: modif to use fieldnames
                values = []
                for f in self.fieldnames:
                    values.append(self.ds._fields[f][self.next_example:upper])
                minibatch = Example(self.fieldnames, values)
                self.next_example += minibatch_size
                return minibatch

        # tbm: added fieldnames to handle subset of fieldnames
        return Iterator(self, fieldnames)
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
902
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
903 class HStackedDataSet(DataSet):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
904 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
905 A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
906 i.e. whose list of fields is the concatenation of their lists of fields.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
907
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
908 If a field name is found in more than one of the datasets, then either an error is
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
909 raised or the fields are renamed (either by prefixing the __name__ attribute
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
910 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
911
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
912 @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
913 """
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
914 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
915 DataSet.__init__(self,description,field_types)
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
916 self.datasets=datasets
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
917 self.accept_nonunique_names=accept_nonunique_names
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
918 self.fieldname2dataset={}
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
919
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
920 def rename_field(fieldname,dataset,i):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
921 if hasattr(dataset,"__name__"):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
922 return dataset.__name__ + "." + fieldname
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
923 return fieldname+"."+str(i)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
924
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
925 # make sure all datasets have the same length and unique field names
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
926 self.length=None
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
927 names_to_change=[]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
928 for i in xrange(len(datasets)):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
929 dataset = datasets[i]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
930 length=len(dataset)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
931 if self.length:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
932 assert self.length==length
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
933 else:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
934 self.length=length
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
935 for fieldname in dataset.fieldNames():
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
936 if fieldname in self.fieldname2dataset: # name conflict!
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
937 if accept_nonunique_names:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
938 fieldname=rename_field(fieldname,dataset,i)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
939 names2change.append((fieldname,i))
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
940 else:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
941 raise ValueError("Incompatible datasets: non-unique field name = "+fieldname)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
942 self.fieldname2dataset[fieldname]=i
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
943 for fieldname,i in names_to_change:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
944 del self.fieldname2dataset[fieldname]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
945 self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
946
422
32c5f87bc54e Added __len__ to HStackedDataSet and replaced default len() by sys.maxint instead of None
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 378
diff changeset
def __len__(self):
    """Return the number of examples, taken from the first stacked dataset.

    The constructor asserts that the stacked datasets report the same
    length, so the first one is used as the reference.
    """
    reference_dataset = self.datasets[0]
    return len(reference_dataset)
32c5f87bc54e Added __len__ to HStackedDataSet and replaced default len() by sys.maxint instead of None
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 378
diff changeset
949
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
def hasFields(self,*fieldnames):
    """Return True iff every given field name is known to this dataset.

    A name is known when it is a key of self.fieldname2dataset. Calling
    with no names returns True.
    """
    known_fields = self.fieldname2dataset
    return all(name in known_fields for name in fieldnames)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
955
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def fieldNames(self):
    """Return the field names of the stacked view.

    These are the keys of self.fieldname2dataset, i.e. the names after any
    renaming done to resolve conflicts between sub-datasets.
    """
    name_to_index = self.fieldname2dataset
    return name_to_index.keys()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
958
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    Return an iterator over minibatches of the horizontally stacked view.

    Each underlying dataset that owns at least one of the requested fields
    is asked for its own minibatch iterator (same minibatch_size, n_batches
    and offset); the returned iterator advances all of them in lockstep and
    merges their minibatches into a single Example.

    fieldnames: names of the fields to fetch. If it is empty, None, or
        equal to self.fieldNames(), every sub-dataset is iterated with
        fieldnames=None (i.e. asking for all of its fields).
    minibatch_size, n_batches, offset: passed unchanged to the
        minibatches() method of the sub-datasets.

    Raises AssertionError if one of the requested fields is unknown.
    """

    class HStackedIterator(object):
        """Advances several sub-dataset minibatch iterators in lockstep."""
        def __init__(self,hsds,iterators):
            self.hsds=hsds
            self.iterators=iterators
        def __iter__(self):
            return self
        def next(self):
            # concatenate all the fields of the minibatches
            merged=Example()
            for sub_iterator in self.iterators:
                merged.append_lookuplist(sub_iterator.next())
            return merged

    # Only validate when something was actually requested: unpacking None
    # with * would raise a TypeError before the "all fields" branch below.
    if fieldnames:
        assert self.hasFields(*fieldnames)
    # find out which underlying datasets are necessary to service the required fields
    # and construct corresponding minibatch iterators
    if fieldnames and fieldnames!=self.fieldNames():
        # Group the requested fields by the index of the dataset that owns
        # them. Indices are used (rather than the dataset objects) so that
        # datasets need not be hashable, and the order of first appearance
        # is recorded so that the iterators come out in a deterministic order.
        fields_of_dataset={}
        dataset_order=[]
        for fieldname in fieldnames:
            dataset_index=self.fieldname2dataset[fieldname]
            if dataset_index not in fields_of_dataset:
                fields_of_dataset[dataset_index]=[]
                dataset_order.append(dataset_index)
            fields_of_dataset[dataset_index].append(fieldname)
        iterators=[self.datasets[dataset_index].minibatches(fields_of_dataset[dataset_index],
                                                            minibatch_size,n_batches,offset)
                   for dataset_index in dataset_order]
    else:
        iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset)
                   for dataset in self.datasets]
    return HStackedIterator(self,iterators)
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
991
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
992
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def untested_valuesVStack(self,fieldname,fieldvalues):
    """Stack the values of one field vertically.

    The work is delegated to the valuesVStack method of the sub-dataset
    that owns fieldname (looked up through self.fieldname2dataset).
    """
    owner_index = self.fieldname2dataset[fieldname]
    owner = self.datasets[owner_index]
    return owner.valuesVStack(fieldname,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
995
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def untested_valuesHStack(self,fieldnames,fieldvalues):
    """
    Stack the values of several fields horizontally.

    The sub-dataset that owns the first name in fieldnames does the work,
    in the hope that it does not care about field names belonging to other
    sub-datasets. This heuristic is only guaranteed to be right when all
    the fieldnames come from the same sub-dataset.
    """
    first_name = fieldnames[0]
    delegate = self.datasets[self.fieldname2dataset[first_name]]
    return delegate.valuesHStack(fieldnames,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
1004
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class VStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    Attributes set by the constructor:
     - datasets: the wrapped datasets, in stacking order.
     - length: sum of the len() of the wrapped datasets.
     - datasets_start_row: for each dataset, the global row number of its first row.
     - index2dataset: map from global row number to dataset index, filled for
       every dataset except the last one.

    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        """
        Build the stacked view.

        datasets: non-empty sequence of datasets with identical fieldNames().
        Raises AssertionError if the sequence is empty, if a dataset other
        than the last is unbounded, or if field names differ.
        """
        assert len(datasets)>0
        self.datasets=datasets
        self.length=0
        self.index2dataset={}
        self.datasets_start_row=[]
        fieldnames = datasets[-1].fieldNames()
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        for k,dataset in enumerate(datasets[0:-1]):
            # All VStacked datasets (except possibly the last) must be bounded (have a length).
            # NOTE(review): is_unbounded() is defined outside this block; it is
            # presumed to return True for datasets without a finite length.
            assert not dataset.is_unbounded()
            assert dataset.fieldNames()==fieldnames
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
        # The last dataset is not entered in index2dataset (it may be
        # unbounded); locate_row recognizes its rows from the start row.
        self.datasets_start_row.append(self.length)
        self.length+=len(datasets[-1])
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print("A more efficient mechanism for index2dataset should be implemented")

    def __len__(self):
        """Return the total number of rows computed by the constructor."""
        return self.length

    def fieldNames(self):
        """Return the field names, as reported by the first dataset."""
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        """Return whether the first dataset has all the given fields."""
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """Return (dataset_index, row_within_dataset) for global row number

        Rows at or beyond the first row of the last dataset are attributed
        to the last dataset; other rows are looked up in index2dataset, so
        an unknown row (e.g. a negative one) raises KeyError.
        """
        last_index = len(self.datasets)-1
        if row>=self.datasets_start_row[last_index]:
            dataset_index = last_index
        else:
            dataset_index = self.index2dataset[row]
        row_within_dataset = row-self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over consecutive minibatches of the stacked view.

        fieldnames: fields to fetch, passed on to the sub-datasets.
        minibatch_size: number of rows per minibatch (must be positive).
        n_batches: maximum number of minibatches, or None for as many full
            minibatches as fit before the end of the view.
        offset: global row number of the first row of the first minibatch.

        Iteration stops (StopIteration) after n_batches minibatches or as
        soon as fewer than minibatch_size rows remain; it never wraps around.
        Raises ValueError if minibatch_size is not positive.
        """
        if minibatch_size<=0:
            raise ValueError("minibatch_size must be positive")

        class VStackedIterator(object):
            """
            Serves minibatches of the stacked view.

            While a minibatch fits inside one sub-dataset, a single
            sub-dataset iterator is reused for the whole run of such
            minibatches. A minibatch that straddles a dataset boundary is
            assembled from one piece per dataset with valuesVStack.
            NOTE(review): sub-dataset minibatches() is assumed to return an
            iterator with a next() method whose items are indexable by
            field name, as the code this replaces did.
            """
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset          # global row of the next minibatch
                self.batches_left=n_batches   # None means "no explicit limit"
                self.current_iterator=None    # sub-dataset iterator in use, if any
                self.current_batches_left=0   # minibatches left in current_iterator

            def __iter__(self):
                return self

            def start_run(self,dataset,row_in_dataset):
                """Open an iterator over the full minibatches left in dataset.

                Leaves current_iterator at None when not even one full
                minibatch fits between row_in_dataset and the dataset's end.
                """
                n_full=(len(dataset)-row_in_dataset)//minibatch_size
                if self.batches_left is not None:
                    n_full=min(n_full,self.batches_left)
                if n_full>0:
                    self.current_iterator=dataset.minibatches(fieldnames,minibatch_size,
                                                              n_full,row_in_dataset)
                    self.current_batches_left=n_full

            def straddling_minibatch(self,dataset_index,row_in_dataset):
                """Build one minibatch whose rows span several datasets."""
                # As before, the dataset holding the first rows does the stacking.
                stacker=self.vsds.datasets[dataset_index]
                pieces=[]
                rows_needed=minibatch_size
                while rows_needed>0:
                    dataset=self.vsds.datasets[dataset_index]
                    n_rows=min(rows_needed,len(dataset)-row_in_dataset)
                    if n_rows>0: # empty datasets contribute nothing
                        pieces.append(dataset.minibatches(fieldnames,n_rows,1,row_in_dataset).next())
                        rows_needed-=n_rows
                    dataset_index+=1
                    row_in_dataset=0
                # NOTE(review): when fieldnames is empty/None, all fields of
                # the view are presumed to be wanted.
                names=fieldnames or self.vsds.fieldNames()
                return Example(names,
                               [stacker.valuesVStack(name,[piece[name] for piece in pieces])
                                for name in names])

            def next(self):
                if self.batches_left is not None and self.batches_left<=0:
                    raise StopIteration
                # No wrap-around: stop when a full minibatch no longer fits.
                if self.next_row+minibatch_size>self.vsds.length:
                    raise StopIteration
                if self.current_iterator is None:
                    dataset_index,row_in_dataset=self.vsds.locate_row(self.next_row)
                    self.start_run(self.vsds.datasets[dataset_index],row_in_dataset)
                if self.current_iterator is not None:
                    mb=self.current_iterator.next()
                    self.current_batches_left-=1
                    if self.current_batches_left==0:
                        self.current_iterator=None
                else:
                    mb=self.straddling_minibatch(dataset_index,row_in_dataset)
                self.next_row+=minibatch_size
                if self.batches_left is not None:
                    self.batches_left-=1
                return mb

        return VStackedIterator(self)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
1112
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
class ArrayFieldsDataSet(DataSet):
    """
    Virtual base class for datasets whose field values are numpy arrays.

    It provides numpy-based untested_valuesHStack and untested_valuesVStack
    implementations for its sub-classes.
    """
    def __init__(self,description=None,field_types=None):
        # Nothing specific to set up here: defer to the DataSet constructor.
        DataSet.__init__(self,description,field_types)

    def untested_valuesHStack(self,fieldnames,fieldvalues):
        """Join field values side by side with numpy.hstack, e.g. two
        vectors give a longer vector and two matrices give a wider matrix.
        The fieldnames argument is not used."""
        side_by_side = numpy.hstack(fieldvalues)
        return side_by_side

    def untested_valuesVStack(self,fieldname,values):
        """Pile field values on top of each other with numpy.vstack, e.g.
        two vectors give a two-row matrix and two matrices give a longer
        matrix. The fieldname argument is not used."""
        piled_up = numpy.vstack(values)
        return piled_up
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1128
316
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
1129
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
1130
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
class NArraysDataSet(ArrayFieldsDataSet):
    """
    An NArraysDataSet stores fields that are numpy tensors, whose first axis
    iterates over examples. It's a generalization of ArrayDataSet.
    """
    #@TODO not completely implemented yet

    def __init__(self, data_arrays, fieldnames, **kwargs):
        """
        Construct an NArraysDataSet from a list of numpy tensors (data_arrays)
        and a list of fieldnames. The number of arrays must be the same as the
        number of fieldnames, and at least one field is required. Every tensor
        must have the same first dimension (first axis), which corresponds to
        the number of examples.

        Every tensor is treated as a numpy array (using numpy.asarray).
        Any extra keyword arguments are forwarded to
        ArrayFieldsDataSet.__init__.
        """
        ArrayFieldsDataSet.__init__(self, **kwargs)
        assert len(data_arrays) == len(fieldnames)
        assert len(fieldnames) > 0
        ndarrays = [numpy.asarray(a) for a in data_arrays]
        # The first array sets the reference length; all others must match it.
        num_examples = ndarrays[0].shape[0]
        self._fieldnames = fieldnames
        for array in ndarrays:
            assert array.shape[0] == num_examples
        self._datas = ndarrays
        # Map each fieldname to the position of its array in self._datas.
        self.map_field_idx = dict(
            (name, idx) for idx, name in enumerate(fieldnames))

    def __len__(self):
        """
        Length of the dataset, taken from the first axis of the first
        stored array (self._datas[0]).
        """
        return self._datas[0].shape[0]

    def fieldNames(self):
        """
        Returns the fieldnames as set in self.__init__
        """
        return self._fieldnames

    def field_pos(self, fieldname):
        """
        Returns the index of a given fieldname. The fieldname must exist
        (see fieldNames()); an unknown name raises KeyError from the
        underlying dictionary lookup.
        """
        return self.map_field_idx[fieldname]

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        """
        Generate up to n_batches minibatches of the requested fields,
        starting at example index `offset` and never wrapping around.

        If fieldnames is None, all fields of the dataset are used. Each
        yielded value is an Example whose entries are slices
        [offset : offset+minibatch_size] of the matching arrays; the last
        batch can hold fewer than minibatch_size examples. Iteration stops
        early once offset reaches (or exceeds) len(self).

        The same Example object is updated in place and yielded again on
        every iteration, so a consumer that wants to keep a batch must copy
        it before asking for the next one.
        """
        # Resolve the default first: building the cursor needs
        # len(fieldnames), which fails when fieldnames is still None.
        if fieldnames is None:
            fieldnames = self.fieldNames()
        cursor = Example(fieldnames, [0] * len(fieldnames))
        for _ in xrange(n_batches):
            # '>=' rather than '==': an offset beyond the end must stop the
            # iteration instead of yielding empty batches.
            if offset >= len(self):
                break
            for pos, name in enumerate(cursor._names):
                idx = self.field_pos(name)
                sub_data = self._datas[idx][offset:offset + minibatch_size]
                cursor._values[pos] = sub_data
            offset += len(sub_data)  # can be less than minibatch_size at end
            yield cursor
e2eab74b6a28 NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 317
diff changeset
1192
e2eab74b6a28 NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 317
diff changeset
1193 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
e2eab74b6a28 NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 317
diff changeset
1194
e2eab74b6a28 NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 317
diff changeset
1195
316
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
1196
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
1197
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1198 class ArrayDataSet(ArrayFieldsDataSet):
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1199 """
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1200 An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1201 whose first axis iterates over examples, second axis determines fields.
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1202 If the underlying array is N-dimensional (has N axes), then the field
455
fb62f0e4bcfe Reverted change ce6b4fd3ab29 (I do not believe anymore it was a typo)
delallea@valhalla.apstat.com
parents: 454
diff changeset
1203 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1204 """
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1205
188
f01ac276c6fb added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 171
diff changeset
def __init__(self, data_array, fields_columns, **kwargs):
    """
    Build an ArrayDataSet on top of a numpy array (data_array) and a
    mapping (fields_columns) from fieldnames to the columns of each field.

    A field's columns use the usual indexing/slicing arguments: an integer
    for one column index, a slice for an interval of columns (possibly with
    a stride), or an iterable of column indices. Extra keyword arguments
    are forwarded to ArrayFieldsDataSet.__init__.

    The fields_columns mapping is stored as-is and normalised in place:
    an integer becomes a one-element list, and a slice gets an explicit
    start (0) and step (1) when those are missing.
    """
    ArrayFieldsDataSet.__init__(self, **kwargs)
    self.data = data_array
    self.fields_columns = fields_columns

    # check consistency and complete slices definitions
    for name, columns in self.fields_columns.items():
        spec_type = type(columns)
        if spec_type is int:
            assert columns >= 0 and columns < data_array.shape[1]
            # A single column index is stored as a one-element list
            # (changed by James, 22/05/2008: the previous behaviour of
            # keeping the bare integer made learners harder to write).
            self.fields_columns[name] = [columns]
        elif spec_type is slice:
            first = columns.start or 0
            stride = columns.step or 1
            self.fields_columns[name] = slice(first, columns.stop, stride)
        elif hasattr(columns, "__iter__"):  # something like a list
            for column in columns:
                assert column >= 0 and column < data_array.shape[1]
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1239
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def fieldNames(self):
    """Return the field names, i.e. the keys of self.fields_columns."""
    names = self.fields_columns.keys()
    return names
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1242
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def __len__(self):
    """Return the number of examples, i.e. len() of the underlying data."""
    n_examples = len(self.data)
    return n_examples
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1245
316
5fe6d0c93109 getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 314
diff changeset
def __getitem__(self, key):
    """More efficient implementation than the default __getitem__.

    Depending on the type of key:

    * int or slice: return an Example holding, for every field,
      self.data[key, columns_of_that_field].
    * list: entries for which self.hasFields(entry) is true are replaced
      by that field's column specification, then an Example over all
      fields is returned for the resulting row selection. The list passed
      by the caller is not modified.
    * a field name (self.hasFields(key) is true): return all rows of that
      field's columns.
    * anything else is looked up as an attribute in self.__dict__; the key
      must be present there.
    """
    fieldnames = self.fields_columns.keys()
    columns = self.fields_columns.values()

    if type(key) is int or type(key) is slice:
        return Example(fieldnames,
                       [self.data[key, col] for col in columns])

    if type(key) is list:
        # Translate into a new list: assigning into `key` itself would
        # silently rewrite the list object owned by the caller.
        rows = [self.fields_columns[k] if self.hasFields(k) else k
                for k in key]
        # Lists of columns are handled in two steps because indexing numpy
        # with two lists at once, self.data[[i1,...],[i2,...]], pairs the
        # indices element-wise instead of selecting the rows x columns
        # sub-block.
        return Example(fieldnames,
                       [self.data[rows, :][:, col]
                        if isinstance(col, list) else
                        self.data[rows, col]
                        for col in columns])

    # else check for a fieldname
    if self.hasFields(key):
        return self.data[:, self.fields_columns[key]]

    # else we are trying to access a property of the dataset
    assert key in self.__dict__  # else it means we are trying to access a non-existing property
    return self.__dict__[key]
55
66619ce44497 Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 48
diff changeset
1273
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1274 def dontuse__iter__(self):
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1275 class ArrayDataSetIteratorIter(object):
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1276 def __init__(self,dataset,fieldnames):
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1277 if fieldnames is None: fieldnames = dataset.fieldNames()
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1278 # store the resulting minibatch in a lookup-list of values
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1279 self.minibatch = Example(fieldnames,[0]*len(fieldnames))
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1280 self.dataset=dataset
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1281 self.current=0
238
ae1d85aca858 optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 235
diff changeset
1282 self.columns = [self.dataset.fields_columns[f]
ae1d85aca858 optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 235
diff changeset
1283 for f in self.minibatch._names]
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1284 self.l = self.dataset.data.shape[0]
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1285 def __iter__(self):
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1286 return self
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1287 def next(self):
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1288 #@todo: we suppose that we need to stop only when minibatch_size == 1.
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1289 # Otherwise, MinibatchWrapAroundIterator does it.
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1290 if self.current>=self.l:
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1291 raise StopIteration
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1292 sub_data = self.dataset.data[self.current]
238
ae1d85aca858 optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 235
diff changeset
1293 self.minibatch._values = [sub_data[c] for c in self.columns]
ae1d85aca858 optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 235
diff changeset
1294
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1295 self.current+=1
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1296 return self.minibatch
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1297
270
1cafd495098c code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 254
diff changeset
1298 return ArrayDataSetIteratorIter(self,self.fieldNames())
228
6f55e301c687 optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 203
diff changeset
1299
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
1300 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1301 cursor = Example(fieldnames,[0]*len(fieldnames))
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1302 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
339
aa8aff6abbf7 n_minibatches in ArrayDataSet automatically computed
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 337
diff changeset
1303 if n_batches == None:
aa8aff6abbf7 n_minibatches in ArrayDataSet automatically computed
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 337
diff changeset
1304 n_batches = (len(self) - offset) / minibatch_size
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1305 for n in xrange(n_batches):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1306 if offset == len(self):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1307 break
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1308 sub_data = self.data[offset : offset+minibatch_size]
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1309 offset += len(sub_data) #can be less than minibatch_size at end
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1310 cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1311 yield cursor
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
1312
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1313 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1314
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1315
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1316 class CachedDataSet(DataSet):
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1317 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1318 Wrap a L{DataSet} whose values are computationally expensive to obtain
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1319 (e.g. because they involve some computation, or disk access),
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1320 so that repeated accesses to the same example are done cheaply,
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1321 by caching every example value that has been accessed at least once.
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1322
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1323 Optionally, for finite-length dataset, all the values can be computed
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1324 (and cached) upon construction of the CachedDataSet, rather than at the
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1325 first access.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1326
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1327 @todo: when cache_all_upon_construction create mini-batches that are as
77
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1328 large as possible but not so large as to fill up memory.
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1329
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1330 @todo: add disk-buffering capability, so that when the cache becomes too
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1331 big for memory, we cache things on disk, trying to keep in memory only
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1332 the record most likely to be accessed next.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1333 """
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1334 def __init__(self,source_dataset,cache_all_upon_construction=False):
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1335 self.source_dataset=source_dataset
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1336 self.cache_all_upon_construction=cache_all_upon_construction
152
3f627e844cba Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 151
diff changeset
1337 self.cached_examples = []
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1338 if cache_all_upon_construction:
77
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1339 # this potentially brings all the source examples
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1340 # into memory at once, which may be too much
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1341 # the work could possibly be done by minibatches
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
1342 # that are as large as possible but no more than what memory allows.
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1343 #
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1344 # fields_values is supposed to be a DataSetFields, which inherits from LookupList
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1345 #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1346 fields_values = DataSetFields(source_dataset,None)
152
3f627e844cba Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 151
diff changeset
1347 assert all([len(self)==len(field_values) for field_values in fields_values])
3f627e844cba Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 151
diff changeset
1348 for example in fields_values.examples():
171
895b4b60f5e8 bugfix. Otherwise the example was writed over and not a new one was returned
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 167
diff changeset
1349 self.cached_examples.append(copy.copy(example))
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1350
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1351 self.fieldNames = source_dataset.fieldNames
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1352 self.hasFields = source_dataset.hasFields
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1353 self.valuesHStack = source_dataset.valuesHStack
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1354 self.valuesVStack = source_dataset.valuesVStack
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1355
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1356 def __len__(self):
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1357 return len(self.source_dataset)
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1358
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1359 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1360 class CacheIterator(object):
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1361 def __init__(self,dataset):
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1362 self.dataset=dataset
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1363 self.current=offset
254
8ec867d12428 optimication in CachedDataSet.minibatches_nowrap
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 253
diff changeset
1364 self.all_fields = self.dataset.fieldNames()==fieldnames
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1365 self.n_batches = n_batches
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1366 self.batch_counter = 0
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1367 def __iter__(self): return self
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1368 def next(self):
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1369 self.batch_counter += 1
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1370 if self.n_batches and self.batch_counter > self.n_batches :
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1371 raise StopIteration()
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1372 upper = self.current+minibatch_size
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1373 if upper > len(self.dataset.source_dataset):
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1374 raise StopIteration()
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1375 cache_len = len(self.dataset.cached_examples)
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
1376 if upper>cache_len: # whole minibatch is not already in cache
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1377 # cache everything from current length to upper
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1378 #for example in self.dataset.source_dataset[cache_len:upper]:
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1379 for example in self.dataset.source_dataset.subset[cache_len:upper]:
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1380 self.dataset.cached_examples.append(example)
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1381 all_fields_minibatch = Example(self.dataset.fieldNames(),
152
3f627e844cba Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 151
diff changeset
1382 zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
353
47538a45b878 Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 351
diff changeset
1383
163
d7d67651d67c bugfix, we should advence by the minibatch size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 159
diff changeset
1384 self.current+=minibatch_size
254
8ec867d12428 optimication in CachedDataSet.minibatches_nowrap
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 253
diff changeset
1385 if self.all_fields:
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1386 return all_fields_minibatch
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1387 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1388 return CacheIterator(self)
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1389
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1390 def dontuse__getitem__(self,i):
153
71107b0ac860 optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 151
diff changeset
1391 if type(i)==int and len(self.cached_examples)>i:
71107b0ac860 optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 151
diff changeset
1392 return self.cached_examples[i]
71107b0ac860 optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 151
diff changeset
1393 else:
251
7e6edee187e3 optimization of CachedDataSet__getitem__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 243
diff changeset
1394 return self.source_dataset[i]
252
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1395
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1396 def __iter__(self):
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1397 class CacheIteratorIter(object):
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1398 def __init__(self,dataset):
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1399 self.dataset=dataset
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1400 self.l = len(dataset)
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1401 self.current = 0
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1402 self.fieldnames = self.dataset.fieldNames()
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1403 self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
252
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1404 def __iter__(self): return self
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1405 def next(self):
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1406 if self.current>=self.l:
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1407 raise StopIteration
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1408 cache_len = len(self.dataset.cached_examples)
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1409 if self.current>=cache_len: # current example is not already in cache
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1410 # fetch the current example from the source dataset and append it to the cache
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1411 self.dataset.cached_examples.append(
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1412 self.dataset.source_dataset[self.current])
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1413 self.example._values = self.dataset.cached_examples[self.current]
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1414 self.current+=1
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1415 return self.example
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1416
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1417 return CacheIteratorIter(self)
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1418
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1419 class ApplyFunctionDataSet(DataSet):
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1420 """
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1421 A L{DataSet} that contains as fields the results of applying a
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1422 given function example-wise or minibatch-wise to all the fields of
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1423 an input dataset. The output of the function should be an iterable
296
f5d33f9c0b9c ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 293
diff changeset
1424 (e.g. a list or a LookupList) over the resulting values.
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1425
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1426 The function takes as input the fields of the dataset, not the examples.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1427
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1428 In minibatch mode, the function is expected to work on minibatches
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1429 (takes a minibatch in input and returns a minibatch in output). More
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1430 precisely, it means that each element of the input or output list
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1431 should be iterable and indexable over the individual example values
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1432 (typically these elements will be numpy arrays). All of the elements
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1433 in the input and output lists should have the same length, which is
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1434 the length of the minibatch.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1435
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1436 The function is applied each time an example or a minibatch is accessed.
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1437 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1438
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1439 If the values_{h,v}stack functions are not provided, then
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1440 the input_dataset.values{H,V}Stack functions are used by default.
296
f5d33f9c0b9c ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 293
diff changeset
1441
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1442 """
296
f5d33f9c0b9c ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 293
diff changeset
1443
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1444 def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1445 values_hstack=None,values_vstack=None,
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1446 description=None,fieldtypes=None):
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1447 """
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1448 Constructor takes an input dataset that has as many fields as the function
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1449 expects as inputs. The resulting dataset has as many fields as the function
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1450 produces as outputs, and that should correspond to the number of output names
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1451 (provided in a list).
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1452
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1453 Note that the expected semantics of the function differs in minibatch mode
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1454 (it takes minibatches of inputs and produces minibatches of outputs, as
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1455 documented in the class comment).
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
1456
428
52b4908d8971 simple example of theano
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 424
diff changeset
1457 TBM: are fieldtypes the old field types (from input_dataset) or the new ones
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1458 (for the new dataset created)?
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1459 """
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1460 self.input_dataset=input_dataset
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1461 self.function=function
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1462 self.output_names=output_names
428
52b4908d8971 simple example of theano
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 424
diff changeset
1463 #print 'self.output_names in afds:', self.output_names
52b4908d8971 simple example of theano
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 424
diff changeset
1464 #print 'length in afds:', len(self.output_names)
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1465 self.minibatch_mode=minibatch_mode
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1466 DataSet.__init__(self,description,fieldtypes)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1467 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1468 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1469
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def __len__(self):
    """
    Return the number of examples in this dataset, which is the length
    of the wrapped C{input_dataset}.
    """
    wrapped = self.input_dataset
    return len(wrapped)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1472
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def fieldNames(self):
    """
    Return the field names of this dataset: the C{output_names} object
    stored on the instance (returned as-is, not copied).
    """
    names = self.output_names
    return names
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1475
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def minibatches_nowrap(self, fieldnames, *args, **kwargs):
    """
    Generate the minibatches of this dataset by applying C{self.function}
    to the minibatches of C{self.input_dataset}.

    All fields of the input dataset are always requested, because the
    function takes every input field as a positional argument.

    @param fieldnames: names of the output fields to yield. When equal to
        C{self.output_names} the complete output L{Example} is yielded;
        otherwise a new L{Example} restricted to these names (in the given
        order) is built from it.
    @param args: forwarded unchanged to the input dataset's
        C{minibatches_nowrap}.
    @param kwargs: forwarded unchanged to the input dataset's
        C{minibatches_nowrap}.
    @return: generator yielding one L{Example} per input minibatch.
    """
    source = self.input_dataset
    source_field_names = source.fieldNames()

    for batch in source.minibatches_nowrap(source_field_names, *args, **kwargs):
        if not self.minibatch_mode:
            # The function works on one example at a time: transpose the
            # per-field sequences into per-example tuples, apply the
            # function to each, then transpose the results back to
            # per-field form.
            per_example_outputs = []
            for example in zip(*batch):
                per_example_outputs.append(self.function(*example))
            output_columns = zip(*per_example_outputs)
        else:
            # The function consumes whole minibatches directly.
            output_columns = self.function(*batch)

        complete = Example(self.output_names, output_columns)
        if fieldnames == self.output_names:
            yield complete
        else:
            yield Example(fieldnames, [complete[name] for name in fieldnames])
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1500
290
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
def untested__iter__(self): # only implemented for increased efficiency
    """
    Return an iterator over single examples that applies C{self.function}
    to each example drawn from C{self.input_dataset}.

    The returned object follows the Python 2 iterator protocol (it defines
    C{next}). As the name says, this code path is untested.
    """
    class ApplyFunctionSingleExampleIterator(object):
        """Walks the input dataset and yields one transformed L{Example} per step."""

        def __init__(self, output_dataset):
            self.output_dataset = output_dataset
            self.input_iterator = output_dataset.input_dataset.__iter__()
            self.current = 0

        def __iter__(self):
            return self

        def next(self):
            dataset = self.output_dataset
            if not dataset.minibatch_mode:
                # Example-wise function: feed the input fields directly.
                example = self.input_iterator.next()
                results = dataset.function(*example)
            else:
                # Minibatch-wise function: wrap every field value in a
                # one-element list, call the function, then unwrap the
                # first (only) element of every output field.
                singletons = [[value] for value in self.input_iterator.next()]
                batched = dataset.function(*singletons)
                iterable_flags = [hasattr(column, '__iter__') for column in batched]
                assert all(iterable_flags)
                results = [column[0] for column in batched]
            return Example(dataset.output_names, results)

    return ApplyFunctionSingleExampleIterator(self)
9b533cc7874a trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 274
diff changeset
1519
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
    """
    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
    by forcing the user to define a set of fields as the 'input' field
    and a set of fields as the 'target' field. Optionally, a single
    weight_field can also be defined.

    @param src_dataset: object providing C{merge_fields(*groups)}, where each
        group is a C{(fields, merged_name)} pair.
    @param input_fields: fields to be grouped under the name 'input'.
    @param target_fields: fields to be grouped under the name 'target'.
    @param weight_field: optional single field; it is wrapped in a
        one-element list and grouped under the name 'weight'. C{None}
        (the default) means no 'weight' group is requested.
    @return: whatever C{src_dataset.merge_fields} returns for these groups.
    """
    # Each entry is one (fields, merged_name) pair handed to merge_fields.
    groups = [(input_fields, 'input'), (target_fields, 'target')]
    # Compare against None explicitly so that a falsy but valid field
    # identifier is not silently ignored.
    if weight_field is not None:
        # Append the pair as ONE element; concatenating a bare 2-tuple would
        # instead add the list and the string as two separate arguments.
        groups.append(([weight_field], 'weight'))
    return src_dataset.merge_fields(*groups)
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1530
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1531
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1532
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1533